Compare commits

...

3 Commits

Author SHA1 Message Date
Ashik K f312cb7982 Add README 2023-03-11 16:29:01 +01:00
Ashik K 73533fd0ae Make the parser a 2 pass system
- first run parser with --download
- then run the exif script
- finally run parser without any flags
2023-03-11 16:28:50 +01:00
Ashik K 0727f60927 Add C++ code and tooling for exif data genaration 2023-03-11 16:28:38 +01:00
3 changed files with 141 additions and 23 deletions

24
README Normal file
View File

@ -0,0 +1,24 @@
Dependencies: curlpp, rapidjson
To build please do:
mkdir build
cd build
cmake ..
make
How to use?
-----------
1. First run the fetch_all binary with your api key as argument.
It will fetch the json data into your local folder.
2. Run parser with argument "--download" (this will
download all the images).
3. Run the "genallexif" script. Supply the folder name
containing the downloaded images as parameter.
4. Run parser again without any arguments.
You should get the out.csv file generated at this step
(This is the file that you can use to import the data on OpenRefine).

5
genallexif Executable file
View File

@ -0,0 +1,5 @@
#!/bin/sh
# Writes an exiftool JSON sidecar file ("<image>.exif.json") next to every
# .jpeg file found directly inside the given folder.
#
# Usage: genallexif <image-folder>

# Without a folder argument the glob below would expand to "/*.jpeg".
if [ -z "$1" ]; then
    echo "usage: $0 <image-folder>" >&2
    exit 1
fi

for f in "$1"/*.jpeg
do
    # If nothing matches, the shell leaves the pattern unexpanded; skip it
    # instead of handing a non-existent path to exiftool.
    [ -e "$f" ] || continue
    # Use "$f" directly: routing it through an unquoted `echo` word-splits and
    # glob-expands the path, which corrupts names containing spaces or wildcards.
    exiftool -j "$f" > "$f.exif.json"
done

135
parser.cc
View File

@ -7,17 +7,23 @@
#include <iostream>
#include <map>
#define debug_level 0
// Renders a year as text. The value -1 is rendered as "Unknown"; every other
// value is rendered as its decimal representation.
std::string yearstr(int year) {
    if (year != -1) {
        return std::to_string(year);
    }
    return "Unknown";
}
int main()
int main(int argc, char **argv)
{
bool download_mode =false;
if (argc == 2) {
std::string arg1 = argv[1];
download_mode = arg1=="--download";
}
char filename[64];
snprintf(filename, sizeof filename, "out.csv");
std::ofstream out_file1;
out_file1.open(filename);
out_file1<<"id, Caption/title, production start year, end year, Description, Source, image_filename, collection name, museum name, linceses\n";
out_file1<<"id, Caption/title, production start year, end year, Description, Source, image_filename, subjects, date published, collection name, museum name, exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_datetimeoriginal, liceses\n";
for (auto i = 0; i<4; i++) {
char in_file_i[64];
snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i);
@ -54,7 +60,6 @@ int main()
const rapidjson::Value& response = doc["response"];
if (response.HasMember("docs")
&& response["docs"].IsArray()) {
std::cout<<"yes, response is an array"<<std::endl;
for (rapidjson::SizeType i = 0; i < response["docs"].Size(); i++) {
//std::cout<<i<<std::endl;
int yearb = -1, yeare = -1;
@ -126,6 +131,27 @@ int main()
}
}
std::string subjects;
if (article.HasMember("artifact.ingress.subjects")) {
const rapidjson::Value& subjv = article["artifact.ingress.subjects"];
if (subjv.IsArray()) {
for (auto i = 0; i<subjv.GetArray().Size(); i++) {
subjects += subjv[i].GetString();
subjects += " ";
}
std::replace( description.begin(), description.end(), ',', ':');
std::replace( description.begin(), description.end(), '\n', ' ');
}
}
std::string publishdate;
if (article.HasMember("artifact.publishedDate")) {
const rapidjson::Value& pubdatev = article["artifact.publishedDate"];
if (pubdatev.IsString()) {
publishdate = pubdatev.GetString();
}
}
std::string license = "";
if (article.HasMember("artifact.ingress.license")) {
if (article[ "artifact.ingress.license"].IsString())
@ -142,27 +168,90 @@ int main()
is_cc_license = true;
char imgfetch[256];
snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", imglink, article_id.c_str(), mediaid.c_str());
std::cout<<imgfetch<<std::endl;
// system (imgfetch);
std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n";
if (debug_level > 0) std::cout<<imgfetch<<std::endl;
if (download_mode) {
std::cout<<"running in download mode\n";
system (imgfetch);
if (debug_level >0) std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n";
}
}
if (is_cc_license) {
out_file1<<
/* article["artifact.defaultMediaIdentifier"].GetString()<<
", "<< article["artifact.defaultPictureIndex"].GetInt()<<
", "<< article["artifact.defaultPictureDimension"].GetString()<<
", "<<*/
article_id <<
", "<< title <<
", "<< yearstr(yearb) <<
", "<< yearstr(yeare) <<
", "<< descfilename <<
", "<< imglink <<
", "<< article_id+"-"+ mediaid +".jpeg" <<
", "<<"Länge leve Kosta! exhibition" <<
", "<<"Kulturparken Småland / Smålands museum" <<
", "<< license <<
std::endl;
if (!download_mode) {
if (is_cc_license) {
char exif_file[128];
snprintf(exif_file, sizeof exif_file, "%s-%s.jpeg.exif.json", article_id.c_str(), mediaid.c_str());
// Open the file
FILE* exiffp = fopen(exif_file, "rb");
if (!exiffp) {
std::cerr << "Error: unable to open file" << std::string(exif_file)
<< std::endl;
return -1;
}
char exifreadBuffer[4096];
rapidjson::FileReadStream exifis(exiffp, exifreadBuffer, sizeof(exifreadBuffer));
rapidjson::Document exifdoc;
exifdoc.ParseStream(exifis);
// Check if the document is valid
if (exifdoc.HasParseError()) {
std::cerr << "Error: failed to parse JSON document exif data"
<< std::endl;
}
fclose(exiffp);
std::string exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_fnumber, exif_datetimeoriginal;
if (exifdoc.IsArray()) {
if (exifdoc[0].IsObject()) {
if (exifdoc[0].GetObject().HasMember("Model")) {
exif_model = exifdoc[0].GetObject()["Model"].GetString();
}
if (exifdoc[0].GetObject().HasMember("ISO")) {
exif_iso = std::to_string(exifdoc[0].GetObject()["ISO"].GetInt());
}
if (exifdoc[0].GetObject().HasMember("FocalLength")) {
exif_focallength = exifdoc[0].GetObject()["FocalLength"].GetString();
}
if (exifdoc[0].GetObject().HasMember("ExposureTime")) {
if (exifdoc[0].GetObject()["ExposureTime"].IsString())
exif_exposuretime = exifdoc[0].GetObject()["ExposureTime"].GetString();
}
if (exifdoc[0].GetObject().HasMember("ApertureValue")) {
exif_aperture = std::to_string(exifdoc[0].GetObject()["ApertureValue"].GetDouble());
}
if (exifdoc[0].GetObject().HasMember("DateTimeOriginal")) {
exif_datetimeoriginal = exifdoc[0].GetObject()["DateTimeOriginal"].GetString();
}
}
}
out_file1<<
/* article["artifact.defaultMediaIdentifier"].GetString()<<
", "<< article["artifact.defaultPictureIndex"].GetInt()<<
", "<< article["artifact.defaultPictureDimension"].GetString()<<
", "<<*/
article_id <<
", "<< title <<
", "<< yearstr(yearb) <<
", "<< yearstr(yeare) <<
", "<< description <<
", "<< imglink <<
", "<< article_id+"-"+ mediaid +".jpeg" <<
", "<< subjects <<
", "<< publishdate <<
", "<<"Länge leve Kosta! exhibition" <<
", "<<"Kulturparken Småland / Smålands museum" <<
", "<< exif_model <<
", "<< exif_iso <<
", "<< exif_focallength <<
", "<< exif_exposuretime <<
", "<< exif_aperture <<
", "<< exif_datetimeoriginal<<
", "<< license <<
std::endl;
}
}
}
}