Compare commits

...

3 Commits

Author SHA1 Message Date
Ashik K f806a30530 add README 2023-03-11 15:39:54 +01:00
Ashik K 3abcdabce9 make the parser 2 pass system
- first run parser with --download
- then run the exif script
- finally run parser without any flags
2023-03-11 15:32:28 +01:00
Ashik K cfd21da55b add C++ code and tooling for exif data genaration 2023-03-11 15:32:28 +01:00
3 changed files with 141 additions and 23 deletions

24
README Normal file
View File

@ -0,0 +1,24 @@
Dependencies: curlpp, rapidjson
To build please do:
mkdir build
cd build
cmake ..
make
How to use?
-----------
1. First run the fetch_all binary with your API key as its argument.
It will fetch the JSON data into your local folder.
2. Run parser with argument "--download" (this will
download all the images).
3. Run the "genallexif" script, supplying the name of the folder
containing the downloaded images as its parameter.
4. Run parser again without any arguments.
You should get the out.csv file generated at this step
(This is the file that you can use to import the data on OpenRefine).

5
genallexif Executable file
View File

@ -0,0 +1,5 @@
#!/bin/sh
# genallexif: write an EXIF JSON sidecar next to every JPEG in a folder.
#
# Usage: genallexif <image-folder>
#
# For each <image-folder>/*.jpeg this runs `exiftool -j` and stores the
# output in "<image>.jpeg.exif.json".

# Refuse to run without a folder; an empty "$1" would make the glob
# below expand to /*.jpeg and scan the filesystem root.
if [ -z "$1" ]; then
    echo "usage: $0 <image-folder>" >&2
    exit 1
fi

for f in "$1"/*.jpeg
do
    # When nothing matches, the shell leaves the pattern unexpanded;
    # skip it so no literal "*.jpeg.exif.json" file is created.
    [ -e "$f" ] || continue
    # Quote "$f" directly: piping it through an unquoted `echo $f`
    # re-splits and re-globs the name, mangling paths that contain
    # repeated whitespace or wildcard characters.
    exiftool -j "$f" > "$f.exif.json"
done
exit

135
parser.cc
View File

@ -7,17 +7,23 @@
#include <iostream> #include <iostream>
#include <map> #include <map>
#define debug_level 0
std::string yearstr(int year) { std::string yearstr(int year) {
return year==-1? "Unknown": std::to_string(year); return year==-1? "Unknown": std::to_string(year);
} }
int main() int main(int argc, char **argv)
{ {
bool download_mode =false;
if (argc == 2) {
std::string arg1 = argv[1];
download_mode = arg1=="--download";
}
char filename[64]; char filename[64];
snprintf(filename, sizeof filename, "out.csv"); snprintf(filename, sizeof filename, "out.csv");
std::ofstream out_file1; std::ofstream out_file1;
out_file1.open(filename); out_file1.open(filename);
out_file1<<"id, Caption/title, production start year, end year, Description, Source, image_filename, collection name, museum name, linceses\n"; out_file1<<"id, Caption/title, production start year, end year, Description, Source, image_filename, subjects, date published, collection name, museum name, exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_datetimeoriginal, liceses\n";
for (auto i = 0; i<4; i++) { for (auto i = 0; i<4; i++) {
char in_file_i[64]; char in_file_i[64];
snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i); snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i);
@ -54,7 +60,6 @@ int main()
const rapidjson::Value& response = doc["response"]; const rapidjson::Value& response = doc["response"];
if (response.HasMember("docs") if (response.HasMember("docs")
&& response["docs"].IsArray()) { && response["docs"].IsArray()) {
std::cout<<"yes, response is an array"<<std::endl;
for (rapidjson::SizeType i = 0; i < response["docs"].Size(); i++) { for (rapidjson::SizeType i = 0; i < response["docs"].Size(); i++) {
//std::cout<<i<<std::endl; //std::cout<<i<<std::endl;
int yearb = -1, yeare = -1; int yearb = -1, yeare = -1;
@ -126,6 +131,27 @@ int main()
} }
} }
std::string subjects;
if (article.HasMember("artifact.ingress.subjects")) {
const rapidjson::Value& subjv = article["artifact.ingress.subjects"];
if (subjv.IsArray()) {
for (auto i = 0; i<subjv.GetArray().Size(); i++) {
subjects += subjv[i].GetString();
subjects += " ";
}
std::replace( description.begin(), description.end(), ',', ':');
std::replace( description.begin(), description.end(), '\n', ' ');
}
}
std::string publishdate;
if (article.HasMember("artifact.publishedDate")) {
const rapidjson::Value& pubdatev = article["artifact.publishedDate"];
if (pubdatev.IsString()) {
publishdate = pubdatev.GetString();
}
}
std::string license = ""; std::string license = "";
if (article.HasMember("artifact.ingress.license")) { if (article.HasMember("artifact.ingress.license")) {
if (article[ "artifact.ingress.license"].IsString()) if (article[ "artifact.ingress.license"].IsString())
@ -142,27 +168,90 @@ int main()
is_cc_license = true; is_cc_license = true;
char imgfetch[256]; char imgfetch[256];
snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", imglink, article_id.c_str(), mediaid.c_str()); snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", imglink, article_id.c_str(), mediaid.c_str());
std::cout<<imgfetch<<std::endl; if (debug_level > 0) std::cout<<imgfetch<<std::endl;
// system (imgfetch); if (download_mode) {
std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n"; std::cout<<"running in download mode\n";
system (imgfetch);
if (debug_level >0) std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n";
}
} }
if (is_cc_license) {
out_file1<< if (!download_mode) {
/* article["artifact.defaultMediaIdentifier"].GetString()<< if (is_cc_license) {
", "<< article["artifact.defaultPictureIndex"].GetInt()<< char exif_file[128];
", "<< article["artifact.defaultPictureDimension"].GetString()<< snprintf(exif_file, sizeof exif_file, "%s-%s.jpeg.exif.json", article_id.c_str(), mediaid.c_str());
", "<<*/ // Open the file
article_id << FILE* exiffp = fopen(exif_file, "rb");
", "<< title << if (!exiffp) {
", "<< yearstr(yearb) << std::cerr << "Error: unable to open file" << std::string(exif_file)
", "<< yearstr(yeare) << << std::endl;
", "<< descfilename << return -1;
", "<< imglink << }
", "<< article_id+"-"+ mediaid +".jpeg" <<
", "<<"Länge leve Kosta! exhibition" <<
", "<<"Kulturparken Småland / Smålands museum" << char exifreadBuffer[4096];
", "<< license << rapidjson::FileReadStream exifis(exiffp, exifreadBuffer, sizeof(exifreadBuffer));
std::endl; rapidjson::Document exifdoc;
exifdoc.ParseStream(exifis);
// Check if the document is valid
if (exifdoc.HasParseError()) {
std::cerr << "Error: failed to parse JSON document exif data"
<< std::endl;
}
fclose(exiffp);
std::string exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_fnumber, exif_datetimeoriginal;
if (exifdoc.IsArray()) {
if (exifdoc[0].IsObject()) {
if (exifdoc[0].GetObject().HasMember("Model")) {
exif_model = exifdoc[0].GetObject()["Model"].GetString();
}
if (exifdoc[0].GetObject().HasMember("ISO")) {
exif_iso = std::to_string(exifdoc[0].GetObject()["ISO"].GetInt());
}
if (exifdoc[0].GetObject().HasMember("FocalLength")) {
exif_focallength = exifdoc[0].GetObject()["FocalLength"].GetString();
}
if (exifdoc[0].GetObject().HasMember("ExposureTime")) {
if (exifdoc[0].GetObject()["ExposureTime"].IsString())
exif_exposuretime = exifdoc[0].GetObject()["ExposureTime"].GetString();
}
if (exifdoc[0].GetObject().HasMember("ApertureValue")) {
exif_aperture = std::to_string(exifdoc[0].GetObject()["ApertureValue"].GetDouble());
}
if (exifdoc[0].GetObject().HasMember("DateTimeOriginal")) {
exif_datetimeoriginal = exifdoc[0].GetObject()["DateTimeOriginal"].GetString();
}
}
}
out_file1<<
/* article["artifact.defaultMediaIdentifier"].GetString()<<
", "<< article["artifact.defaultPictureIndex"].GetInt()<<
", "<< article["artifact.defaultPictureDimension"].GetString()<<
", "<<*/
article_id <<
", "<< title <<
", "<< yearstr(yearb) <<
", "<< yearstr(yeare) <<
", "<< description <<
", "<< imglink <<
", "<< article_id+"-"+ mediaid +".jpeg" <<
", "<< subjects <<
", "<< publishdate <<
", "<<"Länge leve Kosta! exhibition" <<
", "<<"Kulturparken Småland / Smålands museum" <<
", "<< exif_model <<
", "<< exif_iso <<
", "<< exif_focallength <<
", "<< exif_exposuretime <<
", "<< exif_aperture <<
", "<< exif_datetimeoriginal<<
", "<< license <<
std::endl;
}
} }
} }
} }