diff --git a/fetcher.cc b/fetcher.cc index b8885dc..16686a6 100644 --- a/fetcher.cc +++ b/fetcher.cc @@ -1,83 +1,83 @@ -#include #include +#include -#include #include +#include #include #include -#include #include -#include #include +#include +#include #define DEBUG_LEVEL -1 -int main(int argc, char *argv[]) -{ +int main(int argc, char *argv[]) { std::string exbid, apikey; - if (argc !=7 ) { - std::cout<<"Usage: "< -k -e "< -k -e " + << std::endl; exit(0); } int c; int numrec = 0; - while ((c = getopt (argc, argv, "e:k:n:")) != -1) - switch(c) - { - case 'e': - exbid = optarg; - break; - case 'k': - apikey = optarg; - break; - case 'n': - numrec = atoi(optarg); - break; - default: - break; - } - std::cout< 0) { - std::cout<<"url to fetch is "< 0) { + std::cout << "url to fetch is " << std::endl << url << std::endl; + std::cout << "writing output to " << outfilename << std::endl; + } - if (DEBUG_LEVEL > 0) std::cout << request1 << std::endl; + try { + curlpp::Cleanup cleaner; + curlpp::Easy request1; - std::ofstream outfile; - outfile.open (outfilename); - outfile << curlpp::options::Url(url) << std::endl; - outfile.close(); + // Setting the URL to retrive. + request1.setOpt(new curlpp::options::Url(url)); + + if (DEBUG_LEVEL > 0) + std::cout << request1 << std::endl; + + std::ofstream outfile; + outfile.open(outfilename); + outfile << curlpp::options::Url(url) << std::endl; + outfile.close(); + } catch (curlpp::LogicError &e) { + if (DEBUG_LEVEL > 0) + std::cout << e.what() << std::endl; + } catch (curlpp::RuntimeError &e) { + if (DEBUG_LEVEL > 0) + std::cout << e.what() << std::endl; + } } - catch ( curlpp::LogicError & e ) { - if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl; - } - catch ( curlpp::RuntimeError & e ) { - if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl; - } - - -} - return 0; + return 0; } diff --git a/parser.cc b/parser.cc index ed8bb83..402361b 100644 --- a/parser.cc +++ b/parser.cc @@ -1,8 +1,8 @@ #include #include -#include #include +#include #include #include @@ -12,45 +12,46 @@ #define debug_level 0 std::string yearstr(int year) { - return year==-1? "Unknown": std::to_string(year); + return year == -1 ? "Unknown" : std::to_string(year); } -int main(int argc, char **argv) -{ +int main(int argc, char **argv) { int c; - bool download_mode =false; + bool download_mode = false; int numfiles = 0; - while ((c = getopt (argc, argv, "d:n:")) != -1) - switch(c) - { - case 'd': // download mode - download_mode = true; - break; - case 'n': // number of files to process - numfiles = atoi(optarg); - break; - default: - break; - } + while ((c = getopt(argc, argv, "d:n:")) != -1) + switch (c) { + case 'd': // download mode + download_mode = true; + break; + case 'n': // number of files to process + numfiles = atoi(optarg); + break; + default: + break; + } char filename[64]; snprintf(filename, sizeof filename, "out.csv"); std::ofstream out_file1; out_file1.open(filename); - out_file1<<"id, Caption/title, production start year, end year, Description, Item url, Image Source, image_filename, subjects, date published, collection name, museum name, exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_datetimeoriginal, liceses\n"; - for (auto i = 0; i?dimension= - // as per documentation available at http://api.dimu.org/doc/public_api.html - char imglink[128]; - snprintf(imglink, sizeof imglink, "https://mm.dimu.org/image/%s", mediaid.c_str()); + if (article.HasMember("artifact.hasPictures") && + article["artifact.hasPictures"].GetBool() == true) { - char itemlink[128]; - snprintf(itemlink, sizeof itemlink, "https://digitaltmuseum.se/%s", unique_id.c_str()); - - - - char descfilename[64]; - snprintf(descfilename, sizeof descfilename, "%s.txt", article_id.c_str()); - - std::string description = ""; - if (article.HasMember("artifact.ingress.description")) { - const rapidjson::Value& descv = article["artifact.ingress.description"]; - if (descv.IsString()) { - description = descv.GetString(); - std::replace( description.begin(), description.end(), ',', ':'); - std::replace( description.begin(), description.end(), '\n', ' '); - std::ofstream desc_file; - desc_file.open(descfilename); - desc_file< 0) std::cout<0) std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n"; - } - } - - if (!download_mode) { - if (is_cc_license) { - char exif_file[128]; - snprintf(exif_file, sizeof exif_file, "%s-%s.jpeg.exif.json", article_id.c_str(), mediaid.c_str()); - // Open the file - FILE* exiffp = fopen(exif_file, "rb"); - if (!exiffp) { - std::cerr << "Error: unable to open file" << std::string(exif_file) - << std::endl; - return -1; - } - - - char exifreadBuffer[4096]; - rapidjson::FileReadStream exifis(exiffp, exifreadBuffer, sizeof(exifreadBuffer)); - rapidjson::Document exifdoc; - exifdoc.ParseStream(exifis); - - // Check if the document is valid - if (exifdoc.HasParseError()) { - std::cerr << "Error: failed to parse JSON document exif data" - << std::endl; - } - fclose(exiffp); - - std::string exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_fnumber, exif_datetimeoriginal; - if (exifdoc.IsArray()) { - if (exifdoc[0].IsObject()) { - if (exifdoc[0].GetObject().HasMember("Model")) { - exif_model = exifdoc[0].GetObject()["Model"].GetString(); - } - if (exifdoc[0].GetObject().HasMember("ISO")) { - exif_iso = std::to_string(exifdoc[0].GetObject()["ISO"].GetInt()); - } - if (exifdoc[0].GetObject().HasMember("FocalLength")) { - exif_focallength = exifdoc[0].GetObject()["FocalLength"].GetString(); - } - if (exifdoc[0].GetObject().HasMember("ExposureTime")) { - if (exifdoc[0].GetObject()["ExposureTime"].IsString()) - exif_exposuretime = exifdoc[0].GetObject()["ExposureTime"].GetString(); - } - if (exifdoc[0].GetObject().HasMember("ApertureValue")) { - exif_aperture = std::to_string(exifdoc[0].GetObject()["ApertureValue"].GetDouble()); - } - if (exifdoc[0].GetObject().HasMember("DateTimeOriginal")) { - exif_datetimeoriginal = exifdoc[0].GetObject()["DateTimeOriginal"].GetString(); - } + if (article.HasMember("artifact.defaultMediaIdentifier")) { + const rapidjson::Value &mediaidv = + article["artifact.defaultMediaIdentifier"]; + if (mediaidv.IsString()) { + mediaid = mediaidv.GetString(); } } + if (article.HasMember("artifact.defaultPictureIndex") && + article["artifact.defaultPictureIndex"].IsInt()) { + picid = article["artifact.defaultPictureIndex"].GetInt(); + } - out_file1<< - /* article["artifact.defaultMediaIdentifier"].GetString()<< - ", "<< article["artifact.defaultPictureIndex"].GetInt()<< - ", "<< article["artifact.defaultPictureDimension"].GetString()<< - ", "<<*/ - article_id << - ", "<< title << - ", "<< yearstr(yearb) << - ", "<< yearstr(yeare) << - ", "<< description << - ", "<< itemlink << - ", "<< imglink << - ", "<< article_id+"-"+ mediaid +".jpeg" << - ", "<< subjects << - ", "<< publishdate << - ", "<<"Länge leve Kosta! exhibition" << - ", "<<"Kulturparken Småland / Smålands museum" << - ", "<< exif_model << - ", "<< exif_iso << - ", "<< exif_focallength << - ", "<< exif_exposuretime << - ", "<< exif_aperture << - ", "<< exif_datetimeoriginal<< - ", "<< license << - std::endl; + if (article.HasMember("artifact.defaultPictureDimension") && + article["artifact.defaultPictureDimension"].IsString()) { + picdim = article["artifact.defaultPictureDimension"].GetString(); + } + } + + // image file is at + // https://mm.dimu.org/image/?dimension= + // as per documentation available at + // http://api.dimu.org/doc/public_api.html + char imglink[128]; + snprintf(imglink, sizeof imglink, "https://mm.dimu.org/image/%s", + mediaid.c_str()); + + char itemlink[128]; + snprintf(itemlink, sizeof itemlink, "https://digitaltmuseum.se/%s", + unique_id.c_str()); + + char descfilename[64]; + snprintf(descfilename, sizeof descfilename, "%s.txt", + article_id.c_str()); + + std::string description = ""; + if (article.HasMember("artifact.ingress.description")) { + const rapidjson::Value &descv = + article["artifact.ingress.description"]; + if (descv.IsString()) { + description = descv.GetString(); + std::replace(description.begin(), description.end(), ',', ':'); + std::replace(description.begin(), description.end(), '\n', ' '); + std::ofstream desc_file; + desc_file.open(descfilename); + desc_file << description; + desc_file.close(); + } + } + + std::string subjects; + if (article.HasMember("artifact.ingress.subjects")) { + const rapidjson::Value &subjv = + article["artifact.ingress.subjects"]; + if (subjv.IsArray()) { + for (auto i = 0; i < subjv.GetArray().Size(); i++) { + subjects += subjv[i].GetString(); + subjects += " "; + } + std::replace(description.begin(), description.end(), ',', ':'); + std::replace(description.begin(), description.end(), '\n', ' '); + } + } + + std::string publishdate; + if (article.HasMember("artifact.publishedDate")) { + const rapidjson::Value &pubdatev = + article["artifact.publishedDate"]; + if (pubdatev.IsString()) { + publishdate = pubdatev.GetString(); + } + } + + std::string license = ""; + if (article.HasMember("artifact.ingress.license")) { + if (article["artifact.ingress.license"].IsString()) + license = article["artifact.ingress.license"].GetString(); + else if (article["artifact.ingress.license"].IsArray()) { + for (auto i = 0; i < article["artifact.ingress.license"].Size(); + i++) { + license += article["artifact.ingress.license"][i].GetString(); + } + } + } + + bool is_cc_license = false; + if (license.find("CC by") != std::string::npos) { + is_cc_license = true; + char imgfetch[256]; + snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", + imglink, article_id.c_str(), mediaid.c_str()); + if (debug_level > 0) + std::cout << imgfetch << std::endl; + if (download_mode) { + std::cout << "running in download mode\n"; + system(imgfetch); + if (debug_level > 0) + std::cout << "Found CC by license, fetched the image for " + << mediaid << "\n"; + } + } + + if (!download_mode) { + if (is_cc_license) { + char exif_file[128]; + snprintf(exif_file, sizeof exif_file, "%s-%s.jpeg.exif.json", + article_id.c_str(), mediaid.c_str()); + // Open the file + FILE *exiffp = fopen(exif_file, "rb"); + if (!exiffp) { + std::cerr << "Error: unable to open file" + << std::string(exif_file) << std::endl; + return -1; + } + + char exifreadBuffer[4096]; + rapidjson::FileReadStream exifis(exiffp, exifreadBuffer, + sizeof(exifreadBuffer)); + rapidjson::Document exifdoc; + exifdoc.ParseStream(exifis); + + // Check if the document is valid + if (exifdoc.HasParseError()) { + std::cerr << "Error: failed to parse JSON document exif data" + << std::endl; + } + fclose(exiffp); + + std::string exif_model, exif_iso, exif_focallength, + exif_exposuretime, exif_aperture, exif_fnumber, + exif_datetimeoriginal; + if (exifdoc.IsArray()) { + if (exifdoc[0].IsObject()) { + if (exifdoc[0].GetObject().HasMember("Model")) { + exif_model = exifdoc[0].GetObject()["Model"].GetString(); + } + if (exifdoc[0].GetObject().HasMember("ISO")) { + exif_iso = + std::to_string(exifdoc[0].GetObject()["ISO"].GetInt()); + } + if (exifdoc[0].GetObject().HasMember("FocalLength")) { + exif_focallength = + exifdoc[0].GetObject()["FocalLength"].GetString(); + } + if (exifdoc[0].GetObject().HasMember("ExposureTime")) { + if (exifdoc[0].GetObject()["ExposureTime"].IsString()) + exif_exposuretime = + exifdoc[0].GetObject()["ExposureTime"].GetString(); + } + if (exifdoc[0].GetObject().HasMember("ApertureValue")) { + exif_aperture = std::to_string( + exifdoc[0].GetObject()["ApertureValue"].GetDouble()); + } + if (exifdoc[0].GetObject().HasMember("DateTimeOriginal")) { + exif_datetimeoriginal = + exifdoc[0].GetObject()["DateTimeOriginal"].GetString(); + } + } + } + + out_file1 << + /* article["artifact.defaultMediaIdentifier"].GetString()<< + ", "<< article["artifact.defaultPictureIndex"].GetInt()<< + ", "<< + article["artifact.defaultPictureDimension"].GetString()<< + ", "<<*/ + article_id + << ", " << title << ", " << yearstr(yearb) << ", " + << yearstr(yeare) << ", " << description << ", " + << itemlink << ", " << imglink << ", " + << article_id + "-" + mediaid + ".jpeg" + << ", " << subjects << ", " << publishdate << ", " + << "Länge leve Kosta! exhibition" + << ", " + << "Kulturparken Småland / Smålands museum" + << ", " << exif_model << ", " << exif_iso << ", " + << exif_focallength << ", " << exif_exposuretime << ", " + << exif_aperture << ", " << exif_datetimeoriginal + << ", " << license << std::endl; + } } - } } } } } - out_file1.close(); + out_file1.close(); return 0; } -