Compare commits

..

No commits in common. "d15c1138545297f98ff7611ce04cb7927a285331" and "d9cf6f02df04fbd0c620b90aeae290d37dfd8d8c" have entirely different histories.

3 changed files with 255 additions and 271 deletions

1
.gitignore vendored
View File

@ -1,3 +1,2 @@
/build/ /build/
/build/* /build/*
.vscode/c_cpp_properties.json

View File

@ -1,31 +1,31 @@
#include <fstream>
#include <sstream> #include <sstream>
#include <fstream>
#include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstdio>
#include <cstring> #include <cstring>
#include <unistd.h> #include <unistd.h>
#include <curlpp/Easy.hpp>
#include <curlpp/Exception.hpp>
#include <curlpp/Options.hpp>
#include <curlpp/cURLpp.hpp> #include <curlpp/cURLpp.hpp>
#include <curlpp/Easy.hpp>
#include <curlpp/Options.hpp>
#include <curlpp/Exception.hpp>
#define DEBUG_LEVEL -1 #define DEBUG_LEVEL -1
int main(int argc, char *argv[]) { int main(int argc, char *argv[])
{
std::string exbid, apikey; std::string exbid, apikey;
if (argc != 7) { if (argc !=7 ) {
std::cout << "Usage: " << argv[0] std::cout<<"Usage: "<<argv[0] <<" -e <exhibition id> -k <api key> -e <number of records>"<<std::endl;
<< " -e <exhibition id> -k <api key> -e <number of records>"
<< std::endl;
exit(0); exit(0);
} }
int c; int c;
int numrec = 0; int numrec = 0;
while ((c = getopt(argc, argv, "e:k:n:")) != -1) while ((c = getopt (argc, argv, "e:k:n:")) != -1)
switch (c) { switch(c)
{
case 'e': case 'e':
exbid = optarg; exbid = optarg;
break; break;
@ -38,23 +38,22 @@ int main(int argc, char *argv[]) {
default: default:
break; break;
} }
std::cout << numrec << " records to fetch\n"; std::cout<<numrec<<" records to fetch\n";
int numfetches = (numrec / 100) + 1; int numfetches = (numrec / 100) + 1;
char url[256], outfilename[64]; char url[256], outfilename[64];
for (int i = 0; i < numfetches; i++) { for (int i = 0; i<numfetches; i++) {
snprintf(url, sizeof url, snprintf(
"https://api.dimu.org/api/solr/" url,
"select?q=Kosta&wt=json&fq=(artifact.exhibitionUids:\"%s\")&start=" sizeof url,
"%d&rows=100&api.key=%s", "https://api.dimu.org/api/solr/select?q=Kosta&wt=json&fq=(artifact.exhibitionUids:\"%s\")&start=%d&rows=100&api.key=%s", exbid.c_str(), i*100, apikey.c_str());
exbid.c_str(), i * 100, apikey.c_str());
std::cout << url << "\n"; std::cout<<url<<"\n";
snprintf(outfilename, sizeof outfilename, "data_%d.json", i); snprintf(outfilename, sizeof outfilename, "data_%d.json", i);
if (DEBUG_LEVEL > 0) { if (DEBUG_LEVEL > 0) {
std::cout << "url to fetch is " << std::endl << url << std::endl; std::cout<<"url to fetch is "<<std::endl<<url<<std::endl;
std::cout << "writing output to " << outfilename << std::endl; std::cout<<"writing output to "<<outfilename<<std::endl;
} }
try { try {
@ -64,20 +63,21 @@ int main(int argc, char *argv[]) {
// Setting the URL to retrive. // Setting the URL to retrive.
request1.setOpt(new curlpp::options::Url(url)); request1.setOpt(new curlpp::options::Url(url));
if (DEBUG_LEVEL > 0) if (DEBUG_LEVEL > 0) std::cout << request1 << std::endl;
std::cout << request1 << std::endl;
std::ofstream outfile; std::ofstream outfile;
outfile.open(outfilename); outfile.open (outfilename);
outfile << curlpp::options::Url(url) << std::endl; outfile << curlpp::options::Url(url) << std::endl;
outfile.close(); outfile.close();
} catch (curlpp::LogicError &e) {
if (DEBUG_LEVEL > 0)
std::cout << e.what() << std::endl;
} catch (curlpp::RuntimeError &e) {
if (DEBUG_LEVEL > 0)
std::cout << e.what() << std::endl;
} }
catch ( curlpp::LogicError & e ) {
if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl;
} }
catch ( curlpp::RuntimeError & e ) {
if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl;
}
}
return 0; return 0;
} }

199
parser.cc
View File

@ -1,8 +1,8 @@
#include <rapidjson/document.h> #include <rapidjson/document.h>
#include <rapidjson/filereadstream.h> #include <rapidjson/filereadstream.h>
#include <cstdlib>
#include <unistd.h> #include <unistd.h>
#include <cstdlib>
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
@ -12,15 +12,17 @@
#define debug_level 0 #define debug_level 0
std::string yearstr(int year) { std::string yearstr(int year) {
return year == -1 ? "Unknown" : std::to_string(year); return year==-1? "Unknown": std::to_string(year);
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
{
int c; int c;
bool download_mode = false; bool download_mode =false;
int numfiles = 0; int numfiles = 0;
while ((c = getopt(argc, argv, "d:n:")) != -1) while ((c = getopt (argc, argv, "d:n:")) != -1)
switch (c) { switch(c)
{
case 'd': // download mode case 'd': // download mode
download_mode = true; download_mode = true;
break; break;
@ -35,19 +37,16 @@ int main(int argc, char **argv) {
snprintf(filename, sizeof filename, "out.csv"); snprintf(filename, sizeof filename, "out.csv");
std::ofstream out_file1; std::ofstream out_file1;
out_file1.open(filename); out_file1.open(filename);
out_file1 << "id, Caption/title, production start year, end year, " out_file1<<"id, Caption/title, production start year, end year, Description, Item url, Image Source, image_filename, subjects, date published, collection name, museum name, exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_datetimeoriginal, liceses\n";
"Description, Item url, Image Source, image_filename, subjects, " for (auto i = 0; i<numfiles; i++) {
"date published, collection name, museum name, exif_model, "
"exif_iso, exif_focallength, exif_exposuretime, exif_aperture, "
"exif_datetimeoriginal, liceses\n";
for (auto i = 0; i < numfiles; i++) {
char in_file_i[64]; char in_file_i[64];
snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i); snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i);
// Open the file // Open the file
FILE *fp = fopen(in_file_i, "rb"); FILE* fp = fopen(in_file_i, "rb");
// Check if the file was opened successfully // Check if the file was opened successfully
if (!fp) { if (!fp) {
std::cerr << "Error: unable to open file" << std::endl; std::cerr << "Error: unable to open file"
<< std::endl;
return -1; return -1;
} }
@ -61,7 +60,8 @@ int main(int argc, char **argv) {
// Check if the document is valid // Check if the document is valid
if (doc.HasParseError()) { if (doc.HasParseError()) {
std::cerr << "Error: failed to parse JSON document" << std::endl; std::cerr << "Error: failed to parse JSON document"
<< std::endl;
fclose(fp); fclose(fp);
return 1; return 1;
} }
@ -69,14 +69,16 @@ int main(int argc, char **argv) {
// Close the file // Close the file
fclose(fp); fclose(fp);
if (doc.HasMember("response") && doc["response"].IsObject()) { if (doc.HasMember("response")
const rapidjson::Value &response = doc["response"]; && doc["response"].IsObject()) {
if (response.HasMember("docs") && response["docs"].IsArray()) { const rapidjson::Value& response = doc["response"];
if (response.HasMember("docs")
&& response["docs"].IsArray()) {
for (rapidjson::SizeType i = 0; i < response["docs"].Size(); i++) { for (rapidjson::SizeType i = 0; i < response["docs"].Size(); i++) {
// std::cout<<i<<std::endl; //std::cout<<i<<std::endl;
int yearb = -1, yeare = -1; int yearb = -1, yeare = -1;
std::string title = ""; std::string title = "";
const rapidjson::Value &article = response["docs"][i]; const rapidjson::Value& article = response["docs"][i];
std::string article_id = ""; std::string article_id = "";
if (article.HasMember("identifier.id")) { if (article.HasMember("identifier.id")) {
@ -88,20 +90,18 @@ int main(int argc, char **argv) {
} }
if (article.HasMember("artifact.ingress.production.fromYear")) { if (article.HasMember("artifact.ingress.production.fromYear")) {
const rapidjson::Value &yearbval = const rapidjson::Value& yearbval = article["artifact.ingress.production.fromYear"];
article["artifact.ingress.production.fromYear"];
yearb = yearbval.GetInt(); yearb = yearbval.GetInt();
} }
if (article.HasMember("artifact.ingress.production.toYear")) { if (article.HasMember("artifact.ingress.production.toYear")) {
const rapidjson::Value &yeareval = const rapidjson::Value& yeareval = article["artifact.ingress.production.toYear"];
article["artifact.ingress.production.toYear"];
yeare = yeareval.GetInt(); yeare = yeareval.GetInt();
} }
std::string unique_id = ""; std::string unique_id = "";
if (article.HasMember("artifact.uniqueId")) { if (article.HasMember("artifact.uniqueId")) {
const rapidjson::Value &uniqidval = article["artifact.uniqueId"]; const rapidjson::Value& uniqidval = article["artifact.uniqueId"];
if (uniqidval.IsString()) { if (uniqidval.IsString()) {
unique_id = uniqidval.GetString(); unique_id = uniqidval.GetString();
} }
@ -111,77 +111,67 @@ int main(int argc, char **argv) {
int picid = -1; int picid = -1;
std::string picdim = ""; std::string picdim = "";
if (article.HasMember("artifact.hasPictures") && if (article.HasMember("artifact.hasPictures") && article["artifact.hasPictures"].GetBool() == true) {
article["artifact.hasPictures"].GetBool() == true) {
if (article.HasMember("artifact.defaultMediaIdentifier")) { if (article.HasMember("artifact.defaultMediaIdentifier")) {
const rapidjson::Value &mediaidv = const rapidjson::Value& mediaidv = article["artifact.defaultMediaIdentifier"];
article["artifact.defaultMediaIdentifier"];
if (mediaidv.IsString()) { if (mediaidv.IsString()) {
mediaid = mediaidv.GetString(); mediaid = mediaidv.GetString();
} }
} }
if (article.HasMember("artifact.defaultPictureIndex") && if (article.HasMember("artifact.defaultPictureIndex") && article["artifact.defaultPictureIndex"].IsInt()) {
article["artifact.defaultPictureIndex"].IsInt()) {
picid = article["artifact.defaultPictureIndex"].GetInt(); picid = article["artifact.defaultPictureIndex"].GetInt();
} }
if (article.HasMember("artifact.defaultPictureDimension") && if (article.HasMember("artifact.defaultPictureDimension") && article["artifact.defaultPictureDimension"].IsString()) {
article["artifact.defaultPictureDimension"].IsString()) {
picdim = article["artifact.defaultPictureDimension"].GetString(); picdim = article["artifact.defaultPictureDimension"].GetString();
} }
} }
// image file is at // image file is at https://mm.dimu.org/image/<identifier>?dimension=<dim>
// https://mm.dimu.org/image/<identifier>?dimension=<dim> // as per documentation available at http://api.dimu.org/doc/public_api.html
// as per documentation available at
// http://api.dimu.org/doc/public_api.html
char imglink[128]; char imglink[128];
snprintf(imglink, sizeof imglink, "https://mm.dimu.org/image/%s", snprintf(imglink, sizeof imglink, "https://mm.dimu.org/image/%s", mediaid.c_str());
mediaid.c_str());
char itemlink[128]; char itemlink[128];
snprintf(itemlink, sizeof itemlink, "https://digitaltmuseum.se/%s", snprintf(itemlink, sizeof itemlink, "https://digitaltmuseum.se/%s", unique_id.c_str());
unique_id.c_str());
char descfilename[64]; char descfilename[64];
snprintf(descfilename, sizeof descfilename, "%s.txt", snprintf(descfilename, sizeof descfilename, "%s.txt", article_id.c_str());
article_id.c_str());
std::string description = ""; std::string description = "";
if (article.HasMember("artifact.ingress.description")) { if (article.HasMember("artifact.ingress.description")) {
const rapidjson::Value &descv = const rapidjson::Value& descv = article["artifact.ingress.description"];
article["artifact.ingress.description"];
if (descv.IsString()) { if (descv.IsString()) {
description = descv.GetString(); description = descv.GetString();
std::replace(description.begin(), description.end(), ',', ':'); std::replace( description.begin(), description.end(), ',', ':');
std::replace(description.begin(), description.end(), '\n', ' '); std::replace( description.begin(), description.end(), '\n', ' ');
std::ofstream desc_file; std::ofstream desc_file;
desc_file.open(descfilename); desc_file.open(descfilename);
desc_file << description; desc_file<<description;
desc_file.close(); desc_file.close();
} }
} }
std::string subjects; std::string subjects;
if (article.HasMember("artifact.ingress.subjects")) { if (article.HasMember("artifact.ingress.subjects")) {
const rapidjson::Value &subjv = const rapidjson::Value& subjv = article["artifact.ingress.subjects"];
article["artifact.ingress.subjects"];
if (subjv.IsArray()) { if (subjv.IsArray()) {
for (auto i = 0; i < subjv.GetArray().Size(); i++) { for (auto i = 0; i<subjv.GetArray().Size(); i++) {
subjects += subjv[i].GetString(); subjects += subjv[i].GetString();
subjects += " "; subjects += " ";
} }
std::replace(description.begin(), description.end(), ',', ':'); std::replace( description.begin(), description.end(), ',', ':');
std::replace(description.begin(), description.end(), '\n', ' '); std::replace( description.begin(), description.end(), '\n', ' ');
} }
} }
std::string publishdate; std::string publishdate;
if (article.HasMember("artifact.publishedDate")) { if (article.HasMember("artifact.publishedDate")) {
const rapidjson::Value &pubdatev = const rapidjson::Value& pubdatev = article["artifact.publishedDate"];
article["artifact.publishedDate"];
if (pubdatev.IsString()) { if (pubdatev.IsString()) {
publishdate = pubdatev.GetString(); publishdate = pubdatev.GetString();
} }
@ -189,49 +179,43 @@ int main(int argc, char **argv) {
std::string license = ""; std::string license = "";
if (article.HasMember("artifact.ingress.license")) { if (article.HasMember("artifact.ingress.license")) {
if (article["artifact.ingress.license"].IsString()) if (article[ "artifact.ingress.license"].IsString())
license = article["artifact.ingress.license"].GetString(); license = article[ "artifact.ingress.license"].GetString();
else if (article["artifact.ingress.license"].IsArray()) { else if (article[ "artifact.ingress.license"].IsArray()) {
for (auto i = 0; i < article["artifact.ingress.license"].Size(); for (auto i = 0; i<article[ "artifact.ingress.license"].Size(); i++) {
i++) { license += article[ "artifact.ingress.license"][i].GetString();
license += article["artifact.ingress.license"][i].GetString();
} }
} }
} }
bool is_cc_license = false; bool is_cc_license = false;
if (license.find("CC by") != std::string::npos) { if (license.find("CC by")!= std::string::npos) {
is_cc_license = true; is_cc_license = true;
char imgfetch[256]; char imgfetch[256];
snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", imglink, article_id.c_str(), mediaid.c_str());
imglink, article_id.c_str(), mediaid.c_str()); if (debug_level > 0) std::cout<<imgfetch<<std::endl;
if (debug_level > 0)
std::cout << imgfetch << std::endl;
if (download_mode) { if (download_mode) {
std::cout << "running in download mode\n"; std::cout<<"running in download mode\n";
system(imgfetch); system (imgfetch);
if (debug_level > 0) if (debug_level >0) std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n";
std::cout << "Found CC by license, fetched the image for "
<< mediaid << "\n";
} }
} }
if (!download_mode) { if (!download_mode) {
if (is_cc_license) { if (is_cc_license) {
char exif_file[128]; char exif_file[128];
snprintf(exif_file, sizeof exif_file, "%s-%s.jpeg.exif.json", snprintf(exif_file, sizeof exif_file, "%s-%s.jpeg.exif.json", article_id.c_str(), mediaid.c_str());
article_id.c_str(), mediaid.c_str());
// Open the file // Open the file
FILE *exiffp = fopen(exif_file, "rb"); FILE* exiffp = fopen(exif_file, "rb");
if (!exiffp) { if (!exiffp) {
std::cerr << "Error: unable to open file" std::cerr << "Error: unable to open file" << std::string(exif_file)
<< std::string(exif_file) << std::endl; << std::endl;
return -1; return -1;
} }
char exifreadBuffer[4096]; char exifreadBuffer[4096];
rapidjson::FileReadStream exifis(exiffp, exifreadBuffer, rapidjson::FileReadStream exifis(exiffp, exifreadBuffer, sizeof(exifreadBuffer));
sizeof(exifreadBuffer));
rapidjson::Document exifdoc; rapidjson::Document exifdoc;
exifdoc.ParseStream(exifis); exifdoc.ParseStream(exifis);
@ -242,57 +226,57 @@ int main(int argc, char **argv) {
} }
fclose(exiffp); fclose(exiffp);
std::string exif_model, exif_iso, exif_focallength, std::string exif_model, exif_iso, exif_focallength, exif_exposuretime, exif_aperture, exif_fnumber, exif_datetimeoriginal;
exif_exposuretime, exif_aperture, exif_fnumber,
exif_datetimeoriginal;
if (exifdoc.IsArray()) { if (exifdoc.IsArray()) {
if (exifdoc[0].IsObject()) { if (exifdoc[0].IsObject()) {
if (exifdoc[0].GetObject().HasMember("Model")) { if (exifdoc[0].GetObject().HasMember("Model")) {
exif_model = exifdoc[0].GetObject()["Model"].GetString(); exif_model = exifdoc[0].GetObject()["Model"].GetString();
} }
if (exifdoc[0].GetObject().HasMember("ISO")) { if (exifdoc[0].GetObject().HasMember("ISO")) {
exif_iso = exif_iso = std::to_string(exifdoc[0].GetObject()["ISO"].GetInt());
std::to_string(exifdoc[0].GetObject()["ISO"].GetInt());
} }
if (exifdoc[0].GetObject().HasMember("FocalLength")) { if (exifdoc[0].GetObject().HasMember("FocalLength")) {
exif_focallength = exif_focallength = exifdoc[0].GetObject()["FocalLength"].GetString();
exifdoc[0].GetObject()["FocalLength"].GetString();
} }
if (exifdoc[0].GetObject().HasMember("ExposureTime")) { if (exifdoc[0].GetObject().HasMember("ExposureTime")) {
if (exifdoc[0].GetObject()["ExposureTime"].IsString()) if (exifdoc[0].GetObject()["ExposureTime"].IsString())
exif_exposuretime = exif_exposuretime = exifdoc[0].GetObject()["ExposureTime"].GetString();
exifdoc[0].GetObject()["ExposureTime"].GetString();
} }
if (exifdoc[0].GetObject().HasMember("ApertureValue")) { if (exifdoc[0].GetObject().HasMember("ApertureValue")) {
exif_aperture = std::to_string( exif_aperture = std::to_string(exifdoc[0].GetObject()["ApertureValue"].GetDouble());
exifdoc[0].GetObject()["ApertureValue"].GetDouble());
} }
if (exifdoc[0].GetObject().HasMember("DateTimeOriginal")) { if (exifdoc[0].GetObject().HasMember("DateTimeOriginal")) {
exif_datetimeoriginal = exif_datetimeoriginal = exifdoc[0].GetObject()["DateTimeOriginal"].GetString();
exifdoc[0].GetObject()["DateTimeOriginal"].GetString();
} }
} }
} }
out_file1 <<
out_file1<<
/* article["artifact.defaultMediaIdentifier"].GetString()<< /* article["artifact.defaultMediaIdentifier"].GetString()<<
", "<< article["artifact.defaultPictureIndex"].GetInt()<< ", "<< article["artifact.defaultPictureIndex"].GetInt()<<
", "<< ", "<< article["artifact.defaultPictureDimension"].GetString()<<
article["artifact.defaultPictureDimension"].GetString()<<
", "<<*/ ", "<<*/
article_id article_id <<
<< ", " << title << ", " << yearstr(yearb) << ", " ", "<< title <<
<< yearstr(yeare) << ", " << description << ", " ", "<< yearstr(yearb) <<
<< itemlink << ", " << imglink << ", " ", "<< yearstr(yeare) <<
<< article_id + "-" + mediaid + ".jpeg" ", "<< description <<
<< ", " << subjects << ", " << publishdate << ", " ", "<< itemlink <<
<< "Länge leve Kosta! exhibition" ", "<< imglink <<
<< ", " ", "<< article_id+"-"+ mediaid +".jpeg" <<
<< "Kulturparken Småland / Smålands museum" ", "<< subjects <<
<< ", " << exif_model << ", " << exif_iso << ", " ", "<< publishdate <<
<< exif_focallength << ", " << exif_exposuretime << ", " ", "<<"Länge leve Kosta! exhibition" <<
<< exif_aperture << ", " << exif_datetimeoriginal ", "<<"Kulturparken Småland / Smålands museum" <<
<< ", " << license << std::endl; ", "<< exif_model <<
", "<< exif_iso <<
", "<< exif_focallength <<
", "<< exif_exposuretime <<
", "<< exif_aperture <<
", "<< exif_datetimeoriginal<<
", "<< license <<
std::endl;
} }
} }
} }
@ -302,3 +286,4 @@ int main(int argc, char **argv) {
out_file1.close(); out_file1.close();
return 0; return 0;
} }