wip: fetcher and parser for Wikimedia museum data project
This commit is contained in:
commit
b887e35554
|
@ -0,0 +1,19 @@
|
||||||
|
cmake_minimum_required(VERSION 3.14)
|
||||||
|
project(testCurlPP LANGUAGES CXX)
|
||||||
|
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||||
|
set(CMAKE_CXX_STANDARD 11)
|
||||||
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
|
|
||||||
|
include(FindPkgConfig)
|
||||||
|
pkg_check_modules(CURLPP REQUIRED curlpp)
|
||||||
|
|
||||||
|
add_executable(fetch_all
|
||||||
|
fetcher.cc
|
||||||
|
)
|
||||||
|
add_executable(parse_all
|
||||||
|
parser.cc
|
||||||
|
)
|
||||||
|
|
||||||
|
target_link_libraries(fetch_all
|
||||||
|
${CURLPP_LDFLAGS}
|
||||||
|
)
|
|
@ -0,0 +1,52 @@
|
||||||
|
#include <sstream>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#include <curlpp/cURLpp.hpp>
|
||||||
|
#include <curlpp/Easy.hpp>
|
||||||
|
#include <curlpp/Options.hpp>
|
||||||
|
#include <curlpp/Exception.hpp>
|
||||||
|
|
||||||
|
#define DEBUG_LEVEL -1
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
if (argc !=2 ) {
|
||||||
|
std::cout<<"Please run as "<<argv[0] <<" <api key>"<<std::endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
char url[256], outfilename[64];
|
||||||
|
for (int i = 0; i<4; i++) {
|
||||||
|
snprintf(url, sizeof url, "https://api.dimu.org/api/solr/select?q=Kosta&wt=json&fq=(artifact.exhibitionUids:\"336485B3-1E4E-443C-8800-F93BDD12520A\")&start=%d&rows=100&api.key=%s", i*100, argv[1]);
|
||||||
|
snprintf(outfilename, sizeof outfilename, "data_%d.json", i);
|
||||||
|
|
||||||
|
/*if (DEBUG_LEVEL > 0) */std::cout<<"url to fetch is "<<std::endl<<url<<std::endl;
|
||||||
|
std::cout<<"writing output to "<<outfilename<<std::endl;
|
||||||
|
|
||||||
|
try {
|
||||||
|
curlpp::Cleanup cleaner;
|
||||||
|
curlpp::Easy request1;
|
||||||
|
|
||||||
|
// Setting the URL to retrive.
|
||||||
|
request1.setOpt(new curlpp::options::Url(url));
|
||||||
|
|
||||||
|
if (DEBUG_LEVEL > 0) std::cout << request1 << std::endl;
|
||||||
|
|
||||||
|
std::ofstream outfile;
|
||||||
|
outfile.open (outfilename);
|
||||||
|
outfile << curlpp::options::Url(url) << std::endl;
|
||||||
|
outfile.close();
|
||||||
|
}
|
||||||
|
catch ( curlpp::LogicError & e ) {
|
||||||
|
if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl;
|
||||||
|
}
|
||||||
|
catch ( curlpp::RuntimeError & e ) {
|
||||||
|
if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,174 @@
|
||||||
|
#include <rapidjson/document.h>
|
||||||
|
#include <rapidjson/filereadstream.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
std::string yearstr(int year) {
|
||||||
|
return year==-1? "Unknown": std::to_string(year);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
char filename[64];
|
||||||
|
snprintf(filename, sizeof filename, "out.csv");
|
||||||
|
std::ofstream out_file1;
|
||||||
|
out_file1.open(filename);
|
||||||
|
out_file1<<"id, Caption/title, production start year, end year, Description, Source, image_filename, collection name, museum name, linceses\n";
|
||||||
|
for (auto i = 0; i<4; i++) {
|
||||||
|
char in_file_i[64];
|
||||||
|
snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i);
|
||||||
|
// Open the file
|
||||||
|
FILE* fp = fopen(in_file_i, "rb");
|
||||||
|
// Check if the file was opened successfully
|
||||||
|
if (!fp) {
|
||||||
|
std::cerr << "Error: unable to open file"
|
||||||
|
<< std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read the file
|
||||||
|
char readBuffer[192000];
|
||||||
|
rapidjson::FileReadStream is(fp, readBuffer, sizeof(readBuffer));
|
||||||
|
|
||||||
|
// Parse the JSON document
|
||||||
|
rapidjson::Document doc;
|
||||||
|
doc.ParseStream(is);
|
||||||
|
|
||||||
|
// Check if the document is valid
|
||||||
|
if (doc.HasParseError()) {
|
||||||
|
std::cerr << "Error: failed to parse JSON document"
|
||||||
|
<< std::endl;
|
||||||
|
fclose(fp);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close the file
|
||||||
|
fclose(fp);
|
||||||
|
|
||||||
|
if (doc.HasMember("response")
|
||||||
|
&& doc["response"].IsObject()) {
|
||||||
|
const rapidjson::Value& response = doc["response"];
|
||||||
|
if (response.HasMember("docs")
|
||||||
|
&& response["docs"].IsArray()) {
|
||||||
|
std::cout<<"yes, response is an array"<<std::endl;
|
||||||
|
for (rapidjson::SizeType i = 0; i < response["docs"].Size(); i++) {
|
||||||
|
//std::cout<<i<<std::endl;
|
||||||
|
int yearb = -1, yeare = -1;
|
||||||
|
std::string title = "";
|
||||||
|
const rapidjson::Value& article = response["docs"][i];
|
||||||
|
|
||||||
|
std::string article_id = "";
|
||||||
|
if (article.HasMember("identifier.id")) {
|
||||||
|
article_id = article["identifier.id"].GetString();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.ingress.title")) {
|
||||||
|
title = article["artifact.ingress.title"].GetString();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.ingress.production.fromYear")) {
|
||||||
|
const rapidjson::Value& yearbval = article["artifact.ingress.production.fromYear"];
|
||||||
|
yearb = yearbval.GetInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.ingress.production.toYear")) {
|
||||||
|
const rapidjson::Value& yeareval = article["artifact.ingress.production.toYear"];
|
||||||
|
yeare = yeareval.GetInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string mediaid = "";
|
||||||
|
int picid = -1;
|
||||||
|
std::string picdim = "";
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.hasPictures") && article["artifact.hasPictures"].GetBool() == true) {
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.defaultMediaIdentifier")) {
|
||||||
|
const rapidjson::Value& mediaidv = article["artifact.defaultMediaIdentifier"];
|
||||||
|
if (mediaidv.IsString()) {
|
||||||
|
mediaid = mediaidv.GetString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.defaultPictureIndex") && article["artifact.defaultPictureIndex"].IsInt()) {
|
||||||
|
picid = article["artifact.defaultPictureIndex"].GetInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (article.HasMember("artifact.defaultPictureDimension") && article["artifact.defaultPictureDimension"].IsString()) {
|
||||||
|
picdim = article["artifact.defaultPictureDimension"].GetString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// image file is at https://mm.dimu.org/image/<identifier>?dimension=<dim>
|
||||||
|
// as per documentation available at http://api.dimu.org/doc/public_api.html
|
||||||
|
char imglink[128];
|
||||||
|
snprintf(imglink, sizeof imglink, "https://mm.dimu.org/image/%s", mediaid.c_str());
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
char descfilename[64];
|
||||||
|
snprintf(descfilename, sizeof descfilename, "%s.txt", article_id.c_str());
|
||||||
|
|
||||||
|
std::string description = "";
|
||||||
|
if (article.HasMember("artifact.ingress.description")) {
|
||||||
|
const rapidjson::Value& descv = article["artifact.ingress.description"];
|
||||||
|
if (descv.IsString()) {
|
||||||
|
description = descv.GetString();
|
||||||
|
std::replace( description.begin(), description.end(), ',', ':');
|
||||||
|
std::replace( description.begin(), description.end(), '\n', ' ');
|
||||||
|
std::ofstream desc_file;
|
||||||
|
desc_file.open(descfilename);
|
||||||
|
desc_file<<description;
|
||||||
|
desc_file.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string license = "";
|
||||||
|
if (article.HasMember("artifact.ingress.license")) {
|
||||||
|
if (article[ "artifact.ingress.license"].IsString())
|
||||||
|
license = article[ "artifact.ingress.license"].GetString();
|
||||||
|
else if (article[ "artifact.ingress.license"].IsArray()) {
|
||||||
|
for (auto i = 0; i<article[ "artifact.ingress.license"].Size(); i++) {
|
||||||
|
license += article[ "artifact.ingress.license"][i].GetString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_cc_license = false;
|
||||||
|
if (license.find("CC by")!= std::string::npos) {
|
||||||
|
is_cc_license = true;
|
||||||
|
char imgfetch[256];
|
||||||
|
snprintf(imgfetch, sizeof imgfetch, "wget %s -O \"%s-%s.jpeg\"", imglink, article_id.c_str(), mediaid.c_str());
|
||||||
|
std::cout<<imgfetch<<std::endl;
|
||||||
|
// system (imgfetch);
|
||||||
|
std::cout<<"Found CC by license, fetched the image for "<< mediaid<<"\n";
|
||||||
|
}
|
||||||
|
if (is_cc_license) {
|
||||||
|
out_file1<<
|
||||||
|
/* article["artifact.defaultMediaIdentifier"].GetString()<<
|
||||||
|
", "<< article["artifact.defaultPictureIndex"].GetInt()<<
|
||||||
|
", "<< article["artifact.defaultPictureDimension"].GetString()<<
|
||||||
|
", "<<*/
|
||||||
|
article_id <<
|
||||||
|
", "<< title <<
|
||||||
|
", "<< yearstr(yearb) <<
|
||||||
|
", "<< yearstr(yeare) <<
|
||||||
|
", "<< descfilename <<
|
||||||
|
", "<< imglink <<
|
||||||
|
", "<< article_id+"-"+ mediaid +".jpeg" <<
|
||||||
|
", "<<"Länge leve Kosta! exhibition" <<
|
||||||
|
", "<<"Kulturparken Småland / Smålands museum" <<
|
||||||
|
", "<< license <<
|
||||||
|
std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out_file1.close();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue