From b887e35554a4d274cba73d50d0de21601b395e52 Mon Sep 17 00:00:00 2001 From: Ashik K Date: Wed, 8 Mar 2023 14:18:12 +0100 Subject: [PATCH] wip: fetcher and parser for Wikimedia museum data project --- CMakeLists.txt | 19 ++++++ fetcher.cc | 52 +++++++++++++++ parser.cc | 174 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 fetcher.cc create mode 100644 parser.cc diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..d60d910 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.14) +project(testCurlPP LANGUAGES CXX) +set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +include(FindPkgConfig) +pkg_check_modules(CURLPP REQUIRED curlpp) + +add_executable(fetch_all + fetcher.cc +) +add_executable(parse_all + parser.cc +) + +target_link_libraries(fetch_all + ${CURLPP_LDFLAGS} +) diff --git a/fetcher.cc b/fetcher.cc new file mode 100644 index 0000000..d548170 --- /dev/null +++ b/fetcher.cc @@ -0,0 +1,52 @@ +#include +#include + +#include +#include + +#include +#include +#include +#include + +#define DEBUG_LEVEL -1 + +int main(int argc, char *argv[]) +{ + if (argc !=2 ) { + std::cout<<"Please run as "<"< 0) */std::cout<<"url to fetch is "< 0) std::cout << request1 << std::endl; + + std::ofstream outfile; + outfile.open (outfilename); + outfile << curlpp::options::Url(url) << std::endl; + outfile.close(); + } + catch ( curlpp::LogicError & e ) { + if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl; + } + catch ( curlpp::RuntimeError & e ) { + if (DEBUG_LEVEL > 0) std::cout << e.what() << std::endl; + } + + +} + return 0; +} diff --git a/parser.cc b/parser.cc new file mode 100644 index 0000000..b5dbd61 --- /dev/null +++ b/parser.cc @@ -0,0 +1,174 @@ +#include +#include + +#include +#include +#include +#include +#include + +std::string yearstr(int year) { + return year==-1? "Unknown": std::to_string(year); +} + +int main() +{ + char filename[64]; + snprintf(filename, sizeof filename, "out.csv"); + std::ofstream out_file1; + out_file1.open(filename); + out_file1<<"id, Caption/title, production start year, end year, Description, Source, image_filename, collection name, museum name, linceses\n"; + for (auto i = 0; i<4; i++) { + char in_file_i[64]; + snprintf(in_file_i, sizeof in_file_i, "data_%d.json", i); + // Open the file + FILE* fp = fopen(in_file_i, "rb"); + // Check if the file was opened successfully + if (!fp) { + std::cerr << "Error: unable to open file" + << std::endl; + return -1; + } + + // Read the file + char readBuffer[192000]; + rapidjson::FileReadStream is(fp, readBuffer, sizeof(readBuffer)); + + // Parse the JSON document + rapidjson::Document doc; + doc.ParseStream(is); + + // Check if the document is valid + if (doc.HasParseError()) { + std::cerr << "Error: failed to parse JSON document" + << std::endl; + fclose(fp); + return 1; + } + + // Close the file + fclose(fp); + + if (doc.HasMember("response") + && doc["response"].IsObject()) { + const rapidjson::Value& response = doc["response"]; + if (response.HasMember("docs") + && response["docs"].IsArray()) { + std::cout<<"yes, response is an array"<?dimension= + // as per documentation available at http://api.dimu.org/doc/public_api.html + char imglink[128]; + snprintf(imglink, sizeof imglink, "https://mm.dimu.org/image/%s", mediaid.c_str()); + + + + char descfilename[64]; + snprintf(descfilename, sizeof descfilename, "%s.txt", article_id.c_str()); + + std::string description = ""; + if (article.HasMember("artifact.ingress.description")) { + const rapidjson::Value& descv = article["artifact.ingress.description"]; + if (descv.IsString()) { + description = descv.GetString(); + std::replace( description.begin(), description.end(), ',', ':'); + std::replace( description.begin(), description.end(), '\n', ' '); + std::ofstream desc_file; + desc_file.open(descfilename); + desc_file<