Support fetching accn number

This commit is contained in:
Ashik K 2023-10-04 20:25:41 +02:00
parent fcf58d888b
commit ba077bb64a
1 changed files with 31 additions and 2 deletions

View File

@ -8,7 +8,7 @@ if len(sys.argv) != 6:
print("Usage: python titlegen.py inputfilename.csv startlineno endlineno outputfilename.csv API_key")
sys.exit(1)
# set the api key from the command line argument
nonempty_artnr = 0
startlineno = int(sys.argv[2])
endlineno = int(sys.argv[3])
lineno = 0
@ -25,6 +25,7 @@ with open(sys.argv[1], "r") as file:
newrow = row
row.append("Materials")
row.append("Dimensions")
row.append("Accession Number")
writer.writerow(newrow)
continue
@ -34,7 +35,7 @@ with open(sys.argv[1], "r") as file:
# Extract the string after the last slash in the itemurl
itemid = itemurl.rsplit('/', 1)[-1]
print(itemid)
print("=====================================" + str(lineno) + "=====================================")
# Fetch the xml data from "https://api.dimu.org/api/artifact?unique_id=<itemid>&mapping=ESE&api.key=<api_key>"
url = "https://api.dimu.org/api/artifact?unique_id=" + itemid + "&mapping=ESE&api.key=" + api_key
print(url)
@ -48,6 +49,29 @@ with open(sys.argv[1], "r") as file:
from xml.dom.minidom import parse, parseString
dom = parseString(data)
# Find the accession number among the <dc:description> elements: the first
# description whose text contains "Art.nr." supplies it. artnr stays ""
# when no description carries the marker.
artnr = ""
description_artnr = ""
description_list = dom.getElementsByTagName('dc:description')
for description in description_list:
    # An empty <dc:description/> has no first child, and a non-text first
    # child has a nodeValue of None; skip both rather than raising.
    first_child = description.firstChild
    descriptionitem = first_child.nodeValue if first_child is not None else None
    if not descriptionitem or "Art.nr." not in descriptionitem:
        continue
    description_artnr = descriptionitem
    # Split on the marker itself (no trailing space) so the split agrees with
    # the membership test above; text such as "Art.nr.123" previously passed
    # the test and then raised IndexError. Surrounding whitespace and any
    # terminating '.' are trimmed from the value.
    artnr = description_artnr.split("Art.nr.")[1].strip().rstrip('.').strip()
    print ("Found accn number and it is: ")
    print(artnr)
    break
# Extract the fields in the xml data that are inside <dcterms:medium> tags as string variables
allmediums = ""
medium_list = dom.getElementsByTagName('dcterms:medium')
@ -66,11 +90,16 @@ with open(sys.argv[1], "r") as file:
allextents += extentitem + " "
print(allextents)
if (artnr != ""):
nonempty_artnr+=1
# Generate a new row: the previous row with the new fields medium, extent and accession number appended
newrow = row
newrow.append(allmediums)
newrow.append(allextents)
newrow.append(artnr)
writer.writerow(newrow)
outfile.close()
file.close()
print("Number of non-empty accession numbers: " + str(nonempty_artnr))