# DryadRepository.py
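"""Harvester plugin for the Dryad data repository (datadryad.org).

Crawls the Dryad v2 search API for recently modified datasets, keeps only
datasets with at least one author whose ROR affiliation is Canadian, and
maps the Dryad JSON into the harvester's internal record format.
"""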
from harvester.HarvestRepository import HarvestRepository
from harvester.rate_limited import rate_limited
import requests
import time
import json
from datetime import datetime
import urllib.parse  # imported explicitly: urllib.parse.quote_plus is used below


class DryadRepository(HarvestRepository):
    """ Dryad Repository """

    def setRepoParams(self, repoParams):
        self.metadataprefix = "dryad"
        self.default_language = "en"
        super(DryadRepository, self).setRepoParams(repoParams)
        self.domain_metadata = []
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json"
        }
        self.ror_data = None
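
    # Crawl entry point: registers/updates this repo in the harvester
    # database, then pages through the Dryad search API and writes one
    # header row per dataset for later per-record updates.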
    def _crawl(self):
        if not self.load_ror_data():
            self.logger.error("ROR data could not be fetched from remote")
            return
        if not self.ror_data:
            self.logger.error("ROR data could not be loaded from the local JSON file")
            return
        kwargs = {
            "repo_id": self.repository_id,
            "repo_url": self.url,
            "repo_set": self.set,
            "repo_name": self.name,
            "repo_name_fr": self.name_fr,
            "repo_type": "dryad",
            "enabled": self.enabled,
            "repo_thumbnail": self.thumbnail,
            "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors,
            "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems,
            "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days,
            "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name,
            "repo_registry_uri": self.repo_registry_uri
        }
        self.repository_id = self.db.update_repo(**kwargs)
        try:
            # Initial API call
            url = self.url + "/search/"
            # Check for records modified since 7 days before the last crawl
            mod_since = max(self.last_crawl - 60 * 60 * 24 * 7, 0)
            querystring = {"per_page": "100",
                           "modifiedSince": datetime.strftime(datetime.fromtimestamp(mod_since), "%Y-%m-%dT%H:%M:%SZ")}
            r = requests.request("GET", url, headers=self.headers, params=querystring)
            response = r.json()
            records = response["_embedded"]["stash:datasets"]
            item_count = 0
            total_dryad_item_count = response["total"]  # total matching datasets reported by the API
            while item_count < total_dryad_item_count:
                for record in records:
                    if "_links" in record and record["_links"]:
                        item_identifier = record["identifier"]
                        self.db.write_header(item_identifier, self.item_url_pattern, self.repository_id)
                        item_count = item_count + 1
                        if item_count % self.update_log_after_numitems == 0:
                            tdelta = time.time() - self.tstart + 0.1
                            self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(
                                item_count, self.formatter.humanize(tdelta), item_count / tdelta))
                if "next" in response["_links"]:
                    # "next" hrefs include the /api/v2 prefix, so join them to the bare host
                    url = self.url.replace("/api/v2", "") + response["_links"]["next"]["href"]
                    r = requests.request("GET", url, headers=self.headers, params=querystring)
                    response = r.json()
                    records = response["_embedded"]["stash:datasets"]
                else:
                    break
            self.logger.info("Found {} items in feed".format(item_count))
            return True
        except Exception as e:
            self.logger.error("Updating Dryad Repository failed: {} {}".format(type(e).__name__, e))
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True
            return False
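
    # Map a Dryad dataset (JSON) to the harvester's OAI-style record dict.
    # Returns None if the record has no identifier, False if no author has a
    # Canadian ROR affiliation, and the populated record dict otherwise.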
    def format_dryad_to_oai(self, dryad_record):
        record = {}
        if "identifier" not in dryad_record:
            return None
        record["identifier"] = dryad_record["identifier"]
        record["item_url"] = "https://doi.org/" + dryad_record["identifier"].split("doi:")[1]

        # Keep only datasets with at least one Canadian author affiliation
        is_canadian = False
        for author in dryad_record["authors"]:
            if "affiliationROR" in author and author["affiliationROR"]:
                try:
                    ror_record = self.ror_data[author["affiliationROR"]]
                    try:
                        if ror_record["country"]["country_code"] == "CA":
                            is_canadian = True
                            break
                    except KeyError:
                        self.logger.error("ROR record {} missing country".format(author["affiliationROR"]))
                        continue
                except KeyError:
                    self.logger.info("ROR ID {} does not exist".format(author["affiliationROR"]))
                    continue
        if not is_canadian:
            return False

        record["creator"] = []
        record["affiliation"] = []
        for creator in dryad_record["authors"]:
            creatorName = ""
            if "lastName" in creator and creator["lastName"]:
                creatorName = creator["lastName"]
                if "firstName" in creator and creator["firstName"]:
                    creatorName = creatorName + ", " + creator["firstName"]
            if creatorName:
                record["creator"].append(creatorName)
            affiliation = ""
            if "affiliation" in creator and creator["affiliation"]:
                affiliation = creator["affiliation"]
                if "affiliationROR" in creator and creator["affiliationROR"]:
                    affiliation = {"affiliation_name": affiliation, "affiliation_ror": creator["affiliationROR"]}
                if affiliation not in record["affiliation"]:
                    record["affiliation"].append(affiliation)
        if len(record["affiliation"]) == 0:
            record.pop("affiliation")

        record["title"] = dryad_record["title"]
        record["title_fr"] = ""
        record["series"] = ""
        try:
            record["pub_date"] = dryad_record["publicationDate"]
        except KeyError:
            # Fall back to the last modification date when publicationDate is absent
            record["pub_date"] = dryad_record["lastModificationDate"]
        record["description"] = dryad_record.get("abstract", "")
        record["rights"] = dryad_record["license"]
        if "keywords" in dryad_record and dryad_record["keywords"]:
            record["tags"] = dryad_record["keywords"]
        if "locations" in dryad_record and dryad_record["locations"]:
            record["geoplaces"] = []
            for location in dryad_record["locations"]:
                record["geoplaces"].append({"place_name": location["place"]})
        return record
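
    # Fetch one dataset's full metadata from Dryad, then write, delete, or
    # touch the corresponding harvester record. Throttled via @rate_limited(5)
    # because Dryad replies with HTTP 429 when its rate limit is exceeded.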
    @rate_limited(5)
    def _update_record(self, record):
        try:
            record_url = self.url + "/datasets/" + urllib.parse.quote_plus(record["local_identifier"])
            try:
                item_response = requests.get(record_url)
                if item_response.status_code == 200:  # Dryad sends code 429 for rate limiting
                    dryad_record = json.loads(item_response.text)
                else:
                    dryad_record = {}
            except Exception as e:
                # Exception means this URL was not found
                self.logger.error("Fetching record {} failed: {} {}".format(record_url, type(e).__name__, e))
                return True
            oai_record = self.format_dryad_to_oai(dryad_record)
            if oai_record:
                self.db.write_record(oai_record, self)
            else:
                if oai_record is False:
                    # This dataset is not Canadian; remove it from the results
                    self.db.delete_record(record)
                else:
                    # Some other problem; this record will be updated by a future crawl
                    self.db.touch_record(record)
            return True
        except Exception as e:
            self.logger.error("Updating record {} failed: {} {}".format(record["local_identifier"], type(e).__name__, e))
            if self.dump_on_failure:
                try:
                    print(dryad_record)
                except Exception:
                    pass
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True
            return False
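
    # Ensure ROR data is available before deferring to the base class's
    # stale-record refresh.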
    def update_stale_records(self, dbparams):
        if not self.load_ror_data():
            self.logger.error("ROR data could not be fetched from remote")
            return
        if not self.ror_data:
            self.logger.error("ROR data could not be loaded from the local JSON file")
            return
        super().update_stale_records(dbparams)
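

# ---------------------------------------------------------------------------
# Standalone sketch (not part of the harvester): a minimal walk of the Dryad
# v2 search pagination that _crawl() above relies on. The base URL and the
# response shape ("_embedded" -> "stash:datasets", "_links" -> "next") are
# inferred from the code above, so treat them as assumptions rather than
# documented guarantees of the Dryad API.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    base = "https://datadryad.org/api/v2"  # assumed production endpoint
    headers = {"accept": "application/json"}
    page = requests.get(base + "/search/", headers=headers,
                        params={"per_page": "100"}).json()
    seen = 0
    while True:
        seen += len(page["_embedded"]["stash:datasets"])
        next_link = page.get("_links", {}).get("next")
        if not next_link or seen >= 200:  # stop early; this is only a demo
            break
        # "next" hrefs start with /api/v2/..., so join them to the bare host
        page = requests.get(base.replace("/api/v2", "") + next_link["href"],
                            headers=headers).json()
    print("fetched {} dataset records".format(seen))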