-
Notifications
You must be signed in to change notification settings - Fork 0
/
script.py
154 lines (124 loc) · 6.37 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import requests
from lxml import etree
from tinydb import TinyDB, Query
from pathlib import Path
# Function to get the API key
def get_api_key():
    """Return the API key stored in ``apiKey.txt`` next to this script.

    The file is read as UTF-8 and leading/trailing whitespace (such as a
    final newline) is stripped from its contents.
    """
    key_file = Path(__file__).resolve().parent / 'apiKey.txt'
    return key_file.read_text(encoding='utf-8').strip()
# Function to get the latest volume number
def get_latest_volume(issn, api_key):
    """Return the volume of the first Scopus search entry for *issn*.

    Sends one search request for ``ISSN(<issn>)`` and returns the text of
    the first ``prism:volume`` element found among the Atom entries of the
    response.

    Args:
        issn: ISSN inserted into the ``ISSN(...)`` search query.
        api_key: Value sent in the ``X-ELS-APIKey`` request header.

    Returns:
        The volume text, or ``None`` when no entry carries a volume.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-success HTTP status.
    """
    headers = {
        'Accept': 'application/xml',
        'X-ELS-APIKey': api_key,
    }
    # A timeout keeps the script from hanging forever on a stalled
    # connection; requests applies no timeout by default.
    response = requests.get(
        f"https://api.elsevier.com/content/search/scopus?query=ISSN({issn})",
        headers=headers,
        timeout=30,
    )
    # Fail loudly on HTTP errors (bad key, quota, ...) instead of parsing an
    # error document and reporting "no volumes".
    response.raise_for_status()
    root = etree.fromstring(response.content)

    # NOTE(review): this assumes the first entry returned is the most recent
    # one; no sort order is requested - confirm against the API defaults.
    for entry in root.xpath('//atom:entry', namespaces={'atom': 'http://www.w3.org/2005/Atom'}):
        volume = entry.find('{http://prismstandard.org/namespaces/basic/2.0/}volume')
        if volume is not None:
            return volume.text
    return None
# Function to get articles for a specific volume
# Clark-notation namespace prefixes used when reading the search response.
_ATOM = '{http://www.w3.org/2005/Atom}'
_DC = '{http://purl.org/dc/elements/1.1/}'
_PRISM = '{http://prismstandard.org/namespaces/basic/2.0/}'
_OPENSEARCH = '{http://a9.com/-/spec/opensearch/1.1/}'


def _child_text(node, tag):
    """Return the text of the first child *tag* of *node*, or None if absent."""
    child = node.find(tag)
    return child.text if child is not None else None


def get_articles_for_volume(issn, volume, api_key):
    """Fetch article metadata for one volume of a journal.

    Sends one search request for ``ISSN(<issn>) AND volume(<volume>)`` and
    converts every Atom entry of the response into a dict.

    Args:
        issn: ISSN inserted into the search query; also stored on each
            article under ``'issn'``.
        volume: Volume inserted into the search query; also stored on each
            article under ``'volume'``.
        api_key: Value sent in the ``X-ELS-APIKey`` request header.

    Returns:
        A list of article dicts with the keys ``title``, ``author``,
        ``issn``, ``publicationName``, ``doi``, ``volume``, ``cover_date``,
        ``display_date``, ``issue_identifier``, ``page_range``,
        ``open_access`` and ``affiliations``. Missing elements yield
        ``None``. The list is empty when the response reports zero results.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-success HTTP status.
    """
    headers = {
        'Accept': 'application/xml',
        'X-ELS-APIKey': api_key,
    }
    # A timeout keeps the script from hanging forever on a stalled
    # connection; requests applies no timeout by default.
    response = requests.get(
        f"https://api.elsevier.com/content/search/scopus?query=ISSN({issn})%20AND%20volume({volume})",
        headers=headers,
        timeout=30,
    )
    # Fail loudly on HTTP errors rather than treating an error document as
    # an empty volume.
    response.raise_for_status()
    root = etree.fromstring(response.content)

    # Check if the total results are zero
    total_results = root.find(f'{_OPENSEARCH}totalResults')
    if total_results is not None and int(total_results.text) == 0:
        print(f"Volume number {volume} is empty, skipping to the next one.")
        return []

    # NOTE(review): only the entries contained in this single response are
    # read; no paging parameters are sent. If the API pages its results,
    # larger volumes may be truncated - confirm.
    articles = []
    for entry in root.xpath('//atom:entry', namespaces={'atom': 'http://www.w3.org/2005/Atom'}):
        affiliations = entry.findall(f'{_ATOM}affiliation')
        # Key order is kept stable because it determines the layout of the
        # stored records.
        article = {
            'title': _child_text(entry, f'{_DC}title'),
            'author': _child_text(entry, f'{_DC}creator'),
            'issn': issn,
            'publicationName': _child_text(entry, f'{_PRISM}publicationName'),
            'doi': _child_text(entry, f'{_PRISM}doi'),
            'volume': volume,
            'cover_date': _child_text(entry, f'{_PRISM}coverDate'),
            'display_date': _child_text(entry, f'{_PRISM}coverDisplayDate'),
            'issue_identifier': _child_text(entry, f'{_PRISM}issueIdentifier'),
            'page_range': _child_text(entry, f'{_PRISM}pageRange'),
            'open_access': _child_text(entry, f'{_ATOM}openaccessFlag'),
            # An affiliation lacking a name or country element now yields
            # None for that field instead of raising AttributeError.
            'affiliations': [
                {
                    'name': _child_text(aff, f'{_ATOM}affilname'),
                    'country': _child_text(aff, f'{_ATOM}affiliation-country'),
                }
                for aff in affiliations
            ] if affiliations else None,
        }
        articles.append(article)
    return articles
# Function to save data to TinyDB
def save_to_tinydb(data, filename, existing_dois):
    """Insert or update article records in a TinyDB file beside this script.

    For every item that has a ``'doi'`` key, the database is searched for a
    document with the same DOI. When none exists the item is inserted and
    its DOI added to *existing_dois*. When one exists, it is updated with
    the item only if the stored document lacks at least one of the item's
    keys.

    Args:
        data: Iterable of article dicts.
        filename: Database file name, resolved relative to this script's
            directory.
        existing_dois: Set of known DOIs; mutated in place.

    Returns:
        The same *existing_dois* set, including newly inserted DOIs.
    """
    file_path = Path(__file__).resolve().parent / filename

    # The context manager closes the storage when done; this function is
    # called once per volume, so an unclosed handle would leak each time.
    with TinyDB(file_path, indent=4, ensure_ascii=False) as db:
        for item in data:
            if 'doi' not in item:
                continue

            # NOTE(review): items whose 'doi' value is None still reach this
            # point and are compared against each other by that None value -
            # confirm whether such items should be skipped or always kept.
            same_doi = Query().doi == item['doi']
            matches = db.search(same_doi)
            if not matches:
                db.insert(item)
                existing_dois.add(item['doi'])
                # .get() so a record without a title cannot abort the run
                # after it has already been inserted.
                print(f"Inserting in database: {item.get('title')}")
            elif not all(key in matches[0] for key in item):
                # The stored document is missing some fields: fill them in.
                db.update(item, same_doi)
                print(f"Entry updated: {item}")
    return existing_dois  # Return the updated set
# Function to get existing DOIs from TinyDB
def get_existing_dois(filename):
    """Return the set of DOIs stored in the TinyDB file beside this script.

    Args:
        filename: Database file name, resolved relative to this script's
            directory.

    Returns:
        A set holding the ``'doi'`` value of every stored document that has
        one. Documents without a ``'doi'`` key are ignored.
    """
    db_path = Path(__file__).resolve().parent / filename
    # The context manager closes the storage once the DOIs are collected.
    with TinyDB(db_path, indent=4, ensure_ascii=False) as db:
        return {record['doi'] for record in db.all() if 'doi' in record}
# Function to process all volumes
def process_volumes(issn, latest_volume, api_key, existing_dois, filename):
    """Fetch and store the articles of every volume from the latest down to 1.

    Prints a message and does nothing else when *latest_volume* is ``None``.
    Otherwise *latest_volume* is converted with ``int()`` and each volume
    number, counting down to 1, is fetched with ``get_articles_for_volume``
    and persisted with ``save_to_tinydb``.
    """
    if latest_volume is None:
        print("No volumes found for this ISSN.")
        return

    print(f"Latest volume: {latest_volume}")
    newest = int(latest_volume)
    for volume in reversed(range(1, newest + 1)):
        print(f"volume {volume}:")
        fetched = get_articles_for_volume(issn, volume, api_key)
        # Carry the DOI set returned by the save step into the next round.
        existing_dois = save_to_tinydb(fetched, filename, existing_dois)
# Main function to run the script
def main():
    """Run the harvest for the hard-coded ISSN and database file name."""
    # Both values are fixed here; they could instead be read with input()
    # prompts for the ISSN and the JSON file name.
    issn, filename = "1574-0137", "teste.json"

    known_dois = get_existing_dois(filename)
    key = get_api_key()
    newest_volume = get_latest_volume(issn, key)
    process_volumes(issn, newest_volume, key, known_dois, filename)
# Run the harvest only when executed as a script, not when imported.
if __name__ == "__main__":
    main()