-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscopus-api-mining.py
54 lines (45 loc) · 2.18 KB
/
scopus-api-mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import requests
import pymongo
from bson.binary import Binary
# Client created with pymongo's default connection settings (no URI given).
client = pymongo.MongoClient()
# Handle to the "career_project_dreu" database on that client.
mydb = client.get_database("career_project_dreu")
# Handle to the "publications" collection; store_full_text() updates it.
mycol = mydb.get_collection("publications")
# Download full text locally
def download_full_text(doi, api_key, output_format='xml', timeout=30):
    """Download an article's full text from the Elsevier API to a local file.

    Requests the article identified by *doi*. On an HTTP 200 response whose
    content type starts with ``text/``, the decoded body is written (UTF-8)
    to a file in the current working directory. The file is named after the
    DOI with ``/`` replaced by ``_`` and ends in ``.txt`` when
    *output_format* is ``"plain"``, otherwise ``.xml``. The outcome, success
    or failure, is reported with ``print``; nothing is returned.

    Args:
        doi: DOI of the article, e.g. ``'10.1016/j.cell.2005.01.027'``.
        api_key: API key, sent as the ``apiKey`` query parameter.
        output_format: Text subtype requested through the ``httpAccept``
            query parameter (``text/<output_format>``).
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    url = f'https://api.elsevier.com/content/article/doi/{doi}'
    # Let requests build and percent-encode the query string; for ordinary
    # values this yields the same "?apiKey=...&httpAccept=text%2F..." URL.
    query = {'apiKey': api_key, 'httpAccept': f'text/{output_format}'}
    # Without a timeout, requests would wait indefinitely on a stalled server.
    response = requests.get(url, params=query, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Failed to retrieve full text. Status code: {response.status_code}")
        return

    # .get() so a response lacking the header is reported, not a KeyError.
    content_type = response.headers.get('Content-Type', '')
    if not content_type.startswith('text/'):
        print("Error: Unexpected content type received.")
        return

    extension = 'txt' if output_format == "plain" else 'xml'
    # "/" is a path separator, so it cannot appear in the file name.
    safe_doi = doi.replace("/", "_")
    filename = f'{safe_doi}.{extension}'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(response.text)
    print(f"Full text downloaded and saved as '{filename}'")
# Store full text in database
def store_full_text(doi, api_key, output_format='xml', timeout=30):
    """Fetch an article's full text from the Elsevier API and store it in MongoDB.

    Requests the article identified by *doi*. On an HTTP 200 response whose
    content type starts with ``text/``, the raw response bytes are wrapped
    in a BSON ``Binary`` and ``$set`` on a document of the module-level
    ``mycol`` collection whose ``DOI`` field equals *doi*. The field written
    is ``plain_text`` when *output_format* is ``"plain"``, otherwise
    ``xml_text``. No document is created when none matches. The outcome is
    reported with ``print``; nothing is returned.

    Args:
        doi: DOI of the article; also the lookup key in the collection.
        api_key: API key, sent as the ``apiKey`` query parameter.
        output_format: Text subtype requested through the ``httpAccept``
            query parameter (``text/<output_format>``).
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    url = f'https://api.elsevier.com/content/article/doi/{doi}'
    # Let requests build and percent-encode the query string; for ordinary
    # values this yields the same "?apiKey=...&httpAccept=text%2F..." URL.
    query = {'apiKey': api_key, 'httpAccept': f'text/{output_format}'}
    # Without a timeout, requests would wait indefinitely on a stalled server.
    response = requests.get(url, params=query, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Failed to retrieve full text. Status code: {response.status_code}")
        return

    # .get() so a response lacking the header is reported, not a KeyError.
    content_type = response.headers.get('Content-Type', '')
    if not content_type.startswith('text/'):
        print("Error: Unexpected content type received.")
        return

    field = 'plain_text' if output_format == "plain" else 'xml_text'
    result = mycol.update_one(
        {'DOI': doi},
        {'$set': {field: Binary(response.content)}},
    )
    # update_one does not upsert here: with no matching document nothing is
    # written, so do not claim success in that case.
    if result.matched_count == 0:
        print(f"Error: No document with DOI '{doi}' found in MongoDB; full text not stored.")
        return
    print("Full text stored in MongoDB")
# Testing
# Guarded so the network and database calls run only when the file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    import os

    doi = '10.1016/j.cell.2005.01.027'
    # NOTE(review): an API key is committed here in plain text. Prefer setting
    # ELSEVIER_API_KEY in the environment; the literal remains only as a
    # fallback so existing runs keep working, and should be rotated and removed.
    api_key = os.environ.get('ELSEVIER_API_KEY', '11852fb061166663a048c15071f4d873')
    store_full_text(doi, api_key, output_format='xml')
    store_full_text(doi, api_key, output_format='plain')