From 4187789665219892000d9f6de467e6e5be75749c Mon Sep 17 00:00:00 2001 From: Pavel Zhelnov Date: Wed, 31 May 2023 18:45:05 -0600 Subject: [PATCH] Upload two python scripts for doi capture This is in reference to this issue: https://github.com/p1m-ortho/xs-sebzer/issues/9 I tested the elid parser, and it seems to work. However, I could not make the doi grabber work due to IP block (see screenshot in issue). Thus, I also cannot confirm that the code itself works correctly. --- sebzer-doi-grabber.py | 70 +++++++++++++++++++++++++++++++++++++++++++ sebzer-elid-parser.py | 52 ++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 sebzer-doi-grabber.py create mode 100644 sebzer-elid-parser.py diff --git a/sebzer-doi-grabber.py b/sebzer-doi-grabber.py new file mode 100644 index 0000000..9bd6fa7 --- /dev/null +++ b/sebzer-doi-grabber.py @@ -0,0 +1,70 @@ +# Credit: ChatGPT May 24 Version (https://help.openai.com/en/articles/6825453-chatgpt-release-notes). Free Research Preview. + +# Initial prompt: +# i have a list of ids that are integers. they fit to this url format: https://elibrary.ru/item.asp?id={id}. i need you to write a python script that iterates through this list of ids (read from newline-separated text file taken as a script argument), then uses beautifulsoup to retrieve innerhtml of the page and then parse it and retrieve a doi from that page. Dois on these urls are contained within this kind of element: `DOI: {DOI}`, which is unique on the page. + +# Final output, with modifications: + +# Certainly! I can help you with that. 
# Here's a Python script that accomplishes the task you described using BeautifulSoup:

import sys
import csv
import os
import requests
from bs4 import BeautifulSoup

# Seconds to wait for eLIBRARY.RU on a single request. requests applies no
# timeout unless one is passed, so without this a stalled connection would
# block the whole run indefinitely.
REQUEST_TIMEOUT = 30


def get_doi_from_id(id):
    """Fetch the eLIBRARY.RU item page for *id* and return its DOI.

    Args:
        id: eLIBRARY.RU item identifier, interpolated into the item URL.
            (The name shadows the builtin; it is kept so existing keyword
            callers keep working.)

    Returns:
        The DOI text found on the page, or None when the request fails,
        the server answers with a non-200 status, or no DOI element is
        found. A message is printed in every case.
    """
    url = f"https://elibrary.ru/item.asp?id={id}"

    # Send a GET request to the eLIBRARY.RU URL. A network failure is
    # reported and turned into "no DOI" so that one bad request does not
    # abort the run and discard the results collected so far.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Failed to retrieve URL for eLIBRARY.RU ID {id}: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve URL for eLIBRARY.RU ID {id}")
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the DOI element.
    # NOTE(review): this lookup was never verified against a live page (the
    # author was IP-blocked); it assumes a <td> whose only content is text
    # starting with "DOI: ", followed by <font> and <a> -- confirm.
    doi_element = soup.find('td', string=lambda t: t and t.startswith('DOI: '))

    # Walk td -> font -> a, tolerating a missing link at any step instead of
    # raising AttributeError on None.
    font_element = doi_element.find_next('font') if doi_element else None
    link_element = font_element.find_next('a') if font_element else None
    if link_element is None:
        print(f"No DOI found for eLIBRARY.RU ID {id}")
        return None

    # Extract the DOI value
    doi = link_element.text.strip()
    print(f"DOI found for eLIBRARY.RU ID {id}: {doi}")
    return doi


def _read_ids(filename):
    """Return the non-blank, whitespace-stripped lines of *filename*."""
    with open(filename, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return [line for line in stripped_lines if line]


def main():
    """Read IDs from the file named in argv[1] and write ID/DOI pairs to CSV.

    The CSV is written next to the input file, with the same base name and
    a .csv extension. Exits with status 1 when no input file is given.
    """
    # Read the input file from command line argument, e.g., an elibrary_ru.lst file that sebzer-elid-parser.py produces
    if len(sys.argv) < 2:
        print("Please provide the input file as a command line argument. If you used sebzer-elid-parser.py to parse a Sebzer-generated BibTeX file, this would usually be elibrary_ru.lst.")
        sys.exit(1)

    filename = sys.argv[1]
    output_filename = os.path.splitext(filename)[0] + '.csv'  # Generate output file name

    # Iterate through the eLIBRARY.RU IDs and get the DOIs
    results = [
        [elibrary_id, get_doi_from_id(elibrary_id)]
        for elibrary_id in _read_ids(filename)
    ]

    # Save the results to a CSV file
    with open(output_filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['elibrary_id', 'doi'])  # Write header row
        writer.writerows(results)

    print(f"Results saved to {output_filename}")


if __name__ == '__main__':
    main()

# To use this script, make sure you have the requests and beautifulsoup4 libraries installed. You can install them using pip install requests beautifulsoup4.
Save the script in a Python file (e.g., sebzer-doi-grabber.py), and create a text file (elibrary_ru.lst) with the newline-separated list of IDs you want to process, e.g., using a Sebzer-generated BibTeX file and the sebzer-elid-parser.py script. + +# Run the script with the following command: + +# python sebzer-doi-grabber.py elibrary_ru.lst + +# The script will iterate through the eLIBRARY.RU IDs, retrieve the HTML content from each URL, find the DOI element using BeautifulSoup, and extract the DOI value. It will then print the DOI for each eLIBRARY.RU ID, or a message if no DOI is found, and save all results to a CSV file named after the input file (e.g., elibrary_ru.csv). + +# Please note that this script assumes that the HTML structure of the page you mentioned remains the same. If there are any changes, the script may need modifications accordingly. diff --git a/sebzer-elid-parser.py b/sebzer-elid-parser.py new file mode 100644 index 0000000..61c2e5b --- /dev/null +++ b/sebzer-elid-parser.py @@ -0,0 +1,52 @@ +# Credit: ChatGPT May 24 Version (https://help.openai.com/en/articles/6825453-chatgpt-release-notes). Free Research Preview. + +# Initial prompt: +# i have a bibtex file, all elements of which contain the following part, in which %ID% is an integer id number i need to extract from this file: `url = {https://elibrary.ru/item.asp?id=%ID%}`. as a result, i want a newline-separated list of these id integers. write a python script. the script takes the input .bib file from command line argument and saves the resulting list as an .lst file with the same name as input and in the same dir. + +# Final output, with modifications: + +# Certainly! 
# Here's a Python script that takes a Sebzer-generated BibTeX file as a command line argument, extracts the eLIBRARY.RU IDs from the URLs, and saves them in a newline-separated list in a file with the same name but a different extension:

import os
import sys
import re

# Matches `url = {https://elibrary.ru/item.asp?id=<digits>}` and captures the
# digits. The dots are escaped so that only the literal host and page match.
_ELIBRARY_URL_PATTERN = re.compile(
    r'url\s*=\s*\{https://elibrary\.ru/item\.asp\?id=(\d+)\}'
)


def extract_ids_from_bibtex(file_path):
    """Return the eLIBRARY.RU IDs found in the BibTeX file at *file_path*.

    Args:
        file_path: Path to a Sebzer-generated BibTeX file.

    Returns:
        A list of ID strings, in order of appearance (duplicates are kept).
    """
    # The URL and ID are pure ASCII, so undecodable bytes elsewhere in the
    # file (e.g. Cyrillic titles in a legacy code page) are replaced rather
    # than allowed to abort the extraction.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as bib_file:
        bib_data = bib_file.read()

    # Extract the eLIBRARY.RU IDs using regular expressions
    return _ELIBRARY_URL_PATTERN.findall(bib_data)


def save_ids_to_file(file_path, ids):
    """Write *ids*, newline-separated, next to *file_path* as a .lst file.

    Args:
        file_path: Path of the input BibTeX file; the output path is this
            path with its extension (if any) replaced by ``.lst``.
        ids: Iterable of ID strings to write.
    """
    # os.path.splitext only looks at the final path component, so dots in
    # directory names (or a leading "./") cannot truncate the path.
    output_path = os.path.splitext(file_path)[0] + '.lst'

    # Save the eLIBRARY.RU IDs to the output file
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(ids))

    print(f"eLIBRARY.RU IDs extracted successfully. Saved in '{output_path}'.")


def main():
    """Extract IDs from the BibTeX file named in argv[1] and save them."""
    # Check if a file path is provided as a command line argument
    if len(sys.argv) != 2:
        print("Please provide a Sebzer-generated BibTeX file as a command line argument.")
        sys.exit(1)

    bib_file_path = sys.argv[1]

    # Extract the eLIBRARY.RU IDs from the Sebzer-generated BibTeX file
    ids = extract_ids_from_bibtex(bib_file_path)

    # Save the eLIBRARY.RU IDs to the output file
    save_ids_to_file(bib_file_path, ids)


if __name__ == '__main__':
    main()

# To use this script, run it from the command line, providing a Sebzer-generated BibTeX file, e.g., elibrary_ru.bib, as an argument:

# python sebzer-elid-parser.py elibrary_ru.bib

# This will extract the eLIBRARY IDs from the Sebzer-generated elibrary_ru.bib file and save them in a file named elibrary_ru.lst in the same directory. Only Python and its standard library are required; there is nothing extra to install.