forked from gijswobben/pymed
-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: add mindate, maxdate args in query function
- Loading branch information
Showing
4 changed files
with
183 additions
and
134 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
import itertools | ||
import xml.etree.ElementTree as xml | ||
|
||
from typing import Union | ||
from typing import Any, Dict, Iterable, List, Union, cast | ||
|
||
import requests | ||
|
||
|
@@ -14,54 +14,63 @@ | |
BASE_URL = "https://eutils.ncbi.nlm.nih.gov" | ||
|
||
|
||
class PubMed(object): | ||
class PubMed: | ||
"""Wrapper around the PubMed API.""" | ||
|
||
def __init__( | ||
self: object, | ||
self, | ||
tool: str = "my_tool", | ||
email: str = "[email protected]", | ||
) -> None: | ||
"""Initialization of the object. | ||
Parameters | ||
---------- | ||
Parameters: | ||
- tool String, name of the tool that is executing the query. | ||
This parameter is not required but kindly requested by | ||
PMC (PubMed Central). | ||
- email String, email of the user of the tool. This parameter | ||
is not required but kindly requested by PMC (PubMed Central). | ||
Returns | ||
------- | ||
Returns: | ||
- None | ||
""" | ||
|
||
# Store the input parameters | ||
self.tool = tool | ||
self.email = email | ||
|
||
# Keep track of the rate limit | ||
self._rateLimit = 3 | ||
self._requestsMade = [] | ||
|
||
self._rateLimit: int = 3 | ||
self._requestsMade: List[datetime.datetime] = [] | ||
self.parameters: Dict[str, Union[str, int, List[str]]] | ||
# Define the standard / default query parameters | ||
self.parameters = {"tool": tool, "email": email, "db": "pubmed"} | ||
|
||
def query(self: object, query: str, max_results: int = 100): | ||
def query( | ||
self, | ||
query: str, | ||
min_date: str, | ||
max_date: str, | ||
max_results: int = 100, | ||
) -> Iterable[Union[PubMedArticle, PubMedBookArticle]]: | ||
"""Method that executes a query agains the GraphQL schema, automatically | ||
inserting the PubMed data loader. | ||
Parameters | ||
---------- | ||
Parameters: | ||
- query String, the GraphQL query to execute against the schema. | ||
Returns | ||
------- | ||
Returns: | ||
- result ExecutionResult, GraphQL object that contains the result | ||
in the "data" attribute. | ||
""" | ||
|
||
# Retrieve the article IDs for the query | ||
article_ids = self._getArticleIds(query=query, max_results=max_results) | ||
article_ids = self._getArticleIds( | ||
query=query, | ||
min_date=min_date, | ||
max_date=max_date, | ||
max_results=max_results, | ||
) | ||
|
||
# Get the articles themselves | ||
articles = list( | ||
|
@@ -74,17 +83,16 @@ def query(self: object, query: str, max_results: int = 100): | |
# Chain the batches back together and return the list | ||
return itertools.chain.from_iterable(articles) | ||
|
||
def getTotalResultsCount(self: object, query: str) -> int: | ||
def getTotalResultsCount(self, query: str) -> int: | ||
"""Helper method that returns the total number of results that match the query. | ||
Parameters | ||
---------- | ||
Parameters: | ||
- query String, the query to send to PubMed | ||
Returns | ||
------- | ||
Returns: | ||
- total_results_count Int, total number of results for the query in PubMed | ||
""" | ||
|
||
# Get the default parameters | ||
parameters = self.parameters.copy() | ||
|
||
|
@@ -93,7 +101,7 @@ def getTotalResultsCount(self: object, query: str) -> int: | |
parameters["retmax"] = 1 | ||
|
||
# Make the request (request a single article ID for this search) | ||
response = self._get( | ||
response: requests.models.Response = self._get( | ||
url="/entrez/eutils/esearch.fcgi", parameters=parameters | ||
) | ||
|
||
|
@@ -108,10 +116,10 @@ def getTotalResultsCount(self: object, query: str) -> int: | |
def _exceededRateLimit(self) -> bool: | ||
"""Helper method to check if we've exceeded the rate limit. | ||
Returns | ||
------- | ||
Returns: | ||
- exceeded Bool, Whether or not the rate limit is exceeded. | ||
""" | ||
|
||
# Remove requests from the list that are longer than 1 second ago | ||
self._requestsMade = [ | ||
requestTime | ||
|
@@ -124,34 +132,37 @@ def _exceededRateLimit(self) -> bool: | |
return len(self._requestsMade) > self._rateLimit | ||
|
||
def _get( | ||
self: object, url: str, parameters: dict, output: str = "json" | ||
) -> Union[dict, str]: | ||
self, | ||
url: str, | ||
parameters: Dict[Any, Any] = dict(), | ||
output: str = "json", | ||
) -> Union[str, requests.models.Response]: | ||
"""Generic helper method that makes a request to PubMed. | ||
Parameters | ||
---------- | ||
Parameters: | ||
- url Str, last part of the URL that is requested (will | ||
be combined with the base url) | ||
- parameters Dict, parameters to use for the request | ||
- output Str, type of output that is requested (defaults to | ||
JSON but can be used to retrieve XML) | ||
Returns | ||
------- | ||
Returns: | ||
- response Dict / str, if the response is valid JSON it will | ||
be parsed before returning, otherwise a string is | ||
returend | ||
""" | ||
|
||
# Make sure the rate limit is not exceeded | ||
while self._exceededRateLimit(): | ||
pass | ||
|
||
# Set the response mode | ||
parameters["retmode"] = output | ||
|
||
if parameters: | ||
parameters["retmode"] = output | ||
|
||
# Make the request to PubMed | ||
response = requests.get(f"{BASE_URL}{url}", params=parameters) | ||
|
||
# Check for any errors | ||
response.raise_for_status() | ||
|
||
|
@@ -164,17 +175,18 @@ def _get( | |
else: | ||
return response.text | ||
|
||
def _getArticles(self: object, article_ids: list) -> list: | ||
def _getArticles( | ||
self, article_ids: List[str] | ||
) -> Iterable[Union[PubMedArticle, PubMedBookArticle]]: | ||
"""Helper method that batches a list of article IDs and retrieves the content. | ||
Parameters | ||
---------- | ||
Parameters: | ||
- article_ids List, article IDs. | ||
Returns | ||
------- | ||
Returns: | ||
- articles List, article objects. | ||
""" | ||
|
||
# Get the default parameters | ||
parameters = self.parameters.copy() | ||
parameters["id"] = article_ids | ||
|
@@ -195,18 +207,23 @@ def _getArticles(self: object, article_ids: list) -> list: | |
for book in root.iter("PubmedBookArticle"): | ||
yield PubMedBookArticle(xml_element=book) | ||
|
||
def _getArticleIds(self: object, query: str, max_results: int) -> list: | ||
def _getArticleIds( | ||
self, | ||
query: str, | ||
min_date: str, | ||
max_date: str, | ||
max_results: int, | ||
) -> List[str]: | ||
"""Helper method to retrieve the article IDs for a query. | ||
Parameters | ||
---------- | ||
Parameters: | ||
- query Str, query to be executed against the PubMed database. | ||
- max_results Int, the maximum number of results to retrieve. | ||
Returns | ||
------- | ||
Returns: | ||
- article_ids List, article IDs as a list. | ||
""" | ||
|
||
# Create a placeholder for the retrieved IDs | ||
article_ids = [] | ||
|
||
|
@@ -216,16 +233,29 @@ def _getArticleIds(self: object, query: str, max_results: int) -> list: | |
# Add specific query parameters | ||
parameters["term"] = query | ||
parameters["retmax"] = 50000 | ||
parameters["datetype"] = "edat" | ||
parameters["mindate"] = min_date | ||
parameters["maxdate"] = max_date | ||
|
||
retmax: int = cast(int, parameters["retmax"]) | ||
# Calculate a cut off point based on the max_results parameter | ||
if max_results < parameters["retmax"]: | ||
if max_results < retmax: | ||
parameters["retmax"] = max_results | ||
|
||
# Make the first request to PubMed | ||
response = self._get( | ||
url="/entrez/eutils/esearch.fcgi", parameters=parameters | ||
new_url = ( | ||
"/entrez/eutils/esearch.fcgi?" | ||
f"db={parameters['db']}&" | ||
f"term={parameters['term']}&" | ||
f"retmax={parameters['retmax']}&" | ||
f"datetype={parameters['datetype']}&" | ||
f"mindate={parameters['mindate']}&" | ||
f"maxdate={parameters['maxdate']}&" | ||
f"retmode=json" | ||
) | ||
|
||
# Make the first request to PubMed | ||
response: requests.models.Response = self._get(url=new_url) | ||
|
||
# Add the retrieved IDs to the list | ||
article_ids += response.get("esearchresult", {}).get("idlist", []) | ||
|
||
|
@@ -245,7 +275,9 @@ def _getArticleIds(self: object, query: str, max_results: int) -> list: | |
and retrieved_count < max_results | ||
): | ||
# Calculate a cut off point based on the max_results parameter | ||
if (max_results - retrieved_count) < parameters["retmax"]: | ||
if (max_results - retrieved_count) < cast( | ||
int, parameters["retmax"] | ||
): | ||
parameters["retmax"] = max_results - retrieved_count | ||
|
||
# Start the collection from the number of already retrieved articles | ||
|
Oops, something went wrong.