Skip to content

Commit

Permalink
Extract abstracts from Pubmed articles, even if they have no extra label (langchain-ai#10245)
Browse files Browse the repository at this point in the history

### Description
This pull request changes how abstracts/summaries are extracted in the
PubMed utility. Previously, only abstract sections that carried a label
(subtitle) were extracted; a fallback has been added so that an abstract
without labels is now extracted as well. In addition, abstract
extraction has been extended to books.

### Issue
The PubMed utility occasionally returns an empty result when extracting
abstracts from articles, despite the presence of an abstract for the
paper on PubMed. This issue arises due to the varying structure of
articles; some articles follow a "subtitle/label: text" format, while
others do not include subtitles in their abstracts. An example of the
latter case can be found at:
[https://pubmed.ncbi.nlm.nih.gov/37666905/](https://pubmed.ncbi.nlm.nih.gov/37666905/)

---------

Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
FlorianH5 and baskaryan authored Oct 6, 2023
1 parent fd9da60 commit d78f418
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 12 deletions.
31 changes: 23 additions & 8 deletions libs/langchain/langchain/utilities/pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,15 +158,30 @@ def retrieve_article(self, uid: str, webenv: str) -> dict:
return self._parse_article(uid, text_dict)

def _parse_article(self, uid: str, text_dict: dict) -> dict:
ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
"Article"
]
summary = "\n".join(
[
f"{txt['@Label']}: {txt['#text']}"
for txt in ar.get("Abstract", {}).get("AbstractText", [])
if "#text" in txt and "@Label" in txt
try:
ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
"Article"
]
except KeyError:
ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]
abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
summaries = [
f"{txt['@Label']}: {txt['#text']}"
for txt in abstract_text
if "#text" in txt and "@Label" in txt
]
summary = (
"\n".join(summaries)
if summaries
else (
abstract_text
if isinstance(abstract_text, str)
else (
"\n".join(str(value) for value in abstract_text.values())
if isinstance(abstract_text, dict)
else "No abstract available"
)
)
)
a_d = ar.get("ArticleDate", {})
pub_date = "-".join(
Expand Down
71 changes: 67 additions & 4 deletions libs/langchain/tests/integration_tests/utilities/test_pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,16 @@ def api_client() -> PubMedAPIWrapper:
def test_run_success(api_client: PubMedAPIWrapper) -> None:
"""Test that returns the correct answer"""

output = api_client.run("chatgpt")
assert "Performance of ChatGPT on the Situational Judgement Test-A" in output
search_string = (
"Examining the Validity of ChatGPT in Identifying "
"Relevant Nephrology Literature"
)
output = api_client.run(search_string)
test_string = (
"Examining the Validity of ChatGPT in Identifying "
"Relevant Nephrology Literature: Findings and Implications"
)
assert test_string in output
assert len(output) == api_client.doc_content_chars_max


Expand All @@ -32,6 +40,53 @@ def test_run_returns_no_result(api_client: PubMedAPIWrapper) -> None:
assert "No good PubMed Result was found" == output


def test_retrieve_article_returns_book_abstract(api_client: PubMedAPIWrapper) -> None:
"""Test that returns the excerpt of a book."""

output_nolabel = api_client.retrieve_article("25905357", "")
output_withlabel = api_client.retrieve_article("29262144", "")
test_string_nolabel = (
"Osteoporosis is a multifactorial disorder associated with low bone mass and "
"enhanced skeletal fragility. Although"
)
assert test_string_nolabel in output_nolabel["Summary"]
assert (
"Wallenberg syndrome was first described in 1808 by Gaspard Vieusseux. However,"
in output_withlabel["Summary"]
)


def test_retrieve_article_returns_article_abstract(
api_client: PubMedAPIWrapper,
) -> None:
"""Test that returns the abstract of an article."""

output_nolabel = api_client.retrieve_article("37666905", "")
output_withlabel = api_client.retrieve_article("37666551", "")
test_string_nolabel = (
"This work aims to: (1) Provide maximal hand force data on six different "
"grasp types for healthy subjects; (2) detect grasp types with maximal "
"force significantly affected by hand osteoarthritis (HOA) in women; (3) "
"look for predictors to detect HOA from the maximal forces using discriminant "
"analyses."
)
assert test_string_nolabel in output_nolabel["Summary"]
test_string_withlabel = (
"OBJECTIVES: To assess across seven hospitals from six different countries "
"the extent to which the COVID-19 pandemic affected the volumes of orthopaedic "
"hospital admissions and patient outcomes for non-COVID-19 patients admitted "
"for orthopaedic care."
)
assert test_string_withlabel in output_withlabel["Summary"]


def test_retrieve_article_no_abstract_available(api_client: PubMedAPIWrapper) -> None:
"""Test that returns 'No abstract available'."""

output = api_client.retrieve_article("10766884", "")
assert "No abstract available" == output["Summary"]


def assert_docs(docs: List[Document]) -> None:
for doc in docs:
assert doc.metadata
Expand Down Expand Up @@ -87,8 +142,16 @@ def _load_pubmed_from_universal_entry(**kwargs: Any) -> BaseTool:

def test_load_pupmed_from_universal_entry() -> None:
pubmed_tool = _load_pubmed_from_universal_entry()
output = pubmed_tool("chatgpt")
assert "Performance of ChatGPT on the Situational Judgement Test-A" in output
search_string = (
"Examining the Validity of ChatGPT in Identifying "
"Relevant Nephrology Literature"
)
output = pubmed_tool(search_string)
test_string = (
"Examining the Validity of ChatGPT in Identifying "
"Relevant Nephrology Literature: Findings and Implications"
)
assert test_string in output


def test_load_pupmed_from_universal_entry_with_params() -> None:
Expand Down

0 comments on commit d78f418

Please sign in to comment.