Extract abstracts from Pubmed articles, even if they have no extra label (langchain-ai#10245)

### Description

This pull request modifies how abstracts/summaries are extracted in the PubMed utility. A condition has been added to check for unlabeled abstracts, so an abstract is now extracted even if it does not have a subtitle. In addition, the extraction of abstracts was extended to books.

### Issue

The PubMed utility occasionally returns an empty result when extracting abstracts from articles, even though the paper has an abstract on PubMed. This issue arises because the structure of articles varies: some abstracts follow a "subtitle/label: text" format, while others do not include subtitles. An example of the latter case can be found at: [https://pubmed.ncbi.nlm.nih.gov/37666905/](https://pubmed.ncbi.nlm.nih.gov/37666905/)

---------

Co-authored-by: Bagatur <[email protected]>
deepsense-ai · Oct 6, 2023 · d78f418 · d78f418
1 parent fd9da60
commit d78f418
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 12 deletions.
diff --git a/libs/langchain/langchain/utilities/pubmed.py b/libs/langchain/langchain/utilities/pubmed.py
@@ -158,15 +158,30 @@ def retrieve_article(self, uid: str, webenv: str) -> dict:
         return self._parse_article(uid, text_dict)
 
     def _parse_article(self, uid: str, text_dict: dict) -> dict:
-        ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
-            "Article"
-        ]
-        summary = "\n".join(
-            [
-                f"{txt['@Label']}: {txt['#text']}"
-                for txt in ar.get("Abstract", {}).get("AbstractText", [])
-                if "#text" in txt and "@Label" in txt
+        try:
+            ar = text_dict["PubmedArticleSet"]["PubmedArticle"]["MedlineCitation"][
+                "Article"
             ]
+        except KeyError:
+            ar = text_dict["PubmedArticleSet"]["PubmedBookArticle"]["BookDocument"]
+        abstract_text = ar.get("Abstract", {}).get("AbstractText", [])
+        summaries = [
+            f"{txt['@Label']}: {txt['#text']}"
+            for txt in abstract_text
+            if "#text" in txt and "@Label" in txt
+        ]
+        summary = (
+            "\n".join(summaries)
+            if summaries
+            else (
+                abstract_text
+                if isinstance(abstract_text, str)
+                else (
+                    "\n".join(str(value) for value in abstract_text.values())
+                    if isinstance(abstract_text, dict)
+                    else "No abstract available"
+                )
+            )
         )
         a_d = ar.get("ArticleDate", {})
         pub_date = "-".join(

diff --git a/libs/langchain/tests/integration_tests/utilities/test_pubmed.py b/libs/langchain/tests/integration_tests/utilities/test_pubmed.py
@@ -20,8 +20,16 @@ def api_client() -> PubMedAPIWrapper:
 def test_run_success(api_client: PubMedAPIWrapper) -> None:
     """Test that returns the correct answer"""
 
-    output = api_client.run("chatgpt")
-    assert "Performance of ChatGPT on the Situational Judgement Test-A" in output
+    search_string = (
+        "Examining the Validity of ChatGPT in Identifying "
+        "Relevant Nephrology Literature"
+    )
+    output = api_client.run(search_string)
+    test_string = (
+        "Examining the Validity of ChatGPT in Identifying "
+        "Relevant Nephrology Literature: Findings and Implications"
+    )
+    assert test_string in output
     assert len(output) == api_client.doc_content_chars_max
 
 
@@ -32,6 +40,53 @@ def test_run_returns_no_result(api_client: PubMedAPIWrapper) -> None:
     assert "No good PubMed Result was found" == output
 
 
+def test_retrieve_article_returns_book_abstract(api_client: PubMedAPIWrapper) -> None:
+    """Test that returns the excerpt of a book."""
+
+    output_nolabel = api_client.retrieve_article("25905357", "")
+    output_withlabel = api_client.retrieve_article("29262144", "")
+    test_string_nolabel = (
+        "Osteoporosis is a multifactorial disorder associated with low bone mass and "
+        "enhanced skeletal fragility. Although"
+    )
+    assert test_string_nolabel in output_nolabel["Summary"]
+    assert (
+        "Wallenberg syndrome was first described in 1808 by Gaspard Vieusseux. However,"
+        in output_withlabel["Summary"]
+    )
+
+
+def test_retrieve_article_returns_article_abstract(
+    api_client: PubMedAPIWrapper,
+) -> None:
+    """Test that returns the abstract of an article."""
+
+    output_nolabel = api_client.retrieve_article("37666905", "")
+    output_withlabel = api_client.retrieve_article("37666551", "")
+    test_string_nolabel = (
+        "This work aims to: (1) Provide maximal hand force data on six different "
+        "grasp types for healthy subjects; (2) detect grasp types with maximal "
+        "force significantly affected by hand osteoarthritis (HOA) in women; (3) "
+        "look for predictors to detect HOA from the maximal forces using discriminant "
+        "analyses."
+    )
+    assert test_string_nolabel in output_nolabel["Summary"]
+    test_string_withlabel = (
+        "OBJECTIVES: To assess across seven hospitals from six different countries "
+        "the extent to which the COVID-19 pandemic affected the volumes of orthopaedic "
+        "hospital admissions and patient outcomes for non-COVID-19 patients admitted "
+        "for orthopaedic care."
+    )
+    assert test_string_withlabel in output_withlabel["Summary"]
+
+
+def test_retrieve_article_no_abstract_available(api_client: PubMedAPIWrapper) -> None:
+    """Test that returns 'No abstract available'."""
+
+    output = api_client.retrieve_article("10766884", "")
+    assert "No abstract available" == output["Summary"]
+
+
 def assert_docs(docs: List[Document]) -> None:
     for doc in docs:
         assert doc.metadata
@@ -87,8 +142,16 @@ def _load_pubmed_from_universal_entry(**kwargs: Any) -> BaseTool:
 
 def test_load_pupmed_from_universal_entry() -> None:
     pubmed_tool = _load_pubmed_from_universal_entry()
-    output = pubmed_tool("chatgpt")
-    assert "Performance of ChatGPT on the Situational Judgement Test-A" in output
+    search_string = (
+        "Examining the Validity of ChatGPT in Identifying "
+        "Relevant Nephrology Literature"
+    )
+    output = pubmed_tool(search_string)
+    test_string = (
+        "Examining the Validity of ChatGPT in Identifying "
+        "Relevant Nephrology Literature: Findings and Implications"
+    )
+    assert test_string in output
 
 
 def test_load_pupmed_from_universal_entry_with_params() -> None: