Merge pull request ucbepic#76 from ucbepic/staging
Merging staging to main (from add gpt_pdf)
shreyashankar authored Oct 5, 2024
2 parents 650f80b + efc4291 commit afdf917
Showing 2 changed files with 70 additions and 3 deletions.
55 changes: 52 additions & 3 deletions docetl/parsing_tools.py
@@ -1,9 +1,8 @@
import importlib
import io
import os
from typing import List, Optional
from typing import Dict, List, Optional

from litellm import transcription

def llama_index_simple_directory_reader(filename: str) -> List[str]:
from llama_index.core import SimpleDirectoryReader
@@ -12,6 +11,7 @@ def llama_index_simple_directory_reader(filename: str) -> List[str]:
# FIXME: What about doc.metadata? Would be good to include that too...
return [doc.text for doc in documents]


def llama_index_wikipedia_reader(filename: str) -> List[str]:
from llama_index.readers.wikipedia import WikipediaReader

@@ -21,10 +21,11 @@ def llama_index_wikipedia_reader(filename: str) -> List[str]:
# The wikipedia reader does not include the page url in the metadata, which is impractical...
for name, doc in zip(pages, documents):
doc.metadata["source"] = "https://en.wikipedia.org/wiki/" + name

# FIXME: What about doc.metadata? Would be good to include that too...
return [doc.text for doc in documents]


def whisper_speech_to_text(filename: str) -> List[str]:
"""
Transcribe speech from an audio file to text using Whisper model via litellm.
@@ -37,6 +38,7 @@ def whisper_speech_to_text(filename: str) -> List[str]:
List[str]: Transcribed text.
"""
import os
from litellm import transcription

file_size = os.path.getsize(filename)
if file_size > 25 * 1024 * 1024: # 25 MB in bytes
@@ -397,6 +399,53 @@ def paddleocr_pdf_to_string(
return pdf_content


def gptpdf_to_string(
input_path: str,
gpt_model: str,
api_key: str,
base_url: str,
verbose: bool = False,
custom_prompt: Optional[Dict[str, str]] = None,
) -> List[str]:
"""
    Parse a PDF with a GPT model (via the gptpdf package) and return its content converted to Markdown.
**Note: pip install gptpdf required**
Args:
input_path (str): Path to the input PDF file.
gpt_model (str): GPT model to be used for parsing.
api_key (str): API key for GPT service.
base_url (str): Base URL for the GPT service.
verbose (bool): If True, will print additional information during parsing.
custom_prompt (Optional[Dict[str, str]]): Custom prompt for the GPT model. See https://github.com/CosmosShadow/gptpdf for more information.
Returns:
        List[str]: Extracted content as a single-element list of Markdown text.
"""
from gptpdf import parse_pdf

import tempfile

with tempfile.TemporaryDirectory() as temp_dir:
kwargs = {
"pdf_path": input_path,
"output_dir": temp_dir,
"api_key": api_key,
"base_url": base_url,
"model": gpt_model,
"verbose": verbose,
}
if custom_prompt:
kwargs["prompt"] = custom_prompt

parsed_content, _ = parse_pdf(
**kwargs
) # The second element is a list of image paths, which we don't need.

return [parsed_content]


# Define a dictionary mapping function names to their corresponding functions


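For reference, a minimal sketch of how the new parser might be invoked from user code. It reuses the model, file, and base URL from the test added below; the OPENAI_API_KEY environment variable and the shape of the custom_prompt dict (documented in the gptpdf README) are assumptions, not something this diff pins down.

import os

from docetl import parsing_tools

# Hypothetical invocation; adjust the model, key, and base_url for your setup.
chunks = parsing_tools.gptpdf_to_string(
    input_path="tests/data/PublicWaterMassMailing.pdf",
    gpt_model="gpt-4o-mini",
    api_key=os.environ["OPENAI_API_KEY"],
    base_url="https://api.openai.com/v1",
    verbose=False,
    # custom_prompt, if provided, is forwarded to gptpdf's parse_pdf as its
    # prompt argument; see the gptpdf README for the keys it accepts.
    custom_prompt=None,
)
print(chunks[0])  # Markdown extracted from the PDF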
18 changes: 18 additions & 0 deletions tests/test_parsing_tools.py
@@ -213,3 +213,21 @@ def test_paddleocr_pdf_to_string():
assert len(result) == 1

assert "have received the new bottles, please discard" in result[0]


# Note: this test makes live OpenAI API calls and requires OPENAI_API_KEY to be set.


def test_gptpdf_to_string():
input_pdf = "tests/data/PublicWaterMassMailing.pdf"

result = parsing_tools.gptpdf_to_string(
input_path=input_pdf,
gpt_model="gpt-4o-mini",
api_key=os.environ["OPENAI_API_KEY"],
base_url="https://api.openai.com/v1",
verbose=False,
)

    assert len(result) > 0, "The result should contain at least one parsed element."
    assert len(result[0]) > 0, "The extracted content should not be empty."
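Because this test hits a paid external API, a common guard (not part of this PR) is to skip it when no key is configured. A sketch, assuming the suite runs under pytest:

import os

import pytest

@pytest.mark.skipif(
    "OPENAI_API_KEY" not in os.environ,
    reason="gptpdf parsing makes live OpenAI API calls",
)
def test_gptpdf_to_string():
    ...  # body as above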
