Skip to content

Commit

Permalink
Add ability to load PDF from HTTP + specify file system from string (…
Browse files Browse the repository at this point in the history
…useful for config files) (#230)

* Fix PdfLoader, add ability to specify file system from string

* Make PdfLoader work with config files
  • Loading branch information
stellasia authored Dec 30, 2024
1 parent f8092fc commit 39fd4f7
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 12 deletions.
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ are listed in [the last section of this file](#customize).
- [End to end PDF to graph simple pipeline](build_graph/simple_kg_builder_from_pdf.py)
- [End to end text to graph simple pipeline](build_graph/simple_kg_builder_from_text.py)
- [Build KG pipeline from config file](build_graph/from_config_files/simple_kg_pipeline_from_config_file.py)
- [Build KG pipeline with PDF URL](build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py)


## Retrieve
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"version_": "1",
"template_": "SimpleKGPipeline",
"neo4j_config": {
"params_": {
"uri": {
"resolver_": "ENV",
"var_": "NEO4J_URI"
},
"user": {
"resolver_": "ENV",
"var_": "NEO4J_USER"
},
"password": {
"resolver_": "ENV",
"var_": "NEO4J_PASSWORD"
}
}
},
"llm_config": {
"class_": "OpenAILLM",
"params_": {
"api_key": {
"resolver_": "ENV",
"var_": "OPENAI_API_KEY"
},
"model_name": "gpt-4o",
"model_params": {
"temperature": 0,
"max_tokens": 2000,
"response_format": {"type": "json_object"}
}
}
},
"embedder_config": {
"class_": "OpenAIEmbeddings",
"params_": {
"api_key": {
"resolver_": "ENV",
"var_": "OPENAI_API_KEY"
}
}
},
"from_pdf": true,
"entities": [
"Person",
{
"label": "House",
"description": "Family the person belongs to",
"properties": [
{
"name": "name",
"type": "STRING"
}
]
},
{
"label": "Planet",
"properties": [
{
"name": "name",
"type": "STRING"
},
{
"name": "weather",
"type": "STRING"
}
]
}
],
"relations": [
"PARENT_OF",
{
"label": "HEIR_OF",
"description": "Used for inheritor relationship between father and sons"
},
{
"label": "RULES",
"properties": [
{
"name": "fromYear",
"type": "INTEGER"
}
]
}
],
"potential_schema": [
[
"Person",
"PARENT_OF",
"Person"
],
[
"Person",
"HEIR_OF",
"House"
],
[
"House",
"RULES",
"Planet"
]
],
"text_splitter": {
"class_": "text_splitters.fixed_size_splitter.FixedSizeSplitter",
"params_": {
"chunk_size": 100,
"chunk_overlap": 10
}
},
"pdf_loader": {
"class_": "pdf_loader.PdfLoader",
"run_params_": {
"fs": "http"
}
},
"perform_entity_resolution": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""In this example, the pipeline is defined in a JSON file
('simple_kg_pipeline_config_url.json') and the PDF to process is given as a URL
instead of a local file path.
According to the configuration file, some parameters will be read from the env vars
(Neo4j credentials and the OpenAI API key).
"""

import asyncio
import logging

## If env vars are in a .env file, uncomment:
## (requires pip install python-dotenv)
# from dotenv import load_dotenv
# load_dotenv()
# env vars manually set for testing:
import os
from pathlib import Path

from neo4j_graphrag.experimental.pipeline.config.runner import PipelineRunner
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult

# Show debug-level log records emitted by the neo4j_graphrag package.
logging.basicConfig()
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)

# Neo4j connection settings, set here for testing; the configuration file
# reads them back from the environment.
os.environ.update(
    {
        "NEO4J_URI": "bolt://localhost:7687",
        "NEO4J_USER": "neo4j",
        "NEO4J_PASSWORD": "password",
    }
)
# os.environ["OPENAI_API_KEY"] = "sk-..."


# The pipeline configuration file sits next to this script.
root_dir = Path(__file__).parent
file_path = root_dir.joinpath("simple_kg_pipeline_config_url.json")


# File to process
URL = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf"


async def main() -> PipelineResult:
    """Build the pipeline from the config file and run it on the PDF at ``URL``.

    Returns:
        The result returned by the pipeline run.
    """
    runner = PipelineRunner.from_config_file(file_path)
    # The URL is passed where a file path is expected.
    run_inputs = {"file_path": URL}
    result = await runner.run(run_inputs)
    return result


if __name__ == "__main__":
    pipeline_result = asyncio.run(main())
    print(pipeline_result)
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Use the PdfLoader component to extract text from a remote PDF file."""

import asyncio

from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader

url = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf"


async def main() -> None:
    """Load the remote PDF at ``url`` and print the first 100 characters of its text."""
    pdf_loader = PdfLoader()
    # The file system is named by a string ("http") rather than passed as an object.
    pdf_document = await pdf_loader.run(filepath=url, fs="http")
    preview = pdf_document.text[:100]
    print(preview)


if __name__ == "__main__":
    asyncio.run(main())
18 changes: 10 additions & 8 deletions src/neo4j_graphrag/experimental/components/pdf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,10 @@ def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
class PdfLoader(DataLoader):
@staticmethod
def load_file(
file: Union[Path, str],
file: str,
fs: AbstractFileSystem,
) -> str:
"""Parse PDF file and return text."""
if not isinstance(file, Path):
file = Path(file)

try:
with fs.open(file, "rb") as fp:
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
Expand All @@ -85,16 +82,21 @@ def load_file(

async def run(
self,
filepath: Path,
filepath: Union[str, Path],
metadata: Optional[Dict[str, str]] = None,
fs: Optional[AbstractFileSystem] = None,
fs: Optional[Union[AbstractFileSystem, str]] = None,
) -> PdfDocument:
fs = fs or LocalFileSystem()
if not isinstance(filepath, str):
filepath = str(filepath)
if isinstance(fs, str):
fs = fsspec.filesystem(fs)
elif fs is None:
fs = LocalFileSystem()
text = self.load_file(filepath, fs)
return PdfDocument(
text=text,
document_info=DocumentInfo(
path=str(filepath),
path=filepath,
metadata=self.get_document_metadata(text, metadata),
),
)
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@ class ComponentConfig(ObjectConfig[Component]):
DEFAULT_MODULE = "neo4j_graphrag.experimental.components"
INTERFACE = Component

def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]:
    """Return this component's run parameters after resolution.

    ``resolved_data`` is stored on ``self._global_data`` first, then
    ``self.run_params_`` is passed through ``self.resolve_params``.

    Args:
        resolved_data: Data stored as ``self._global_data`` before resolving.

    Returns:
        Whatever ``self.resolve_params`` returns for ``self.run_params_``.
    """
    self._global_data = resolved_data
    resolved_run_params = self.resolve_params(self.run_params_)
    return resolved_run_params


class ComponentType(RootModel): # type: ignore[type-arg]
root: Union[Component, ComponentConfig]
Expand All @@ -264,3 +268,8 @@ def parse(self, resolved_data: dict[str, Any] | None = None) -> Component:
if isinstance(self.root, Component):
return self.root
return self.root.parse(resolved_data)

def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]:
    """Return the run parameters for the wrapped root object.

    A root that is already a ``Component`` instance yields an empty dict;
    otherwise the call is delegated to the root's own ``get_run_params``.

    Args:
        resolved_data: Data forwarded to the root's ``get_run_params``.

    Returns:
        An empty dict for a ``Component`` root, else the delegated result.
    """
    root = self.root
    return {} if isinstance(root, Component) else root.get_run_params(resolved_data)
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,23 @@ def _get_pdf_loader(self) -> Optional[PdfLoader]:
return self.pdf_loader.parse(self._global_data) # type: ignore
return PdfLoader()

def _get_run_params_for_pdf_loader(self) -> dict[str, Any]:
if not self.from_pdf:
return {}
if self.pdf_loader:
return self.pdf_loader.get_run_params(self._global_data)
return {}

def _get_splitter(self) -> TextSplitter:
if self.text_splitter:
return self.text_splitter.parse(self._global_data) # type: ignore
return FixedSizeSplitter()

def _get_run_params_for_splitter(self) -> dict[str, Any]:
if self.text_splitter:
return self.text_splitter.get_run_params(self._global_data)
return {}

def _get_chunk_embedder(self) -> TextChunkEmbedder:
return TextChunkEmbedder(embedder=self.get_default_embedder())

Expand Down Expand Up @@ -123,6 +135,11 @@ def _get_writer(self) -> KGWriter:
neo4j_database=self.neo4j_database,
)

def _get_run_params_for_writer(self) -> dict[str, Any]:
if self.kg_writer:
return self.kg_writer.get_run_params(self._global_data)
return {}

def _get_resolver(self) -> Optional[EntityResolver]:
if not self.perform_entity_resolution:
return None
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/experimental/components/test_pdf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,17 @@ def pdf_loader() -> PdfLoader:


@pytest.fixture
def dummy_pdf_path() -> Path:
return BASE_DIR / "sample_data/lorem_ipsum.pdf"
def dummy_pdf_path() -> str:
return str(BASE_DIR / "sample_data/lorem_ipsum.pdf")


def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
expected_content = "Lorem ipsum dolor sit amet."
actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
assert actual_content == expected_content


def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
with patch(
"fsspec.implementations.local.LocalFileSystem.open",
side_effect=Exception("Failed to open"),
Expand Down

0 comments on commit 39fd4f7

Please sign in to comment.