Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to load PDF from HTTP + specify file system from string (useful for config files) #230

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e9712a9
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 15, 2024
b52c45e
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 16, 2024
84c1780
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 17, 2024
47d4782
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 21, 2024
bc7a2f9
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 22, 2024
a945284
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 22, 2024
4e13c23
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 23, 2024
5367bed
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 24, 2024
21d1223
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 25, 2024
3329cd7
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 25, 2024
d8f6364
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Oct 28, 2024
4cec2f3
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Nov 4, 2024
4445b49
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Nov 5, 2024
939b18c
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Nov 18, 2024
1104519
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Nov 22, 2024
1893b85
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Nov 25, 2024
6e4ebda
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Nov 28, 2024
8db7f01
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python
stellasia Dec 9, 2024
33e0f41
Fix PdfLoader, add ability to specify file system from string
stellasia Dec 10, 2024
621f4b9
Merge branch 'main' of https://github.com/neo4j/neo4j-graphrag-python…
stellasia Dec 12, 2024
19c9e06
Make PdfLoader work with config files
stellasia Dec 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ are listed in [the last section of this file](#customize).
- [End to end PDF to graph simple pipeline](build_graph/simple_kg_builder_from_pdf.py)
- [End to end text to graph simple pipeline](build_graph/simple_kg_builder_from_text.py)
- [Build KG pipeline from config file](build_graph/from_config_files/simple_kg_pipeline_from_config_file.py)
- [Build KG pipeline with PDF URL](build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py)


## Retrieve
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"version_": "1",
"template_": "SimpleKGPipeline",
"neo4j_config": {
"params_": {
"uri": {
"resolver_": "ENV",
"var_": "NEO4J_URI"
},
"user": {
"resolver_": "ENV",
"var_": "NEO4J_USER"
},
"password": {
"resolver_": "ENV",
"var_": "NEO4J_PASSWORD"
}
}
},
"llm_config": {
"class_": "OpenAILLM",
"params_": {
"api_key": {
"resolver_": "ENV",
"var_": "OPENAI_API_KEY"
},
"model_name": "gpt-4o",
"model_params": {
"temperature": 0,
"max_tokens": 2000,
"response_format": {"type": "json_object"}
}
}
},
"embedder_config": {
"class_": "OpenAIEmbeddings",
"params_": {
"api_key": {
"resolver_": "ENV",
"var_": "OPENAI_API_KEY"
}
}
},
"from_pdf": true,
"entities": [
"Person",
{
"label": "House",
"description": "Family the person belongs to",
"properties": [
{
"name": "name",
"type": "STRING"
}
]
},
{
"label": "Planet",
"properties": [
{
"name": "name",
"type": "STRING"
},
{
"name": "weather",
"type": "STRING"
}
]
}
],
"relations": [
"PARENT_OF",
{
"label": "HEIR_OF",
"description": "Used for inheritor relationship between father and sons"
},
{
"label": "RULES",
"properties": [
{
"name": "fromYear",
"type": "INTEGER"
}
]
}
],
"potential_schema": [
[
"Person",
"PARENT_OF",
"Person"
],
[
"Person",
"HEIR_OF",
"House"
],
[
"House",
"RULES",
"Planet"
]
],
"text_splitter": {
"class_": "text_splitters.fixed_size_splitter.FixedSizeSplitter",
"params_": {
"chunk_size": 100,
"chunk_overlap": 10
}
},
"pdf_loader": {
"class_": "pdf_loader.PdfLoader",
"run_params_": {
"fs": "http"
}
},
"perform_entity_resolution": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""In this example, the pipeline is defined in a JSON ('simple_kg_pipeline_config.json')
or YAML ('simple_kg_pipeline_config.yaml') file.

According to the configuration file, some parameters will be read from the env vars
(Neo4j credentials and the OpenAI API key).
"""

import asyncio
import logging

## If env vars are in a .env file, uncomment:
## (requires pip install python-dotenv)
# from dotenv import load_dotenv
# load_dotenv()
# env vars manually set for testing:
import os
from pathlib import Path

from neo4j_graphrag.experimental.pipeline.config.runner import PipelineRunner
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult

# Show debug-level output from the neo4j_graphrag package.
logging.basicConfig()
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG)

# Environment variables set manually for testing. These assignments overwrite
# any value already present in the environment; adjust them for your setup.
os.environ.update(
    {
        "NEO4J_URI": "bolt://localhost:7687",
        "NEO4J_USER": "neo4j",
        "NEO4J_PASSWORD": "password",
    }
)
# os.environ["OPENAI_API_KEY"] = "sk-..."


# Pipeline configuration file, looked up in the directory of this script.
root_dir = Path(__file__).parent
file_path = root_dir.joinpath("simple_kg_pipeline_config_url.json")


# File to process (URL pinned to a specific commit of the repository)
URL = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf"


async def main() -> PipelineResult:
    """Build the pipeline from the config file and run it on the PDF at ``URL``.

    Returns:
        The result of awaiting the runner's ``run`` call.
    """
    runner = PipelineRunner.from_config_file(file_path)
    # The document location is passed as the "file_path" run input.
    result = await runner.run({"file_path": URL})
    return result


if __name__ == "__main__":
    outcome = asyncio.run(main())
    print(outcome)
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Use the PdfLoader component to extract text from a remote PDF file."""

import asyncio

from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader

url = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf"


async def main() -> None:
    """Load the PDF at ``url`` with PdfLoader and print the start of its text."""
    pdf_loader = PdfLoader()
    # fs="http" names the file system by string instead of passing an instance.
    pdf_document = await pdf_loader.run(filepath=url, fs="http")
    preview = pdf_document.text[:100]
    print(preview)


if __name__ == "__main__":
    asyncio.run(main())
18 changes: 10 additions & 8 deletions src/neo4j_graphrag/experimental/components/pdf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,10 @@ def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
class PdfLoader(DataLoader):
@staticmethod
def load_file(
file: Union[Path, str],
file: str,
fs: AbstractFileSystem,
) -> str:
"""Parse PDF file and return text."""
if not isinstance(file, Path):
file = Path(file)

try:
with fs.open(file, "rb") as fp:
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
Expand All @@ -85,16 +82,21 @@ def load_file(

async def run(
self,
filepath: Path,
filepath: Union[str, Path],
metadata: Optional[Dict[str, str]] = None,
fs: Optional[AbstractFileSystem] = None,
fs: Optional[Union[AbstractFileSystem, str]] = None,
) -> PdfDocument:
fs = fs or LocalFileSystem()
if not isinstance(filepath, str):
filepath = str(filepath)
if isinstance(fs, str):
alexthomas93 marked this conversation as resolved.
Show resolved Hide resolved
fs = fsspec.filesystem(fs)
elif fs is None:
fs = LocalFileSystem()
text = self.load_file(filepath, fs)
return PdfDocument(
text=text,
document_info=DocumentInfo(
path=str(filepath),
path=filepath,
metadata=self.get_document_metadata(text, metadata),
),
)
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,10 @@ class ComponentConfig(ObjectConfig[Component]):
DEFAULT_MODULE = "neo4j_graphrag.experimental.components"
INTERFACE = Component

def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]:
    """Return this component's run parameters after resolution.

    Args:
        resolved_data: Stored on the instance as ``_global_data`` before the
            run parameters are resolved.

    Returns:
        The result of ``resolve_params`` applied to ``run_params_``.
    """
    # Must be assigned first: resolution happens after the global data is set.
    self._global_data = resolved_data
    resolved_run_params = self.resolve_params(self.run_params_)
    return resolved_run_params


class ComponentType(RootModel): # type: ignore[type-arg]
alexthomas93 marked this conversation as resolved.
Show resolved Hide resolved
root: Union[Component, ComponentConfig]
Expand All @@ -264,3 +268,8 @@ def parse(self, resolved_data: dict[str, Any] | None = None) -> Component:
if isinstance(self.root, Component):
return self.root
return self.root.parse(resolved_data)

def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]:
    """Return run parameters for the wrapped root object.

    Args:
        resolved_data: Forwarded to the root's ``get_run_params`` when the
            root is not already a ``Component`` instance.

    Returns:
        An empty dict when the root is a ``Component`` instance; otherwise
        whatever the root's ``get_run_params`` returns.
    """
    component_or_config = self.root
    if not isinstance(component_or_config, Component):
        return component_or_config.get_run_params(resolved_data)
    # A Component instance yields no run parameters here.
    return {}
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,23 @@ def _get_pdf_loader(self) -> Optional[PdfLoader]:
return self.pdf_loader.parse(self._global_data) # type: ignore
return PdfLoader()

def _get_run_params_for_pdf_loader(self) -> dict[str, Any]:
if not self.from_pdf:
return {}
if self.pdf_loader:
return self.pdf_loader.get_run_params(self._global_data)
return {}

def _get_splitter(self) -> TextSplitter:
    """Return the text splitter for the pipeline.

    Returns:
        A default-constructed ``FixedSizeSplitter`` when no ``text_splitter``
        is set; otherwise the result of parsing the configured one with the
        global data.
    """
    if not self.text_splitter:
        return FixedSizeSplitter()
    return self.text_splitter.parse(self._global_data)  # type: ignore

def _get_run_params_for_splitter(self) -> dict[str, Any]:
if self.text_splitter:
return self.text_splitter.get_run_params(self._global_data)
return {}

def _get_chunk_embedder(self) -> TextChunkEmbedder:
    """Build a TextChunkEmbedder around the embedder from ``get_default_embedder``."""
    default_embedder = self.get_default_embedder()
    return TextChunkEmbedder(embedder=default_embedder)

Expand Down Expand Up @@ -123,6 +135,11 @@ def _get_writer(self) -> KGWriter:
neo4j_database=self.neo4j_database,
)

def _get_run_params_for_writer(self) -> dict[str, Any]:
if self.kg_writer:
return self.kg_writer.get_run_params(self._global_data)
return {}

def _get_resolver(self) -> Optional[EntityResolver]:
if not self.perform_entity_resolution:
return None
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/experimental/components/test_pdf_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,17 @@ def pdf_loader() -> PdfLoader:


@pytest.fixture
def dummy_pdf_path() -> Path:
return BASE_DIR / "sample_data/lorem_ipsum.pdf"
def dummy_pdf_path() -> str:
return str(BASE_DIR / "sample_data/lorem_ipsum.pdf")


def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
expected_content = "Lorem ipsum dolor sit amet."
actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem())
assert actual_content == expected_content


def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None:
with patch(
"fsspec.implementations.local.LocalFileSystem.open",
side_effect=Exception("Failed to open"),
Expand Down
Loading