From 39fd4f74e10892a6fa7b4398a97cb71e1252fa52 Mon Sep 17 00:00:00 2001 From: Estelle Scifo Date: Mon, 30 Dec 2024 09:53:00 +0100 Subject: [PATCH] Add ability to load PDF from HTTP + specify file system from string (useful for config files) (#230) * Fix PdfLoader, add ability to specify file system from string * Make PdfLoader work with config files --- examples/README.md | 1 + .../simple_kg_pipeline_config_url.json | 118 ++++++++++++++++++ ...e_kg_pipeline_from_config_file_with_url.py | 45 +++++++ .../components/loaders/pdf_loader_from_url.py | 17 +++ .../experimental/components/pdf_loader.py | 18 +-- .../pipeline/config/object_config.py | 9 ++ .../template_pipeline/simple_kg_builder.py | 17 +++ .../components/test_pdf_loader.py | 8 +- 8 files changed, 221 insertions(+), 12 deletions(-) create mode 100644 examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json create mode 100644 examples/build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py create mode 100644 examples/customize/build_graph/components/loaders/pdf_loader_from_url.py diff --git a/examples/README.md b/examples/README.md index a7308660..6d56bbd0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -15,6 +15,7 @@ are listed in [the last section of this file](#customize). 
- [End to end PDF to graph simple pipeline](build_graph/simple_kg_builder_from_pdf.py) - [End to end text to graph simple pipeline](build_graph/simple_kg_builder_from_text.py) - [Build KG pipeline from config file](build_graph/from_config_files/simple_kg_pipeline_from_config_file.py) +- [Build KG pipeline with PDF URL](build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py) ## Retrieve diff --git a/examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json b/examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json new file mode 100644 index 00000000..12cb2237 --- /dev/null +++ b/examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json @@ -0,0 +1,118 @@ +{ + "version_": "1", + "template_": "SimpleKGPipeline", + "neo4j_config": { + "params_": { + "uri": { + "resolver_": "ENV", + "var_": "NEO4J_URI" + }, + "user": { + "resolver_": "ENV", + "var_": "NEO4J_USER" + }, + "password": { + "resolver_": "ENV", + "var_": "NEO4J_PASSWORD" + } + } + }, + "llm_config": { + "class_": "OpenAILLM", + "params_": { + "api_key": { + "resolver_": "ENV", + "var_": "OPENAI_API_KEY" + }, + "model_name": "gpt-4o", + "model_params": { + "temperature": 0, + "max_tokens": 2000, + "response_format": {"type": "json_object"} + } + } + }, + "embedder_config": { + "class_": "OpenAIEmbeddings", + "params_": { + "api_key": { + "resolver_": "ENV", + "var_": "OPENAI_API_KEY" + } + } + }, + "from_pdf": true, + "entities": [ + "Person", + { + "label": "House", + "description": "Family the person belongs to", + "properties": [ + { + "name": "name", + "type": "STRING" + } + ] + }, + { + "label": "Planet", + "properties": [ + { + "name": "name", + "type": "STRING" + }, + { + "name": "weather", + "type": "STRING" + } + ] + } + ], + "relations": [ + "PARENT_OF", + { + "label": "HEIR_OF", + "description": "Used for inheritor relationship between father and sons" + }, + { + "label": "RULES", + "properties": [ + { + "name": "fromYear", + 
"type": "INTEGER" + } + ] + } + ], + "potential_schema": [ + [ + "Person", + "PARENT_OF", + "Person" + ], + [ + "Person", + "HEIR_OF", + "House" + ], + [ + "House", + "RULES", + "Planet" + ] + ], + "text_splitter": { + "class_": "text_splitters.fixed_size_splitter.FixedSizeSplitter", + "params_": { + "chunk_size": 100, + "chunk_overlap": 10 + } + }, + "pdf_loader": { + "class_": "pdf_loader.PdfLoader", + "run_params_": { + "fs": "http" + } + }, + "perform_entity_resolution": true +} diff --git a/examples/build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py b/examples/build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py new file mode 100644 index 00000000..2114e928 --- /dev/null +++ b/examples/build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py @@ -0,0 +1,45 @@ +"""In this example, the pipeline is defined in a JSON file ('simple_kg_pipeline_config_url.json') +and the PDF to process is loaded from a URL instead of a local path. + +According to the configuration file, some parameters will be read from the env vars +(Neo4j credentials and the OpenAI API key). +""" + +import asyncio +import logging + +## If env vars are in a .env file, uncomment: +## (requires pip install python-dotenv) +# from dotenv import load_dotenv +# load_dotenv() +# env vars manually set for testing: +import os +from pathlib import Path + +from neo4j_graphrag.experimental.pipeline.config.runner import PipelineRunner +from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult + +logging.basicConfig() +logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) + +os.environ["NEO4J_URI"] = "bolt://localhost:7687" +os.environ["NEO4J_USER"] = "neo4j" +os.environ["NEO4J_PASSWORD"] = "password" +# os.environ["OPENAI_API_KEY"] = "sk-..." 
+ + +root_dir = Path(__file__).parent +file_path = root_dir / "simple_kg_pipeline_config_url.json" + + +# File to process +URL = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf" + + +async def main() -> PipelineResult: + pipeline = PipelineRunner.from_config_file(file_path) + return await pipeline.run({"file_path": URL}) + + +if __name__ == "__main__": + print(asyncio.run(main())) diff --git a/examples/customize/build_graph/components/loaders/pdf_loader_from_url.py b/examples/customize/build_graph/components/loaders/pdf_loader_from_url.py new file mode 100644 index 00000000..b880a891 --- /dev/null +++ b/examples/customize/build_graph/components/loaders/pdf_loader_from_url.py @@ -0,0 +1,17 @@ +"""Use the PdfLoader component to extract text from a remote PDF file.""" + +import asyncio + +from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader + +url = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf" + + +async def main() -> None: + loader = PdfLoader() + document = await loader.run(filepath=url, fs="http") + print(document.text[:100]) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/neo4j_graphrag/experimental/components/pdf_loader.py b/src/neo4j_graphrag/experimental/components/pdf_loader.py index 2cd3fca1..c925c677 100644 --- a/src/neo4j_graphrag/experimental/components/pdf_loader.py +++ b/src/neo4j_graphrag/experimental/components/pdf_loader.py @@ -62,13 +62,10 @@ def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool: class PdfLoader(DataLoader): @staticmethod def load_file( - file: Union[Path, str], + file: str, fs: AbstractFileSystem, ) -> str: """Parse PDF file and return text.""" - if not isinstance(file, Path): - file = Path(file) - try: with 
fs.open(file, "rb") as fp: stream = fp if is_default_fs(fs) else io.BytesIO(fp.read()) @@ -85,16 +82,21 @@ def load_file( async def run( self, - filepath: Path, + filepath: Union[str, Path], metadata: Optional[Dict[str, str]] = None, - fs: Optional[AbstractFileSystem] = None, + fs: Optional[Union[AbstractFileSystem, str]] = None, ) -> PdfDocument: - fs = fs or LocalFileSystem() + if not isinstance(filepath, str): + filepath = str(filepath) + if isinstance(fs, str): + fs = fsspec.filesystem(fs) + elif fs is None: + fs = LocalFileSystem() text = self.load_file(filepath, fs) return PdfDocument( text=text, document_info=DocumentInfo( - path=str(filepath), + path=filepath, metadata=self.get_document_metadata(text, metadata), ), ) diff --git a/src/neo4j_graphrag/experimental/pipeline/config/object_config.py b/src/neo4j_graphrag/experimental/pipeline/config/object_config.py index cb47b380..deeee200 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/object_config.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/object_config.py @@ -254,6 +254,10 @@ class ComponentConfig(ObjectConfig[Component]): DEFAULT_MODULE = "neo4j_graphrag.experimental.components" INTERFACE = Component + def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]: + self._global_data = resolved_data + return self.resolve_params(self.run_params_) + class ComponentType(RootModel): # type: ignore[type-arg] root: Union[Component, ComponentConfig] @@ -264,3 +268,8 @@ def parse(self, resolved_data: dict[str, Any] | None = None) -> Component: if isinstance(self.root, Component): return self.root return self.root.parse(resolved_data) + + def get_run_params(self, resolved_data: dict[str, Any]) -> dict[str, Any]: + if isinstance(self.root, Component): + return {} + return self.root.get_run_params(resolved_data) diff --git a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py 
b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py index 73edfd9a..14ee112a 100644 --- a/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py +++ b/src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py @@ -90,11 +90,23 @@ def _get_pdf_loader(self) -> Optional[PdfLoader]: return self.pdf_loader.parse(self._global_data) # type: ignore return PdfLoader() + def _get_run_params_for_pdf_loader(self) -> dict[str, Any]: + if not self.from_pdf: + return {} + if self.pdf_loader: + return self.pdf_loader.get_run_params(self._global_data) + return {} + def _get_splitter(self) -> TextSplitter: if self.text_splitter: return self.text_splitter.parse(self._global_data) # type: ignore return FixedSizeSplitter() + def _get_run_params_for_splitter(self) -> dict[str, Any]: + if self.text_splitter: + return self.text_splitter.get_run_params(self._global_data) + return {} + def _get_chunk_embedder(self) -> TextChunkEmbedder: return TextChunkEmbedder(embedder=self.get_default_embedder()) @@ -123,6 +135,11 @@ def _get_writer(self) -> KGWriter: neo4j_database=self.neo4j_database, ) + def _get_run_params_for_writer(self) -> dict[str, Any]: + if self.kg_writer: + return self.kg_writer.get_run_params(self._global_data) + return {} + def _get_resolver(self) -> Optional[EntityResolver]: if not self.perform_entity_resolution: return None diff --git a/tests/unit/experimental/components/test_pdf_loader.py b/tests/unit/experimental/components/test_pdf_loader.py index 949e2322..e5fe51cf 100644 --- a/tests/unit/experimental/components/test_pdf_loader.py +++ b/tests/unit/experimental/components/test_pdf_loader.py @@ -30,17 +30,17 @@ def pdf_loader() -> PdfLoader: @pytest.fixture -def dummy_pdf_path() -> Path: - return BASE_DIR / "sample_data/lorem_ipsum.pdf" +def dummy_pdf_path() -> str: + return str(BASE_DIR / "sample_data/lorem_ipsum.pdf") -def test_pdf_loading(pdf_loader: PdfLoader, 
dummy_pdf_path: Path) -> None: +def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None: expected_content = "Lorem ipsum dolor sit amet." actual_content = pdf_loader.load_file(dummy_pdf_path, fs=LocalFileSystem()) assert actual_content == expected_content -def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None: +def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: str) -> None: with patch( "fsspec.implementations.local.LocalFileSystem.open", side_effect=Exception("Failed to open"),