Merge branch 'neo4j:main' into hotfix/pr-e2e-tests

alexthomas93 · Aug 13, 2024 · 583424a · 583424a
2 parents f2bb04e + 8b6cf43
commit 583424a
Show file tree

Hide file tree

Showing 11 changed files with 202 additions and 23 deletions.
diff --git a/docs/source/user_guide_pipeline.rst b/docs/source/user_guide_pipeline.rst
@@ -10,9 +10,10 @@ This page provides information about how to create a pipeline.
 
     Pipelines run asynchronously, see examples below.
 
-******************************
+
+*******************
 Creating Components
-******************************
+*******************
 
 Components are asynchronous units of work that perform simple tasks,
 such as chunking documents or saving results to Neo4j.

diff --git a/examples/pipeline/Harry Potter and the Death Hallows Summary.pdf b/examples/pipeline/Harry Potter and the Death Hallows Summary.pdf
diff --git a/examples/pipeline/kg_builder_with_schema.py → examples/pipeline/kg_builder_from_pdf.py b/examples/pipeline/kg_builder_with_schema.py → examples/pipeline/kg_builder_from_pdf.py
@@ -18,12 +18,24 @@
 import logging
 from typing import Any
 
+import neo4j
+from langchain_text_splitters import CharacterTextSplitter
+from neo4j_genai.experimental.components.entity_relation_extractor import (
+    LLMEntityRelationExtractor,
+    OnError,
+)
+from neo4j_genai.experimental.components.kg_writer import Neo4jWriter
+from neo4j_genai.experimental.components.pdf_loader import PdfLoader
 from neo4j_genai.experimental.components.schema import (
     SchemaBuilder,
     SchemaEntity,
     SchemaRelation,
 )
+from neo4j_genai.experimental.components.text_splitters.langchain import (
+    LangChainTextSplitterAdapter,
+)
 from neo4j_genai.experimental.pipeline import Component, DataModel
+from neo4j_genai.llm import OpenAILLM
 from pydantic import BaseModel, validate_call
 
 logging.basicConfig(level=logging.DEBUG)
@@ -86,7 +98,7 @@ async def run(self, graph: Neo4jGraph) -> WriterModel:
         )
 
 
-if __name__ == "__main__":
+async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
     from neo4j_genai.experimental.pipeline import Pipeline
 
     # Instantiate Entity and Relation objects
@@ -96,34 +108,62 @@ async def run(self, graph: Neo4jGraph) -> WriterModel:
             label="ORGANIZATION",
             description="A structured group of people with a common purpose.",
         ),
+        SchemaEntity(label="LOCATION", description="A location or place."),
         SchemaEntity(
-            label="AGE",
+            label="HORCRUX",
+            description="A magical item in the Harry Potter universe.",
         ),
     ]
     relations = [
         SchemaRelation(
-            label="EMPLOYED_BY", description="Indicates employment relationship."
+            label="SITUATED_AT", description="Indicates the location of a person."
         ),
         SchemaRelation(
-            label="ORGANIZED_BY",
-            description="Indicates organization responsible for an event.",
+            label="LED_BY",
+            description="Indicates the leader of an organization.",
         ),
         SchemaRelation(
-            label="ATTENDED_BY", description="Indicates attendance at an event."
+            label="OWNS",
+            description="Indicates the ownership of an item such as a Horcrux.",
+        ),
+        SchemaRelation(
+            label="INTERACTS", description="The interaction between two people."
         ),
     ]
     potential_schema = [
-        ("PERSON", "EMPLOYED_BY", "ORGANIZATION"),
-        ("ORGANIZATION", "ATTENDED_BY", "PERSON"),
+        ("PERSON", "SITUATED_AT", "LOCATION"),
+        ("PERSON", "INTERACTS", "PERSON"),
+        ("PERSON", "OWNS", "HORCRUX"),
+        ("ORGANIZATION", "LED_BY", "PERSON"),
     ]
 
     # Set up the pipeline
     pipe = Pipeline()
-    pipe.add_component("chunker", DocumentChunker())
+    pipe.add_component("pdf_loader", PdfLoader())
+    pipe.add_component(
+        "splitter",
+        LangChainTextSplitterAdapter(
+            # chunk_size=50 for the sake of this demo
+            CharacterTextSplitter(chunk_size=50, chunk_overlap=10, separator=".")
+        ),
+    )
     pipe.add_component("schema", SchemaBuilder())
-    pipe.add_component("extractor", ERExtractor())
-    pipe.add_component("writer", Writer())
-    pipe.connect("chunker", "extractor", input_config={"chunks": "chunker.chunks"})
+    pipe.add_component(
+        "extractor",
+        LLMEntityRelationExtractor(
+            llm=OpenAILLM(
+                model_name="gpt-4o",
+                model_params={
+                    "max_tokens": 1000,
+                    "response_format": {"type": "json_object"},
+                },
+            ),
+            on_error=OnError.RAISE,
+        ),
+    )
+    pipe.add_component("writer", Neo4jWriter(neo4j_driver))
+    pipe.connect("pdf_loader", "splitter", input_config={"text": "pdf_loader.text"})
+    pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"})
     pipe.connect("schema", "extractor", input_config={"schema": "schema"})
     pipe.connect(
         "extractor",
@@ -132,15 +172,20 @@ async def run(self, graph: Neo4jGraph) -> WriterModel:
     )
 
     pipe_inputs = {
-        "chunker": {
-            "text": """Graphs are everywhere.
-            GraphRAG is the future of Artificial Intelligence.
-            Robots are already running the world."""
+        "pdf_loader": {
+            "filepath": "examples/pipeline/Harry Potter and the Death Hallows Summary.pdf"
         },
         "schema": {
             "entities": entities,
             "relations": relations,
             "potential_schema": potential_schema,
         },
     }
-    print(asyncio.run(pipe.run(pipe_inputs)))
+    return await pipe.run(pipe_inputs)
+
+
+if __name__ == "__main__":
+    with neo4j.GraphDatabase.driver(
+        "bolt://localhost:7687", auth=("neo4j", "password")
+    ) as driver:
+        print(asyncio.run(main(driver)))
diff --git a/examples/pipeline/kg_builder.py → examples/pipeline/kg_builder_from_text.py b/examples/pipeline/kg_builder.py → examples/pipeline/kg_builder_from_text.py
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,6 +36,7 @@ weaviate-client = {version = "^4.6.1", optional = true}
 pinecone-client = {version = "^4.1.0", optional = true}
 types-mock = "^5.1.0.20240425"
 eval-type-backport = "^0.2.0"
+pypdf = "^4.3.1"
 
 [tool.poetry.group.dev.dependencies]
 pylint = "^3.1.0"

diff --git a/src/neo4j_genai/exceptions.py b/src/neo4j_genai/exceptions.py
@@ -106,7 +106,13 @@ class SchemaFetchError(Neo4jGenAiError):
     pass
 
 
-class SchemaValidationError(Exception):
+class SchemaValidationError(Neo4jGenAiError):
     """Custom exception for errors in schema configuration."""
 
     pass
+
+
+class PdfLoaderError(Neo4jGenAiError):
+    """Custom exception for errors in PDF loader."""
+
+    pass
diff --git a/src/neo4j_genai/experimental/components/pdf_loader.py b/src/neo4j_genai/experimental/components/pdf_loader.py
@@ -0,0 +1,78 @@
+#  Copyright (c) "Neo4j"
+#  Neo4j Sweden AB [https://neo4j.com]
+#  #
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#      https://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import io
+from abc import abstractmethod
+from pathlib import Path
+from typing import Optional, Union
+
+import fsspec
+import pypdf
+from fsspec import AbstractFileSystem
+from fsspec.implementations.local import LocalFileSystem
+
+from neo4j_genai.exceptions import PdfLoaderError
+from neo4j_genai.experimental.pipeline import Component, DataModel
+
+
+class PdfDocument(DataModel):
+    text: str
+
+
+class DataLoader(Component):
+    """
+    Interface for loading data of various input types.
+    """
+
+    @abstractmethod
+    async def run(self, filepath: Path) -> PdfDocument:
+        pass
+
+
+def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
+    return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir
+
+
+class PdfLoader(DataLoader):
+    @staticmethod
+    def load_file(
+        file: Union[Path, str],
+        fs: Optional[AbstractFileSystem] = None,
+    ) -> str:
+        """Parse PDF file and return text."""
+        if not isinstance(file, Path):
+            file = Path(file)
+
+        fs = fs or LocalFileSystem()
+
+        try:
+            with fs.open(file, "rb") as fp:
+                stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
+                pdf = pypdf.PdfReader(stream)
+                num_pages = len(pdf.pages)
+                text_parts = (
+                    pdf.pages[page].extract_text() for page in range(num_pages)
+                )
+                full_text = "\n".join(text_parts)
+
+                return full_text
+        except Exception as e:
+            raise PdfLoaderError(e)
+
+    async def run(
+        self,
+        filepath: Path,
+        fs: Optional[AbstractFileSystem] = None,
+    ) -> PdfDocument:
+        return PdfDocument(text=self.load_file(filepath, fs))
diff --git a/src/neo4j_genai/experimental/components/schema.py b/src/neo4j_genai/experimental/components/schema.py
@@ -92,7 +92,7 @@ def check_schema(cls, data: Dict[str, Any]) -> Dict[str, Any]:
                 )
             if entity2 not in entities:
                 raise SchemaValidationError(
-                    f"Entity '{entity1}' is not defined in the provided entities."
+                    f"Entity '{entity2}' is not defined in the provided entities."
                 )
 
         return data

diff --git a/tests/unit/experimental/components/sample_data/lorem_ipsum.pdf b/tests/unit/experimental/components/sample_data/lorem_ipsum.pdf
diff --git a/tests/unit/experimental/components/test_pdf_loader.py b/tests/unit/experimental/components/test_pdf_loader.py
@@ -0,0 +1,48 @@
+#  Copyright (c) "Neo4j"
+#  Neo4j Sweden AB [https://neo4j.com]
+#  #
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  #
+#      https://www.apache.org/licenses/LICENSE-2.0
+#  #
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+from neo4j_genai.exceptions import PdfLoaderError
+from neo4j_genai.experimental.components.pdf_loader import PdfLoader
+
+BASE_DIR = Path(__file__).parent
+
+
+@pytest.fixture
+def pdf_loader() -> PdfLoader:
+    return PdfLoader()
+
+
+@pytest.fixture
+def dummy_pdf_path() -> Path:
+    return BASE_DIR / "sample_data/lorem_ipsum.pdf"
+
+
+def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
+    expected_content = "Lorem ipsum dolor sit amet."
+    actual_content = pdf_loader.load_file(dummy_pdf_path)
+    assert actual_content == expected_content
+
+
+def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
+    with patch(
+        "fsspec.implementations.local.LocalFileSystem.open",
+        side_effect=Exception("Failed to open"),
+    ):
+        with pytest.raises(PdfLoaderError):
+            pdf_loader.load_file(dummy_pdf_path)