Skip to content

Commit

Permalink
Merge branch 'neo4j:main' into hotfix/pr-e2e-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
alexthomas93 authored Aug 13, 2024
2 parents f2bb04e + 8b6cf43 commit 583424a
Show file tree
Hide file tree
Showing 11 changed files with 202 additions and 23 deletions.
5 changes: 3 additions & 2 deletions docs/source/user_guide_pipeline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ This page provides information about how to create a pipeline.

Pipelines run asynchronously, see examples below.

******************************

*******************
Creating Components
******************************
*******************

Components are asynchronous units of work that perform simple tasks,
such as chunking documents or saving results to Neo4j.
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,24 @@
import logging
from typing import Any

import neo4j
from langchain_text_splitters import CharacterTextSplitter
from neo4j_genai.experimental.components.entity_relation_extractor import (
LLMEntityRelationExtractor,
OnError,
)
from neo4j_genai.experimental.components.kg_writer import Neo4jWriter
from neo4j_genai.experimental.components.pdf_loader import PdfLoader
from neo4j_genai.experimental.components.schema import (
SchemaBuilder,
SchemaEntity,
SchemaRelation,
)
from neo4j_genai.experimental.components.text_splitters.langchain import (
LangChainTextSplitterAdapter,
)
from neo4j_genai.experimental.pipeline import Component, DataModel
from neo4j_genai.llm import OpenAILLM
from pydantic import BaseModel, validate_call

logging.basicConfig(level=logging.DEBUG)
Expand Down Expand Up @@ -86,7 +98,7 @@ async def run(self, graph: Neo4jGraph) -> WriterModel:
)


if __name__ == "__main__":
async def main(neo4j_driver: neo4j.Driver) -> dict[str, Any]:
from neo4j_genai.experimental.pipeline import Pipeline

# Instantiate Entity and Relation objects
Expand All @@ -96,34 +108,62 @@ async def run(self, graph: Neo4jGraph) -> WriterModel:
label="ORGANIZATION",
description="A structured group of people with a common purpose.",
),
SchemaEntity(label="LOCATION", description="A location or place."),
SchemaEntity(
label="AGE",
label="HORCRUX",
description="A magical item in the Harry Potter universe.",
),
]
relations = [
SchemaRelation(
label="EMPLOYED_BY", description="Indicates employment relationship."
label="SITUATED_AT", description="Indicates the location of a person."
),
SchemaRelation(
label="ORGANIZED_BY",
description="Indicates organization responsible for an event.",
label="LED_BY",
description="Indicates the leader of an organization.",
),
SchemaRelation(
label="ATTENDED_BY", description="Indicates attendance at an event."
label="OWNS",
description="Indicates the ownership of an item such as a Horcrux.",
),
SchemaRelation(
label="INTERACTS", description="The interaction between two people."
),
]
potential_schema = [
("PERSON", "EMPLOYED_BY", "ORGANIZATION"),
("ORGANIZATION", "ATTENDED_BY", "PERSON"),
("PERSON", "SITUATED_AT", "LOCATION"),
("PERSON", "INTERACTS", "PERSON"),
("PERSON", "OWNS", "HORCRUX"),
("ORGANIZATION", "LED_BY", "PERSON"),
]

# Set up the pipeline
pipe = Pipeline()
pipe.add_component("chunker", DocumentChunker())
pipe.add_component("pdf_loader", PdfLoader())
pipe.add_component(
"splitter",
LangChainTextSplitterAdapter(
# chunk_size=50 for the sake of this demo
CharacterTextSplitter(chunk_size=50, chunk_overlap=10, separator=".")
),
)
pipe.add_component("schema", SchemaBuilder())
pipe.add_component("extractor", ERExtractor())
pipe.add_component("writer", Writer())
pipe.connect("chunker", "extractor", input_config={"chunks": "chunker.chunks"})
pipe.add_component(
"extractor",
LLMEntityRelationExtractor(
llm=OpenAILLM(
model_name="gpt-4o",
model_params={
"max_tokens": 1000,
"response_format": {"type": "json_object"},
},
),
on_error=OnError.RAISE,
),
)
pipe.add_component("writer", Neo4jWriter(neo4j_driver))
pipe.connect("pdf_loader", "splitter", input_config={"text": "pdf_loader.text"})
pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"})
pipe.connect("schema", "extractor", input_config={"schema": "schema"})
pipe.connect(
"extractor",
Expand All @@ -132,15 +172,20 @@ async def run(self, graph: Neo4jGraph) -> WriterModel:
)

pipe_inputs = {
"chunker": {
"text": """Graphs are everywhere.
GraphRAG is the future of Artificial Intelligence.
Robots are already running the world."""
"pdf_loader": {
"filepath": "examples/pipeline/Harry Potter and the Death Hallows Summary.pdf"
},
"schema": {
"entities": entities,
"relations": relations,
"potential_schema": potential_schema,
},
}
print(asyncio.run(pipe.run(pipe_inputs)))
return await pipe.run(pipe_inputs)


if __name__ == "__main__":
with neo4j.GraphDatabase.driver(
"bolt://localhost:7687", auth=("neo4j", "password")
) as driver:
print(asyncio.run(main(driver)))
File renamed without changes.
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ weaviate-client = {version = "^4.6.1", optional = true}
pinecone-client = {version = "^4.1.0", optional = true}
types-mock = "^5.1.0.20240425"
eval-type-backport = "^0.2.0"
pypdf = "^4.3.1"

[tool.poetry.group.dev.dependencies]
pylint = "^3.1.0"
Expand Down
8 changes: 7 additions & 1 deletion src/neo4j_genai/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,13 @@ class SchemaFetchError(Neo4jGenAiError):
pass


class SchemaValidationError(Exception):
class SchemaValidationError(Neo4jGenAiError):
"""Custom exception for errors in schema configuration."""

pass


class PdfLoaderError(Neo4jGenAiError):
"""Custom exception for errors in PDF loader."""

pass
78 changes: 78 additions & 0 deletions src/neo4j_genai/experimental/components/pdf_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) "Neo4j"
# Neo4j Sweden AB [https://neo4j.com]
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# https://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
from abc import abstractmethod
from pathlib import Path
from typing import Optional, Union

import fsspec
import pypdf
from fsspec import AbstractFileSystem
from fsspec.implementations.local import LocalFileSystem

from neo4j_genai.exceptions import PdfLoaderError
from neo4j_genai.experimental.pipeline import Component, DataModel


class PdfDocument(DataModel):
text: str


class DataLoader(Component):
"""
Interface for loading data of various input types.
"""

@abstractmethod
async def run(self, filepath: Path) -> PdfDocument:
pass


def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
return isinstance(fs, LocalFileSystem) and not fs.auto_mkdir


class PdfLoader(DataLoader):
@staticmethod
def load_file(
file: Union[Path, str],
fs: Optional[AbstractFileSystem] = None,
) -> str:
"""Parse PDF file and return text."""
if not isinstance(file, Path):
file = Path(file)

fs = fs or LocalFileSystem()

try:
with fs.open(file, "rb") as fp:
stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
pdf = pypdf.PdfReader(stream)
num_pages = len(pdf.pages)
text_parts = (
pdf.pages[page].extract_text() for page in range(num_pages)
)
full_text = "\n".join(text_parts)

return full_text
except Exception as e:
raise PdfLoaderError(e)

async def run(
self,
filepath: Path,
fs: Optional[AbstractFileSystem] = None,
) -> PdfDocument:
return PdfDocument(text=self.load_file(filepath, fs))
2 changes: 1 addition & 1 deletion src/neo4j_genai/experimental/components/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def check_schema(cls, data: Dict[str, Any]) -> Dict[str, Any]:
)
if entity2 not in entities:
raise SchemaValidationError(
f"Entity '{entity1}' is not defined in the provided entities."
f"Entity '{entity2}' is not defined in the provided entities."
)

return data
Expand Down
Binary file not shown.
48 changes: 48 additions & 0 deletions tests/unit/experimental/components/test_pdf_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright (c) "Neo4j"
# Neo4j Sweden AB [https://neo4j.com]
# #
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# #
# https://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from unittest.mock import patch

import pytest
from neo4j_genai.exceptions import PdfLoaderError
from neo4j_genai.experimental.components.pdf_loader import PdfLoader

BASE_DIR = Path(__file__).parent


@pytest.fixture
def pdf_loader() -> PdfLoader:
return PdfLoader()


@pytest.fixture
def dummy_pdf_path() -> Path:
return BASE_DIR / "sample_data/lorem_ipsum.pdf"


def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
expected_content = "Lorem ipsum dolor sit amet."
actual_content = pdf_loader.load_file(dummy_pdf_path)
assert actual_content == expected_content


def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
with patch(
"fsspec.implementations.local.LocalFileSystem.open",
side_effect=Exception("Failed to open"),
):
with pytest.raises(PdfLoaderError):
pdf_loader.load_file(dummy_pdf_path)

0 comments on commit 583424a

Please sign in to comment.