forked from neo4j/neo4j-graphrag-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'neo4j:main' into hotfix/pr-e2e-tests
- Loading branch information
Showing
11 changed files
with
202 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# Copyright (c) "Neo4j" | ||
# Neo4j Sweden AB [https://neo4j.com] | ||
# # | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# # | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# # | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import io | ||
from abc import abstractmethod | ||
from pathlib import Path | ||
from typing import Optional, Union | ||
|
||
import fsspec | ||
import pypdf | ||
from fsspec import AbstractFileSystem | ||
from fsspec.implementations.local import LocalFileSystem | ||
|
||
from neo4j_genai.exceptions import PdfLoaderError | ||
from neo4j_genai.experimental.pipeline import Component, DataModel | ||
|
||
|
||
# Result model produced by the loaders in this module.
class PdfDocument(DataModel):
    # Text content of the loaded document.
    text: str
|
||
|
||
class DataLoader(Component):
    """Base interface for components that load data from an input file."""

    @abstractmethod
    async def run(self, filepath: Path) -> PdfDocument:
        """Load the file at ``filepath`` and return it as a ``PdfDocument``.

        Declared abstract: concrete loaders supply the implementation.
        """
        ...
|
||
|
||
def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
    """Return True when ``fs`` is a ``LocalFileSystem`` with ``auto_mkdir`` off."""
    if not isinstance(fs, LocalFileSystem):
        return False
    return not fs.auto_mkdir
|
||
|
||
class PdfLoader(DataLoader):
    """Loader that extracts the text of a PDF file with ``pypdf``."""

    @staticmethod
    def load_file(
        file: Union[Path, str],
        fs: Optional[AbstractFileSystem] = None,
    ) -> str:
        """Parse a PDF file and return its text.

        Args:
            file: Location of the PDF. A string is converted to ``Path``.
            fs: Filesystem used to open ``file``. A new ``LocalFileSystem``
                is used when this is omitted.

        Returns:
            The extracted text of every page, joined with newlines.

        Raises:
            PdfLoaderError: If opening or parsing the file fails. The
                original exception is attached as ``__cause__``.
        """
        if not isinstance(file, Path):
            file = Path(file)

        fs = fs or LocalFileSystem()

        try:
            with fs.open(file, "rb") as fp:
                # The open handle is given to pypdf directly only for the
                # default local filesystem; for any other filesystem the
                # content is first read fully into an in-memory buffer.
                stream = fp if is_default_fs(fs) else io.BytesIO(fp.read())
                pdf = pypdf.PdfReader(stream)
                return "\n".join(page.extract_text() for page in pdf.pages)
        except Exception as e:
            # Chain explicitly so the underlying failure stays visible as the
            # direct cause of the PdfLoaderError in tracebacks.
            raise PdfLoaderError(e) from e

    async def run(
        self,
        filepath: Path,
        fs: Optional[AbstractFileSystem] = None,
    ) -> PdfDocument:
        """Load ``filepath`` and wrap the extracted text in a ``PdfDocument``.

        Args:
            filepath: Location of the PDF to load.
            fs: Optional filesystem forwarded to :meth:`load_file`.

        Returns:
            A ``PdfDocument`` whose ``text`` is the extracted PDF text.

        Raises:
            PdfLoaderError: Propagated from :meth:`load_file`.
        """
        # NOTE: load_file is a plain synchronous call made from this coroutine.
        return PdfDocument(text=self.load_file(filepath, fs))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Copyright (c) "Neo4j" | ||
# Neo4j Sweden AB [https://neo4j.com] | ||
# # | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# # | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# # | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from pathlib import Path | ||
from unittest.mock import patch | ||
|
||
import pytest | ||
from neo4j_genai.exceptions import PdfLoaderError | ||
from neo4j_genai.experimental.components.pdf_loader import PdfLoader | ||
|
||
BASE_DIR = Path(__file__).parent | ||
|
||
|
||
@pytest.fixture
def pdf_loader() -> PdfLoader:
    """Provide a new ``PdfLoader`` instance to the requesting test."""
    loader = PdfLoader()
    return loader
|
||
|
||
@pytest.fixture
def dummy_pdf_path() -> Path:
    """Path of the sample PDF, resolved relative to ``BASE_DIR``."""
    return BASE_DIR.joinpath("sample_data/lorem_ipsum.pdf")
|
||
|
||
def test_pdf_loading(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
    """``load_file`` returns the expected text for the sample PDF."""
    extracted = pdf_loader.load_file(dummy_pdf_path)
    assert extracted == "Lorem ipsum dolor sit amet."
|
||
|
||
def test_pdf_processing_error(pdf_loader: PdfLoader, dummy_pdf_path: Path) -> None:
    """A failure while opening the file is reported as ``PdfLoaderError``."""
    failing_open = patch(
        "fsspec.implementations.local.LocalFileSystem.open",
        side_effect=Exception("Failed to open"),
    )
    with failing_open, pytest.raises(PdfLoaderError):
        pdf_loader.load_file(dummy_pdf_path)