-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ability to load PDF from HTTP + specify file system from string (…
…useful for config files) (#230) * Fix PdfLoader, add ability to specify file system from string * Make PdfLoader work with config files
- Loading branch information
Showing
8 changed files
with
221 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
118 changes: 118 additions & 0 deletions
118
examples/build_graph/from_config_files/simple_kg_pipeline_config_url.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
{ | ||
"version_": "1", | ||
"template_": "SimpleKGPipeline", | ||
"neo4j_config": { | ||
"params_": { | ||
"uri": { | ||
"resolver_": "ENV", | ||
"var_": "NEO4J_URI" | ||
}, | ||
"user": { | ||
"resolver_": "ENV", | ||
"var_": "NEO4J_USER" | ||
}, | ||
"password": { | ||
"resolver_": "ENV", | ||
"var_": "NEO4J_PASSWORD" | ||
} | ||
} | ||
}, | ||
"llm_config": { | ||
"class_": "OpenAILLM", | ||
"params_": { | ||
"api_key": { | ||
"resolver_": "ENV", | ||
"var_": "OPENAI_API_KEY" | ||
}, | ||
"model_name": "gpt-4o", | ||
"model_params": { | ||
"temperature": 0, | ||
"max_tokens": 2000, | ||
"response_format": {"type": "json_object"} | ||
} | ||
} | ||
}, | ||
"embedder_config": { | ||
"class_": "OpenAIEmbeddings", | ||
"params_": { | ||
"api_key": { | ||
"resolver_": "ENV", | ||
"var_": "OPENAI_API_KEY" | ||
} | ||
} | ||
}, | ||
"from_pdf": true, | ||
"entities": [ | ||
"Person", | ||
{ | ||
"label": "House", | ||
"description": "Family the person belongs to", | ||
"properties": [ | ||
{ | ||
"name": "name", | ||
"type": "STRING" | ||
} | ||
] | ||
}, | ||
{ | ||
"label": "Planet", | ||
"properties": [ | ||
{ | ||
"name": "name", | ||
"type": "STRING" | ||
}, | ||
{ | ||
"name": "weather", | ||
"type": "STRING" | ||
} | ||
] | ||
} | ||
], | ||
"relations": [ | ||
"PARENT_OF", | ||
{ | ||
"label": "HEIR_OF", | ||
"description": "Used for inheritor relationship between father and sons" | ||
}, | ||
{ | ||
"label": "RULES", | ||
"properties": [ | ||
{ | ||
"name": "fromYear", | ||
"type": "INTEGER" | ||
} | ||
] | ||
} | ||
], | ||
"potential_schema": [ | ||
[ | ||
"Person", | ||
"PARENT_OF", | ||
"Person" | ||
], | ||
[ | ||
"Person", | ||
"HEIR_OF", | ||
"House" | ||
], | ||
[ | ||
"House", | ||
"RULES", | ||
"Planet" | ||
] | ||
], | ||
"text_splitter": { | ||
"class_": "text_splitters.fixed_size_splitter.FixedSizeSplitter", | ||
"params_": { | ||
"chunk_size": 100, | ||
"chunk_overlap": 10 | ||
} | ||
}, | ||
"pdf_loader": { | ||
"class_": "pdf_loader.PdfLoader", | ||
"run_params_": { | ||
"fs": "http" | ||
} | ||
}, | ||
"perform_entity_resolution": true | ||
} |
45 changes: 45 additions & 0 deletions
45
examples/build_graph/from_config_files/simple_kg_pipeline_from_config_file_with_url.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
"""In this example, the pipeline is defined in a JSON ('simple_kg_pipeline_config.json') | ||
or YAML ('simple_kg_pipeline_config.yaml') file. | ||
According to the configuration file, some parameters will be read from the env vars | ||
(Neo4j credentials and the OpenAI API key). | ||
""" | ||
|
||
import asyncio | ||
import logging | ||
|
||
## If env vars are in a .env file, uncomment: | ||
## (requires pip install python-dotenv) | ||
# from dotenv import load_dotenv | ||
# load_dotenv() | ||
# env vars are set manually further below (os.environ[...]) for testing: | ||
import os | ||
from pathlib import Path | ||
|
||
from neo4j_graphrag.experimental.pipeline.config.runner import PipelineRunner | ||
from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult | ||
|
||
logging.basicConfig() | ||
logging.getLogger("neo4j_graphrag").setLevel(logging.DEBUG) | ||
|
||
os.environ["NEO4J_URI"] = "bolt://localhost:7687" | ||
os.environ["NEO4J_USER"] = "neo4j" | ||
os.environ["NEO4J_PASSWORD"] = "password" | ||
# os.environ["OPENAI_API_KEY"] = "sk-..." | ||
|
||
|
||
root_dir = Path(__file__).parent | ||
file_path = root_dir / "simple_kg_pipeline_config_url.json" | ||
|
||
|
||
# File to process | ||
URL = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf" | ||
|
||
|
||
async def main() -> PipelineResult:
    """Build the pipeline from the JSON config file and run it on the remote PDF.

    Returns:
        The result returned by the pipeline run.
    """
    runner = PipelineRunner.from_config_file(file_path)
    # The URL is passed under the "file_path" key of the run inputs.
    run_inputs = {"file_path": URL}
    result = await runner.run(run_inputs)
    return result
|
||
|
||
if __name__ == "__main__": | ||
print(asyncio.run(main())) |
17 changes: 17 additions & 0 deletions
17
examples/customize/build_graph/components/loaders/pdf_loader_from_url.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
"""Use the PdfLoader component to extract text from a remote PDF file.""" | ||
|
||
import asyncio | ||
|
||
from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader | ||
|
||
url = "https://raw.githubusercontent.com/neo4j/neo4j-graphrag-python/c166afc4d5abc56a5686f3da46a97ed7c07da19d/examples/data/Harry%20Potter%20and%20the%20Chamber%20of%20Secrets%20Summary.pdf" | ||
|
||
|
||
async def main() -> None:
    """Load the PDF located at ``url`` with PdfLoader and print a short preview.

    Only the first 100 characters of the extracted text are printed.
    """
    pdf_loader = PdfLoader()
    # The file system is given as the string "http" because the source is a
    # remote URL rather than a local path.
    loaded = await pdf_loader.run(filepath=url, fs="http")
    preview = loaded.text[:100]
    print(preview)
|
||
|
||
if __name__ == "__main__": | ||
asyncio.run(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters