diff --git a/.github/workflows/pr-e2e-tests.yaml b/.github/workflows/pr-e2e-tests.yaml index d4d29015..0480f907 100644 --- a/.github/workflows/pr-e2e-tests.yaml +++ b/.github/workflows/pr-e2e-tests.yaml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.12'] + python-version: ['3.9', '3.12'] neo4j-version: - 5 neo4j-edition: diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 3bdc15e3..39c73472 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] + python-version: [ '3.9', '3.10', '3.11', '3.12' ] steps: - name: Install graphviz package run: sudo apt install graphviz graphviz-dev diff --git a/.github/workflows/pre-release.yaml b/.github/workflows/pre-release.yaml new file mode 100644 index 00000000..b60d3943 --- /dev/null +++ b/.github/workflows/pre-release.yaml @@ -0,0 +1,48 @@ +# Manually triggered: bumps the version with `poetry version prerelease`, +# adds the new version heading to CHANGELOG.md, then commits and tags it. +name: Publish a new Alpha release (x.y.0a0 -> x.y.0a1) 🚀 + +on: + workflow_dispatch: + +jobs: + bump-version: + outputs: + version: ${{ steps.get-version.outputs.version }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + token: ${{ secrets.GIT_PUSH_PAT }} + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - name: Install and configure Poetry + uses: snok/install-poetry@v1 + with: + version: 1.8.2 + virtualenvs-create: false + virtualenvs-in-project: false + installer-parallel: true + + - name: Bump version + run: poetry version prerelease + + - name: Get version + id: get-version + run: echo "version=$(poetry version -s)" >> "$GITHUB_OUTPUT" + - name: Print version + run: | + echo Version: ${{ steps.get-version.outputs.version }} + - name: Update CHANGELOG.md (cross platform supported) + run: | + sed -i.bak -e 's/## Next/## Next\n\n## ${{ steps.get-version.outputs.version }}/' CHANGELOG.md && rm CHANGELOG.md.bak + - uses:
EndBug/add-and-commit@v9 + with: + author_name: 'Neo4j-GraphRAG GitHub Action' + author_email: 'team-gen-ai@neo4j.com' + message: 'Bump version to ${{ steps.get-version.outputs.version }}' + add: "['pyproject.toml', 'CHANGELOG.md']" + tag: '${{ steps.get-version.outputs.version }}' diff --git a/.github/workflows/premajor-release.yaml b/.github/workflows/premajor-release.yaml new file mode 100644 index 00000000..6711988c --- /dev/null +++ b/.github/workflows/premajor-release.yaml @@ -0,0 +1,48 @@ +# Manually triggered: bumps the version with `poetry version premajor`, +# adds the new version heading to CHANGELOG.md, then commits and tags it. +name: Publish a new Alpha Major release (0.4.0 -> 1.0.0a0) 🚀 + +on: + workflow_dispatch: + +jobs: + bump-version: + outputs: + version: ${{ steps.get-version.outputs.version }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + token: ${{ secrets.GIT_PUSH_PAT }} + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - name: Install and configure Poetry + uses: snok/install-poetry@v1 + with: + version: 1.8.2 + virtualenvs-create: false + virtualenvs-in-project: false + installer-parallel: true + + - name: Bump version + run: poetry version premajor + + - name: Get version + id: get-version + run: echo "version=$(poetry version -s)" >> "$GITHUB_OUTPUT" + - name: Print version + run: | + echo Version: ${{ steps.get-version.outputs.version }} + - name: Update CHANGELOG.md (cross platform supported) + run: | + sed -i.bak -e 's/## Next/## Next\n\n## ${{ steps.get-version.outputs.version }}/' CHANGELOG.md && rm CHANGELOG.md.bak + - uses: EndBug/add-and-commit@v9 + with: + author_name: 'Neo4j-GraphRAG GitHub Action' + author_email: 'team-gen-ai@neo4j.com' + message: 'Bump version to ${{ steps.get-version.outputs.version }}' + add: "['pyproject.toml', 'CHANGELOG.md']" + tag: '${{ steps.get-version.outputs.version }}' diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index bb01c019..82387a96 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -12,7 +12,7 @@ jobs: -
name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" - name: Install pypa/build run: >- python3 -m diff --git a/.github/workflows/scheduled-e2e-tests.yaml b/.github/workflows/scheduled-e2e-tests.yaml index ff1ea0bc..59ddaa42 100644 --- a/.github/workflows/scheduled-e2e-tests.yaml +++ b/.github/workflows/scheduled-e2e-tests.yaml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] neo4j-version: - 5 neo4j-edition: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b33b254..cc256216 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,14 @@ ## Next +- Added `SinglePropertyExactMatchResolver` component allowing to merge entities with exact same property (e.g. name) + +## 0.7.0 + ### Added - Added AzureOpenAILLM and AzureOpenAIEmbeddings to support Azure served OpenAI models - Added `template` validation in `PromptTemplate` class upon construction. -- `custom_prompt` arg is now converted to `Text2CypherTemplate` class within the `Text2CypherRetriever.get_search_results` method. -- `Text2CypherTemplate` and `RAGTemplate` prompt templates now require `query_text` arg and will error if it is not present. Previous `query_text` aliases may be used, but will warn of deprecation. - Examples demonstrating the use of Mistral embeddings and LLM in RAG pipelines. -- Fixed bug in `Text2CypherRetriever` using `custom_prompt` arg where the `search` method would not inject the `query_text` content. - Added feature to include kwargs in `Text2CypherRetriever.search()` that will be injected into a custom prompt, if provided. - Added validation to `custom_prompt` parameter of `Text2CypherRetriever` to ensure that `query_text` placeholder exists in prompt. - Introduced a fixed size text splitter component for splitting text into specified fixed size chunks with overlap. 
Updated examples and tests to utilize this new component. @@ -20,14 +21,19 @@ ### Fixed - Resolved import issue with the Vertex AI Embeddings class. +- Fixed bug in `Text2CypherRetriever` using `custom_prompt` arg where the `search` method would not inject the `query_text` content. +- `custom_prompt` arg is now converted to `Text2CypherTemplate` class within the `Text2CypherRetriever.get_search_results` method. +- `Text2CypherTemplate` and `RAGTemplate` prompt templates now require `query_text` arg and will error if it is not present. Previous `query_text` aliases may be used, but will warn of deprecation. - Resolved issue where Neo4jWriter component would raise an error if the start or end node ID was not defined properly in the input. - Resolved issue where relationship types was not escaped in the insert Cypher query. -- Improved query performance in Neo4jWriter. +- Improved query performance in Neo4jWriter: created nodes now have a generic `__KGBuilder__` label and an index is created on the `__KGBuilder__.id` property. Moreover, insertion queries are now batched. Batch size can be controlled using the `batch_size` parameter in the `Neo4jWriter` component. ### Changed - Moved the Embedder class to the neo4j_graphrag.embeddings directory for better organization alongside other custom embedders. - Removed query argument from the GraphRAG class' `.search` method; users must now use `query_text`. - Neo4jWriter component now runs a single query to merge node and set its embeddings if any. +- Nodes created by the `Neo4jWriter` now have an extra `__KGBuilder__` label. Nodes from the entity graph also have an `__Entity__` label. +- Dropped support for Python 3.8 (end of life). 
## 0.6.3 ### Changed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 42ad4749..c98c4891 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -54,7 +54,7 @@ Remember that many community members have become regular contributors and some a ## Specifically for this project Setting up the development environment: -1. Install Python 3.8.1+ +1. Install Python 3.9+ 2. Install poetry (see https://python-poetry.org/docs/#installation) 3. Install dependencies: diff --git a/README.md b/README.md index d32b1b60..6d728eef 100644 --- a/README.md +++ b/README.md @@ -14,13 +14,12 @@ Python versions supported: * Python 3.11 supported. * Python 3.10 supported. * Python 3.9 supported. -* Python 3.8 supported. # Usage ## Installation -This package requires Python (>=3.8.1). +This package requires Python (>=3.9). To install the latest stable version, use: @@ -37,6 +36,48 @@ Follow installation instructions [here](https://pygraphviz.github.io/documentati ## Examples +### Knowledge graph construction + +```python +from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline +from neo4j_graphrag.llm.openai_llm import OpenAILLM + +# Instantiate Entity and Relation objects +entities = ["PERSON", "ORGANIZATION", "LOCATION"] +relations = ["SITUATED_AT", "INTERACTS", "LED_BY"] +potential_schema = [ + ("PERSON", "SITUATED_AT", "LOCATION"), + ("PERSON", "INTERACTS", "PERSON"), + ("ORGANIZATION", "LED_BY", "PERSON"), +] + +# Instantiate the LLM +llm = OpenAILLM( + model_name="gpt-4o", + model_params={ + "max_tokens": 2000, + "response_format": {"type": "json_object"}, + }, +) + +# Create an instance of the SimpleKGPipeline +kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + entities=entities, + relations=relations, + potential_schema=potential_schema, + from_pdf=False, +) + +await kg_builder.run_async(text=""" + Albert Einstein was a German physicist born in 1879 who wrote many groundbreaking + papers especially about general relativity and quantum mechanics.
+""") +``` + + + ### Creating a vector index When creating a vector index, make sure you match the number of dimensions in the index with the number of dimensions the embeddings have. diff --git a/docs/source/api.rst b/docs/source/api.rst index 0eafde26..6cde0c35 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -71,6 +71,14 @@ LLMEntityRelationExtractor :members: run +SinglePropertyExactMatchResolver +================================ + +.. autoclass:: neo4j_graphrag.experimental.components.resolver.SinglePropertyExactMatchResolver + :members: run + + + .. _pipeline-section: ******** diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b9e51b3..bbc31633 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -27,7 +27,6 @@ Python versions supported: * Python 3.11 * Python 3.10 * Python 3.9 -* Python 3.8 ****** @@ -60,7 +59,7 @@ Usage Installation ************ -This package requires Python (>=3.8.1). +This package requires Python (>=3.9). To install the latest stable version, use: @@ -302,4 +301,4 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` -* :ref:`search` \ No newline at end of file +* :ref:`search` diff --git a/docs/source/user_guide_kg_builder.rst b/docs/source/user_guide_kg_builder.rst index ce9a61fa..2b91eaff 100644 --- a/docs/source/user_guide_kg_builder.rst +++ b/docs/source/user_guide_kg_builder.rst @@ -11,8 +11,6 @@ unstructured data. This feature is still experimental. API changes and bug fixes are expected. - It is not recommended to use it in production yet. - ****************** Pipeline Structure @@ -26,6 +24,7 @@ A Knowledge Graph (KG) construction pipeline requires a few components: - **Schema builder**: provide a schema to ground the LLM extracted entities and relations and obtain an easily navigable KG. - **Entity and relation extractor**: extract relevant entities and relations from the text. - **Knowledge Graph writer**: save the identified entities and relations. 
+- **Entity resolver**: merge similar entities into a single node. .. image:: images/kg_builder_pipeline.png :alt: KG Builder pipeline @@ -389,7 +388,13 @@ to a Neo4j database: graph = Neo4jGraph(nodes=[], relationships=[]) await writer.run(graph) -See :ref:`neo4jgraph` for the description of the input type. +To improve insert performance, it is possible to act on two parameters: + +- `batch_size`: the number of nodes/relationships to be processed in each batch (default is 1000). +- `max_concurrency`: the max number of concurrent queries (default is 5). + +See :ref:`neo4jgraph` for the description of the input type. + It is possible to create a custom writer using the `KGWriter` interface: @@ -419,4 +424,46 @@ It is possible to create a custom writer using the `KGWriter` interface: The `validate_call` decorator is required when the input parameter contain a `pydantic` model. -See :ref:`kgwritermodel` and :ref:`kgwriter` in API reference. \ No newline at end of file +See :ref:`kgwritermodel` and :ref:`kgwriter` in API reference. + + +Entity Resolver +=============== + +The KG Writer component creates new nodes for each identified entity +without making assumptions about entity similarity. The Entity Resolver +is responsible for refining the created knowledge graph by merging entity +nodes that represent the same real-world object. + +In practice, this package implements a single resolver that merges nodes +with the same label and identical "name" property. + +.. warning:: + + The `SinglePropertyExactMatchResolver` **replaces** the nodes created by the KG writer. + + +It can be used like this: + +.. code:: python + + from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, + ) + resolver = SinglePropertyExactMatchResolver(driver) + res = await resolver.run() + +.. warning:: + + By default, all nodes with the `__Entity__` label will be resolved. + To exclude specific nodes, a `filter_query` can be added to the query.
+ For example, if a `:Resolved` label has been applied to already resolved entities + in the graph, these entities can be excluded with the following approach: + + .. code:: python + + from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, + ) + resolver = SinglePropertyExactMatchResolver(driver, filter_query="WHERE not entity:Resolved") + res = await resolver.run() diff --git a/examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf b/examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf new file mode 100644 index 00000000..8fa942c3 Binary files /dev/null and b/examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf differ diff --git a/examples/pipeline/kg_builder_example.py b/examples/pipeline/kg_builder_example.py new file mode 100644 index 00000000..1a5281de --- /dev/null +++ b/examples/pipeline/kg_builder_example.py @@ -0,0 +1,86 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import asyncio +import logging + +import neo4j +from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline +from neo4j_graphrag.llm.openai_llm import OpenAILLM + +logging.basicConfig(level=logging.INFO) + + +async def main(neo4j_driver: neo4j.Driver) -> None: + # Instantiate Entity and Relation objects + entities = ["PERSON", "ORGANIZATION", "HORCRUX", "LOCATION"] + relations = ["SITUATED_AT", "INTERACTS", "OWNS", "LED_BY"] + potential_schema = [ + ("PERSON", "SITUATED_AT", "LOCATION"), + ("PERSON", "INTERACTS", "PERSON"), + ("PERSON", "OWNS", "HORCRUX"), + ("ORGANIZATION", "LED_BY", "PERSON"), + ] + + # Instantiate the LLM + llm = OpenAILLM( + model_name="gpt-4o", + model_params={ + "max_tokens": 2000, + "response_format": {"type": "json_object"}, + }, + ) + + # Create an instance of the SimpleKGPipeline + kg_builder_pdf = SimpleKGPipeline( + llm=llm, + driver=neo4j_driver, + entities=entities, + relations=relations, + potential_schema=potential_schema, + from_pdf=True, + on_error="RAISE", + ) + + # Run the knowledge graph building process asynchronously + pdf_file_path = "examples/pipeline/Harry Potter and the Death Hallows Summary.pdf" + pdf_result = await kg_builder_pdf.run_async(file_path=pdf_file_path) + print(f"PDF Processing Result: {pdf_result}") + + # Create an instance of the SimpleKGPipeline for text input + kg_builder_text = SimpleKGPipeline( + llm=llm, + driver=neo4j_driver, + entities=entities, + relations=relations, + potential_schema=potential_schema, + from_pdf=False, + on_error="RAISE", + ) + + # Run the knowledge graph building process with text input + text_input = "John Doe lives in New York City." 
+ text_result = await kg_builder_text.run_async(text=text_input) + print(f"Text Processing Result: {text_result}") + + await llm.async_client.close() + + +if __name__ == "__main__": + with neo4j.GraphDatabase.driver( + "bolt://localhost:7687", auth=("neo4j", "password") + ) as driver: + asyncio.run(main(driver)) diff --git a/examples/pipeline/kg_builder_from_pdf.py b/examples/pipeline/kg_builder_from_pdf.py index 2d587383..a841b92b 100644 --- a/examples/pipeline/kg_builder_from_pdf.py +++ b/examples/pipeline/kg_builder_from_pdf.py @@ -33,12 +33,14 @@ FixedSizeSplitter, ) from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult -from neo4j_graphrag.llm import OpenAILLM +from neo4j_graphrag.llm import LLMInterface, OpenAILLM logging.basicConfig(level=logging.INFO) -async def main(neo4j_driver: neo4j.Driver) -> PipelineResult: +async def define_and_run_pipeline( + neo4j_driver: neo4j.AsyncDriver, llm: LLMInterface +) -> PipelineResult: from neo4j_graphrag.experimental.pipeline import Pipeline # Instantiate Entity and Relation objects @@ -86,13 +88,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult: pipe.add_component(SchemaBuilder(), "schema") pipe.add_component( LLMEntityRelationExtractor( - llm=OpenAILLM( - model_name="gpt-4o", - model_params={ - "max_tokens": 2000, - "response_format": {"type": "json_object"}, - }, - ), + llm=llm, on_error=OnError.RAISE, ), "extractor", @@ -127,8 +123,23 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult: return await pipe.run(pipe_inputs) -if __name__ == "__main__": - with neo4j.GraphDatabase.driver( +async def main() -> PipelineResult: + llm = OpenAILLM( + model_name="gpt-4o", + model_params={ + "max_tokens": 2000, + "response_format": {"type": "json_object"}, + }, + ) + driver = neo4j.AsyncGraphDatabase.driver( "bolt://localhost:7687", auth=("neo4j", "password") - ) as driver: - print(asyncio.run(main(driver))) + ) + res = await define_and_run_pipeline(driver, llm) + await 
driver.close() + await llm.async_client.close() + return res + + +if __name__ == "__main__": + res = asyncio.run(main()) + print(res) diff --git a/examples/pipeline/kg_builder_from_text.py b/examples/pipeline/kg_builder_from_text.py index 15817e18..193de0d0 100644 --- a/examples/pipeline/kg_builder_from_text.py +++ b/examples/pipeline/kg_builder_from_text.py @@ -15,7 +15,6 @@ from __future__ import annotations import asyncio -import logging.config import neo4j from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings @@ -36,30 +35,12 @@ ) from neo4j_graphrag.experimental.pipeline import Pipeline from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult -from neo4j_graphrag.llm import OpenAILLM - -# set log level to DEBUG for all neo4j_graphrag.* loggers -logging.config.dictConfig( - { - "version": 1, - "handlers": { - "console": { - "class": "logging.StreamHandler", - } - }, - "loggers": { - "root": { - "handlers": ["console"], - }, - "neo4j_graphrag": { - "level": "DEBUG", - }, - }, - } -) +from neo4j_graphrag.llm import LLMInterface, OpenAILLM -async def main(neo4j_driver: neo4j.Driver) -> PipelineResult: +async def define_and_run_pipeline( + neo4j_driver: neo4j.AsyncDriver, llm: LLMInterface +) -> PipelineResult: """This is where we define and run the KG builder pipeline, instantiating a few components: - Text Splitter: in this example we use the fixed size text splitter @@ -83,13 +64,7 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult: pipe.add_component(SchemaBuilder(), "schema") pipe.add_component( LLMEntityRelationExtractor( - llm=OpenAILLM( - model_name="gpt-4o", - model_params={ - "max_tokens": 1000, - "response_format": {"type": "json_object"}, - }, - ), + llm=llm, on_error=OnError.RAISE, ), "extractor", @@ -164,8 +139,23 @@ async def main(neo4j_driver: neo4j.Driver) -> PipelineResult: return await pipe.run(pipe_inputs) -if __name__ == "__main__": - with neo4j.GraphDatabase.driver( +async def main() -> PipelineResult: + 
llm = OpenAILLM( + model_name="gpt-4o", + model_params={ + "max_tokens": 1000, + "response_format": {"type": "json_object"}, + }, + ) + driver = neo4j.AsyncGraphDatabase.driver( "bolt://localhost:7687", auth=("neo4j", "password") - ) as driver: - print(asyncio.run(main(driver))) + ) + res = await define_and_run_pipeline(driver, llm) + await driver.close() + await llm.async_client.close() + return res + + +if __name__ == "__main__": + res = asyncio.run(main()) + print(res) diff --git a/examples/pipeline/kg_builder_two_documents_entity_resolution.py b/examples/pipeline/kg_builder_two_documents_entity_resolution.py new file mode 100644 index 00000000..eb68b022 --- /dev/null +++ b/examples/pipeline/kg_builder_two_documents_entity_resolution.py @@ -0,0 +1,156 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import asyncio + +import neo4j +from neo4j_graphrag.experimental.components.entity_relation_extractor import ( + LLMEntityRelationExtractor, + OnError, +) +from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter +from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader +from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, +) +from neo4j_graphrag.experimental.components.schema import ( + SchemaBuilder, + SchemaEntity, + SchemaProperty, + SchemaRelation, +) +from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import ( + FixedSizeSplitter, +) +from neo4j_graphrag.experimental.pipeline import Pipeline +from neo4j_graphrag.llm import LLMInterface, OpenAILLM + + +async def define_and_run_pipeline( + neo4j_driver: neo4j.AsyncDriver, llm: LLMInterface +) -> None: + """This is where we define and run the KG builder pipeline, instantiating a few + components: + - Text Splitter: in this example we use the fixed size text splitter + - Schema Builder: this component takes a list of entities, relationships and + possible triplets as inputs, validates them and returns a schema ready to use + for the rest of the pipeline + - LLM Entity Relation Extractor is an LLM-based entity and relation extractor: + based on the provided schema, the LLM will do its best to identify these + entities and their relations within the provided text + - KG writer: once entities and relations are extracted, they can be written + to a Neo4j database; a resolver component then runs after the writer + """ + pipe = Pipeline() + # define the components + pipe.add_component(PdfLoader(), "loader") + pipe.add_component( + FixedSizeSplitter(), + "splitter", + ) + pipe.add_component(SchemaBuilder(), "schema") + pipe.add_component( + LLMEntityRelationExtractor( + llm=llm, + on_error=OnError.IGNORE, + ), + "extractor", + ) + pipe.add_component(Neo4jWriter(neo4j_driver), "writer") +
pipe.add_component(SinglePropertyExactMatchResolver(neo4j_driver), "resolver") + # define the execution order of components + # and how the output of previous components must be used + pipe.connect("loader", "splitter", {"text": "loader.text"}) + pipe.connect("splitter", "extractor", input_config={"chunks": "splitter"}) + pipe.connect( + "schema", + "extractor", + input_config={"schema": "schema", "document_info": "loader.document_info"}, + ) + pipe.connect( + "extractor", + "writer", + input_config={"graph": "extractor"}, + ) + pipe.connect("writer", "resolver", {}) + # user input: + # the file path of the document to load (set in the loop below) + # and the list of entities and relations we are looking for + pipe_inputs = { + "loader": {}, + "schema": { + "entities": [ + SchemaEntity( + label="Person", + properties=[ + SchemaProperty(name="name", type="STRING"), + SchemaProperty(name="place_of_birth", type="STRING"), + SchemaProperty(name="date_of_birth", type="DATE"), + ], + ), + SchemaEntity( + label="Organization", + properties=[ + SchemaProperty(name="name", type="STRING"), + SchemaProperty(name="country", type="STRING"), + ], + ), + ], + "relations": [ + SchemaRelation( + label="WORKED_FOR", + ), + SchemaRelation( + label="FRIEND", + ), + SchemaRelation( + label="ENEMY", + ), + ], + "potential_schema": [ + ("Person", "WORKED_FOR", "Organization"), + ("Person", "FRIEND", "Person"), + ("Person", "ENEMY", "Person"), + ], + }, + } + # run the pipeline for each document + for document in [ + "examples/pipeline/Harry Potter and the Chamber of Secrets Summary.pdf", + "examples/pipeline/Harry Potter and the Death Hallows Summary.pdf", + ]: + pipe_inputs["loader"]["filepath"] = document + await pipe.run(pipe_inputs) + + +async def main() -> None: + llm = OpenAILLM( + model_name="gpt-4o", + model_params={ + "max_tokens": 1000, + "response_format": {"type": "json_object"}, + }, + ) + driver = neo4j.AsyncGraphDatabase.driver( + "bolt://localhost:7687", auth=("neo4j", "password") + ) + await
define_and_run_pipeline(driver, llm) + await driver.close() + await llm.async_client.close() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/poetry.lock b/poetry.lock index e5dbe5dc..e2e030c8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2,13 +2,13 @@ [[package]] name = "aiohappyeyeballs" -version = "2.4.2" +version = "2.4.3" description = "Happy Eyeballs for asyncio" optional = false python-versions = ">=3.8" files = [ - {file = "aiohappyeyeballs-2.4.2-py3-none-any.whl", hash = "sha256:8522691d9a154ba1145b157d6d5c15e5c692527ce6a53c5e5f9876977f6dab2f"}, - {file = "aiohappyeyeballs-2.4.2.tar.gz", hash = "sha256:4ca893e6c5c1f5bf3888b04cb5a3bee24995398efef6e0b9f747b5e89d84fd74"}, + {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, + {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, ] [[package]] @@ -159,9 +159,6 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} - [[package]] name = "anthropic" version = "0.34.2" @@ -189,13 +186,13 @@ vertex = ["google-auth (>=2,<3)"] [[package]] name = "anyio" -version = "4.5.0" +version = "4.6.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "anyio-4.5.0-py3-none-any.whl", hash = "sha256:fdeb095b7cc5a5563175eedd926ec4ae55413bb4be5770c424af0ba46ccb4a78"}, - {file = "anyio-4.5.0.tar.gz", hash = "sha256:c5a275fe5ca0afd788001f58fca1e69e29ce706d746e317d660e21f70c530ef9"}, + {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"}, + {file = "anyio-4.6.0.tar.gz", hash = 
"sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"}, ] [package.dependencies] @@ -211,13 +208,13 @@ trio = ["trio (>=0.26.1)"] [[package]] name = "astroid" -version = "3.2.4" +version = "3.3.4" description = "An abstract syntax tree for Python with inference support." optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.9.0" files = [ - {file = "astroid-3.2.4-py3-none-any.whl", hash = "sha256:413658a61eeca6202a59231abb473f932038fbcbf1666587f66d482083413a25"}, - {file = "astroid-3.2.4.tar.gz", hash = "sha256:0e14202810b30da1b735827f78f5157be2bbd4a7a59b7707ca0bfc2fb4c0063a"}, + {file = "astroid-3.3.4-py3-none-any.whl", hash = "sha256:5eba185467253501b62a9f113c263524b4f5d55e1b30456370eed4cdbd6438fd"}, + {file = "astroid-3.3.4.tar.gz", hash = "sha256:e73d0b62dd680a7c07cb2cd0ce3c22570b044dd01bd994bc3a2dd16c6cbba162"}, ] [package.dependencies] @@ -304,17 +301,17 @@ lxml = ["lxml"] [[package]] name = "boto3" -version = "1.35.29" +version = "1.35.30" description = "The AWS SDK for Python" optional = false python-versions = ">=3.8" files = [ - {file = "boto3-1.35.29-py3-none-any.whl", hash = "sha256:2244044cdfa8ac345d7400536dc15a4824835e7ec5c55bc267e118af66bb27db"}, - {file = "boto3-1.35.29.tar.gz", hash = "sha256:7bbb1ee649e09e956952285782cfdebd7e81fc78384f48dfab3d66c6eaf3f63f"}, + {file = "boto3-1.35.30-py3-none-any.whl", hash = "sha256:d89c3459db89c5408e83219ab849ffd0146bc4285e75cdc67c6e45d390a12df2"}, + {file = "boto3-1.35.30.tar.gz", hash = "sha256:d2851aec8e9dc6937977acbe9a5124ecc31b3ad5f50a10cd9ae52636da3f52fa"}, ] [package.dependencies] -botocore = ">=1.35.29,<1.36.0" +botocore = ">=1.35.30,<1.36.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.10.0,<0.11.0" @@ -323,13 +320,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.35.29" +version = "1.35.30" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">=3.8" files = [ - {file = "botocore-1.35.29-py3-none-any.whl", hash = "sha256:f8e3ae0d84214eff3fb69cb4dc51cea6c43d3bde82027a94d00c52b941d6c3d5"}, - {file = "botocore-1.35.29.tar.gz", hash = "sha256:4ed28ab03675bb008a290c452c5ddd7aaa5d4e3fa1912aadbdf93057ee84362b"}, + {file = "botocore-1.35.30-py3-none-any.whl", hash = "sha256:3bb9f9dde001608671ea74681ac3cec06bbbb10cba8cb8c1387a25e843075ce0"}, + {file = "botocore-1.35.30.tar.gz", hash = "sha256:ab5350e8a50e48d371fa2d517d65c29a40c43788cb9a15387f93eac5a23df0fd"}, ] [package.dependencies] @@ -1257,79 +1254,38 @@ tracing = ["opentelemetry-api (>=1.1.0)"] [[package]] name = "google-crc32c" -version = "1.5.0" +version = "1.6.0" description = "A python wrapper of the C library 'Google CRC32C'" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, - {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13"}, - {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346"}, - {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65"}, - {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b"}, - {file = "google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02"}, - {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4"}, - {file = 
"google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e"}, - {file = "google_crc32c-1.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c"}, - {file = "google_crc32c-1.5.0-cp310-cp310-win32.whl", hash = "sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee"}, - {file = "google_crc32c-1.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289"}, - {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273"}, - {file = "google_crc32c-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298"}, - {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57"}, - {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438"}, - {file = "google_crc32c-1.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906"}, - {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183"}, - {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd"}, - {file = "google_crc32c-1.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c"}, - {file = "google_crc32c-1.5.0-cp311-cp311-win32.whl", hash = "sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709"}, 
- {file = "google_crc32c-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-win32.whl", hash = "sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94"}, - {file = "google_crc32c-1.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740"}, - {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8"}, - {file = "google_crc32c-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a"}, - {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946"}, - {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a"}, - {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d"}, - {file = "google_crc32c-1.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a"}, - {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37"}, - {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894"}, - {file = "google_crc32c-1.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a"}, - {file = "google_crc32c-1.5.0-cp38-cp38-win32.whl", hash = "sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4"}, - {file = "google_crc32c-1.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c"}, - {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7"}, - {file = "google_crc32c-1.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d"}, - {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100"}, - {file = 
"google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9"}, - {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57"}, - {file = "google_crc32c-1.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210"}, - {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd"}, - {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96"}, - {file = "google_crc32c-1.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61"}, - {file = "google_crc32c-1.5.0-cp39-cp39-win32.whl", hash = "sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c"}, - {file = "google_crc32c-1.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541"}, - {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325"}, - {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd"}, - {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091"}, - {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178"}, - {file = "google_crc32c-1.5.0-pp37-pypy37_pp73-win_amd64.whl", hash = 
"sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2"}, - {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d"}, - {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2"}, - {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5"}, - {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462"}, - {file = "google_crc32c-1.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314"}, - {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728"}, - {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88"}, - {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb"}, - {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31"}, - {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93"}, + {file = "google_crc32c-1.6.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa"}, + {file = "google_crc32c-1.6.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = 
"sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9"}, + {file = "google_crc32c-1.6.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a184243544811e4a50d345838a883733461e67578959ac59964e43cca2c791e7"}, + {file = "google_crc32c-1.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:236c87a46cdf06384f614e9092b82c05f81bd34b80248021f729396a78e55d7e"}, + {file = "google_crc32c-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebab974b1687509e5c973b5c4b8b146683e101e102e17a86bd196ecaa4d099fc"}, + {file = "google_crc32c-1.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:50cf2a96da226dcbff8671233ecf37bf6e95de98b2a2ebadbfdf455e6d05df42"}, + {file = "google_crc32c-1.6.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:f7a1fc29803712f80879b0806cb83ab24ce62fc8daf0569f2204a0cfd7f68ed4"}, + {file = "google_crc32c-1.6.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:40b05ab32a5067525670880eb5d169529089a26fe35dce8891127aeddc1950e8"}, + {file = "google_crc32c-1.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e4b426c3702f3cd23b933436487eb34e01e00327fac20c9aebb68ccf34117d"}, + {file = "google_crc32c-1.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51c4f54dd8c6dfeb58d1df5e4f7f97df8abf17a36626a217f169893d1d7f3e9f"}, + {file = "google_crc32c-1.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb8b3c75bd157010459b15222c3fd30577042a7060e29d42dabce449c087f2b3"}, + {file = "google_crc32c-1.6.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d"}, + {file = "google_crc32c-1.6.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b"}, + {file = "google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00"}, + {file = "google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3"}, + {file = "google_crc32c-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760"}, + {file = "google_crc32c-1.6.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e2806553238cd076f0a55bddab37a532b53580e699ed8e5606d0de1f856b5205"}, + {file = "google_crc32c-1.6.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb0966e1c50d0ef5bc743312cc730b533491d60585a9a08f897274e57c3f70e0"}, + {file = "google_crc32c-1.6.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:386122eeaaa76951a8196310432c5b0ef3b53590ef4c317ec7588ec554fec5d2"}, + {file = "google_crc32c-1.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2952396dc604544ea7476b33fe87faedc24d666fb0c2d5ac971a2b9576ab871"}, + {file = "google_crc32c-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35834855408429cecf495cac67ccbab802de269e948e27478b1e47dfb6465e57"}, + {file = "google_crc32c-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8797406499f28b5ef791f339594b0b5fdedf54e203b5066675c406ba69d705c"}, + {file = "google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48abd62ca76a2cbe034542ed1b6aee851b6f28aaca4e6551b5599b6f3ef175cc"}, + {file = "google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d"}, + {file = "google_crc32c-1.6.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05e2d8c9a2f853ff116db9706b4a27350587f341eda835f46db3c0a8c8ce2f24"}, + {file = "google_crc32c-1.6.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:91ca8145b060679ec9176e6de4f89b07363d6805bd4760631ef254905503598d"}, + {file = "google_crc32c-1.6.0.tar.gz", hash = "sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc"}, ] [package.extras] @@ -2735,21 +2691,21 @@ files = [ [[package]] name = "networkx" -version = "3.1" +version = "3.2.1" description = "Python package for creating and manipulating graphs and networks" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, - {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, + {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, + {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, ] [package.extras] -default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] -developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] -doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] -test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] +default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "nltk" @@ -2787,43 +2743,6 
@@ files = [ {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, ] -[[package]] -name = "numpy" -version = "1.24.4" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, - {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, - {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, - {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, - {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, - {file = 
"numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, - {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, - {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, - {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, - {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, - {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, -] - [[package]] name = "numpy" version = "1.26.4" @@ -3116,70 +3035,89 @@ files = [ [[package]] name = "pandas" -version = "2.0.3" +version = "2.2.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, - {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, - {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, - {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, - {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, - {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, - {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, - {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, - {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, - {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, - {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, + {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, + {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, + {file = "pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed"}, + {file = "pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57"}, + {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42"}, + {file = "pandas-2.2.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f"}, + {file = "pandas-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645"}, + {file = "pandas-2.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039"}, + {file = "pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd"}, + {file = 
"pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698"}, + {file = "pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc"}, + {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3"}, + {file = "pandas-2.2.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32"}, + {file = "pandas-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5"}, + {file = "pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9"}, + {file = "pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4"}, + {file = "pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3"}, + {file = "pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319"}, + {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8"}, + {file = "pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a"}, + {file = "pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13"}, + {file = "pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015"}, + {file = 
"pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28"}, + {file = "pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0"}, + {file = "pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24"}, + {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659"}, + {file = "pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb"}, + {file = "pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d"}, + {file = "pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468"}, + {file = "pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18"}, + {file = "pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2"}, + {file = "pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4"}, + {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d"}, + {file = "pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a"}, + {file = "pandas-2.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39"}, + {file = 
"pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30"}, + {file = "pandas-2.2.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c"}, + {file = "pandas-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c"}, + {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea"}, + {file = "pandas-2.2.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761"}, + {file = "pandas-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e"}, + {file = "pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667"}, ] [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" -tzdata = ">=2022.1" +tzdata = ">=2022.7" [package.extras] -all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 
(>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] -aws = ["s3fs (>=2021.08.0)"] -clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] -compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] -computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2021.07.0)"] -gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] -hdf5 = ["tables (>=3.6.1)"] -html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] -mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] -spss = ["pyreadstat (>=1.1.2)"] -sql-other = ["SQLAlchemy (>=1.4.16)"] -test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.6.3)"] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 
(>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] [[package]] name = "parameterized" @@ -3609,17 +3547,17 @@ files = [ [[package]] name = "pylint" -version = "3.2.7" +version = "3.3.1" description = "python code static checker" optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.9.0" files = [ - {file = "pylint-3.2.7-py3-none-any.whl", hash = 
"sha256:02f4aedeac91be69fb3b4bea997ce580a4ac68ce58b89eaefeaf06749df73f4b"}, - {file = "pylint-3.2.7.tar.gz", hash = "sha256:1b7a721b575eaeaa7d39db076b6e7743c993ea44f57979127c517c6c572c803e"}, + {file = "pylint-3.3.1-py3-none-any.whl", hash = "sha256:2f846a466dd023513240bc140ad2dd73bfc080a5d85a710afdb728c420a5a2b9"}, + {file = "pylint-3.3.1.tar.gz", hash = "sha256:9f3dcc87b1203e612b78d91a896407787e708b3f189b5fa0b307712d49ff0c6e"}, ] [package.dependencies] -astroid = ">=3.2.4,<=3.3.0-dev0" +astroid = ">=3.3.4,<=3.4.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ {version = ">=0.2", markers = "python_version < \"3.11\""}, @@ -4136,88 +4074,90 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] [[package]] name = "scikit-learn" -version = "1.3.2" +version = "1.5.2" description = "A set of python modules for machine learning and data mining" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "scikit-learn-1.3.2.tar.gz", hash = "sha256:a2f54c76accc15a34bfb9066e6c7a56c1e7235dda5762b990792330b52ccfb05"}, - {file = "scikit_learn-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e326c0eb5cf4d6ba40f93776a20e9a7a69524c4db0757e7ce24ba222471ee8a1"}, - {file = "scikit_learn-1.3.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:535805c2a01ccb40ca4ab7d081d771aea67e535153e35a1fd99418fcedd1648a"}, - {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1215e5e58e9880b554b01187b8c9390bf4dc4692eedeaf542d3273f4785e342c"}, - {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ee107923a623b9f517754ea2f69ea3b62fc898a3641766cb7deb2f2ce450161"}, - {file = "scikit_learn-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:35a22e8015048c628ad099da9df5ab3004cdbf81edc75b396fd0cff8699ac58c"}, - {file = "scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:6fb6bc98f234fda43163ddbe36df8bcde1d13ee176c6dc9b92bb7d3fc842eb66"}, - {file = "scikit_learn-1.3.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:18424efee518a1cde7b0b53a422cde2f6625197de6af36da0b57ec502f126157"}, - {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3271552a5eb16f208a6f7f617b8cc6d1f137b52c8a1ef8edf547db0259b2c9fb"}, - {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4144a5004a676d5022b798d9e573b05139e77f271253a4703eed295bde0433"}, - {file = "scikit_learn-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:67f37d708f042a9b8d59551cf94d30431e01374e00dc2645fa186059c6c5d78b"}, - {file = "scikit_learn-1.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8db94cd8a2e038b37a80a04df8783e09caac77cbe052146432e67800e430c028"}, - {file = "scikit_learn-1.3.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:61a6efd384258789aa89415a410dcdb39a50e19d3d8410bd29be365bcdd512d5"}, - {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb06f8dce3f5ddc5dee1715a9b9f19f20d295bed8e3cd4fa51e1d050347de525"}, - {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b2de18d86f630d68fe1f87af690d451388bb186480afc719e5f770590c2ef6c"}, - {file = "scikit_learn-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:0402638c9a7c219ee52c94cbebc8fcb5eb9fe9c773717965c1f4185588ad3107"}, - {file = "scikit_learn-1.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a19f90f95ba93c1a7f7924906d0576a84da7f3b2282ac3bfb7a08a32801add93"}, - {file = "scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b8692e395a03a60cd927125eef3a8e3424d86dde9b2370d544f0ea35f78a8073"}, - {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e1e94cc23d04d39da797ee34236ce2375ddea158b10bee3c343647d615581d"}, - {file = 
"scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:785a2213086b7b1abf037aeadbbd6d67159feb3e30263434139c98425e3dcfcf"}, - {file = "scikit_learn-1.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:64381066f8aa63c2710e6b56edc9f0894cc7bf59bd71b8ce5613a4559b6145e0"}, - {file = "scikit_learn-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c43290337f7a4b969d207e620658372ba3c1ffb611f8bc2b6f031dc5c6d1d03"}, - {file = "scikit_learn-1.3.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dc9002fc200bed597d5d34e90c752b74df516d592db162f756cc52836b38fe0e"}, - {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d08ada33e955c54355d909b9c06a4789a729977f165b8bae6f225ff0a60ec4a"}, - {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:763f0ae4b79b0ff9cca0bf3716bcc9915bdacff3cebea15ec79652d1cc4fa5c9"}, - {file = "scikit_learn-1.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:ed932ea780517b00dae7431e031faae6b49b20eb6950918eb83bd043237950e0"}, + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:299406827fb9a4f862626d0fe6c122f5f87f8910b86fe5daa4c32dcd742139b6"}, + {file = "scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2d4cad1119c77930b235579ad0dc25e65c917e756fe80cab96aa3b9428bd3fb0"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c412ccc2ad9bf3755915e3908e677b367ebc8d010acbb3f182814524f2e5540"}, + {file = "scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a686885a4b3818d9e62904d91b57fa757fc2bed3e465c8b177be652f4dd37c8"}, + {file = "scikit_learn-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:c15b1ca23d7c5f33cc2cb0a0d6aaacf893792271cddff0edbd6a40e8319bc113"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:03b6158efa3faaf1feea3faa884c840ebd61b6484167c711548fce208ea09445"}, + {file = "scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1ff45e26928d3b4eb767a8f14a9a6efbf1cbff7c05d1fb0f95f211a89fd4f5de"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f763897fe92d0e903aa4847b0aec0e68cadfff77e8a0687cabd946c89d17e675"}, + {file = "scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8b0ccd4a902836493e026c03256e8b206656f91fbcc4fde28c57a5b752561f1"}, + {file = "scikit_learn-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:6c16d84a0d45e4894832b3c4d0bf73050939e21b99b01b6fd59cbb0cf39163b6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f932a02c3f4956dfb981391ab24bda1dbd90fe3d628e4b42caef3e041c67707a"}, + {file = "scikit_learn-1.5.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3b923d119d65b7bd555c73be5423bf06c0105678ce7e1f558cb4b40b0a5502b1"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, + {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, + {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, + {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, + {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, + {file = 
"scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca64b3089a6d9b9363cd3546f8978229dcbb737aceb2c12144ee3f70f95684b7"}, + {file = "scikit_learn-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:3bed4909ba187aca80580fe2ef370d9180dcf18e621a27c4cf2ef10d279a7efe"}, + {file = "scikit_learn-1.5.2.tar.gz", hash = "sha256:b4237ed7b3fdd0a4882792e68ef2545d5baa50aca3bb45aa7df468138ad8f94d"}, ] [package.dependencies] -joblib = ">=1.1.1" -numpy = ">=1.17.3,<2.0" -scipy = ">=1.5.0" -threadpoolctl = ">=2.0.0" +joblib = ">=1.2.0" +numpy = ">=1.19.5" +scipy = ">=1.6.0" +threadpoolctl = ">=3.1.0" [package.extras] -benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] -examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] -tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"] +benchmark = ["matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "pandas (>=1.1.5)"] +build = ["cython (>=3.0.10)", "meson-python (>=0.16.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory_profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pydata-sphinx-theme (>=0.15.3)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=7.3.7)", "sphinx-copybutton (>=0.5.2)", "sphinx-design 
(>=0.5.0)", "sphinx-design (>=0.6.0)", "sphinx-gallery (>=0.16.0)", "sphinx-prompt (>=1.4.0)", "sphinx-remove-toctrees (>=1.0.0.post1)", "sphinxcontrib-sass (>=0.3.4)", "sphinxext-opengraph (>=0.9.1)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +install = ["joblib (>=1.2.0)", "numpy (>=1.19.5)", "scipy (>=1.6.0)", "threadpoolctl (>=3.1.0)"] +maintenance = ["conda-lock (==2.5.6)"] +tests = ["black (>=24.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.9)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.20.30)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.2.1)", "scikit-image (>=0.17.2)"] [[package]] name = "scipy" -version = "1.10.1" +version = "1.13.1" description = "Fundamental algorithms for scientific computing in Python" optional = false -python-versions = "<3.12,>=3.8" -files = [ - {file = "scipy-1.10.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e7354fd7527a4b0377ce55f286805b34e8c54b91be865bac273f527e1b839019"}, - {file = "scipy-1.10.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4b3f429188c66603a1a5c549fb414e4d3bdc2a24792e061ffbd607d3d75fd84e"}, - {file = "scipy-1.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1553b5dcddd64ba9a0d95355e63fe6c3fc303a8fd77c7bc91e77d61363f7433f"}, - {file = "scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c0ff64b06b10e35215abce517252b375e580a6125fd5fdf6421b98efbefb2d2"}, - {file = "scipy-1.10.1-cp310-cp310-win_amd64.whl", hash = "sha256:fae8a7b898c42dffe3f7361c40d5952b6bf32d10c4569098d276b4c547905ee1"}, - {file = "scipy-1.10.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0f1564ea217e82c1bbe75ddf7285ba0709ecd503f048cb1236ae9995f64217bd"}, - {file = "scipy-1.10.1-cp311-cp311-macosx_12_0_arm64.whl", hash = 
"sha256:d925fa1c81b772882aa55bcc10bf88324dadb66ff85d548c71515f6689c6dac5"}, - {file = "scipy-1.10.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaea0a6be54462ec027de54fca511540980d1e9eea68b2d5c1dbfe084797be35"}, - {file = "scipy-1.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15a35c4242ec5f292c3dd364a7c71a61be87a3d4ddcc693372813c0b73c9af1d"}, - {file = "scipy-1.10.1-cp311-cp311-win_amd64.whl", hash = "sha256:43b8e0bcb877faf0abfb613d51026cd5cc78918e9530e375727bf0625c82788f"}, - {file = "scipy-1.10.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5678f88c68ea866ed9ebe3a989091088553ba12c6090244fdae3e467b1139c35"}, - {file = "scipy-1.10.1-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:39becb03541f9e58243f4197584286e339029e8908c46f7221abeea4b749fa88"}, - {file = "scipy-1.10.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bce5869c8d68cf383ce240e44c1d9ae7c06078a9396df68ce88a1230f93a30c1"}, - {file = "scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07c3457ce0b3ad5124f98a86533106b643dd811dd61b548e78cf4c8786652f6f"}, - {file = "scipy-1.10.1-cp38-cp38-win_amd64.whl", hash = "sha256:049a8bbf0ad95277ffba9b3b7d23e5369cc39e66406d60422c8cfef40ccc8415"}, - {file = "scipy-1.10.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:cd9f1027ff30d90618914a64ca9b1a77a431159df0e2a195d8a9e8a04c78abf9"}, - {file = "scipy-1.10.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:79c8e5a6c6ffaf3a2262ef1be1e108a035cf4f05c14df56057b64acc5bebffb6"}, - {file = "scipy-1.10.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51af417a000d2dbe1ec6c372dfe688e041a7084da4fdd350aeb139bd3fb55353"}, - {file = "scipy-1.10.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b4735d6c28aad3cdcf52117e0e91d6b39acd4272f3f5cd9907c24ee931ad601"}, - {file = "scipy-1.10.1-cp39-cp39-win_amd64.whl", hash = 
"sha256:7ff7f37b1bf4417baca958d254e8e2875d0cc23aaadbe65b3d5b3077b0eb23ea"}, - {file = "scipy-1.10.1.tar.gz", hash = "sha256:2cf9dfb80a7b4589ba4c40ce7588986d6d5cebc5457cad2c2880f6bc2d42f3a5"}, +python-versions = ">=3.9" +files = [ + {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, + {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cfa31f1def5c819b19ecc3a8b52d28ffdcc7ed52bb20c9a7589669dd3c250989"}, + {file = "scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26264b282b9da0952a024ae34710c2aff7d27480ee91a2e82b7b7073c24722f"}, + {file = "scipy-1.13.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eccfa1906eacc02de42d70ef4aecea45415f5be17e72b61bafcfd329bdc52e94"}, + {file = "scipy-1.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:2831f0dc9c5ea9edd6e51e6e769b655f08ec6db6e2e10f86ef39bd32eb11da54"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:27e52b09c0d3a1d5b63e1105f24177e544a222b43611aaf5bc44d4a0979e32f9"}, + {file = "scipy-1.13.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:54f430b00f0133e2224c3ba42b805bfd0086fe488835effa33fa291561932326"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e89369d27f9e7b0884ae559a3a956e77c02114cc60a6058b4e5011572eea9299"}, + {file = "scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a78b4b3345f1b6f68a763c6e25c0c9a23a9fd0f39f5f3d200efe8feda560a5fa"}, + {file = "scipy-1.13.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:45484bee6d65633752c490404513b9ef02475b4284c4cfab0ef946def50b3f59"}, + {file = "scipy-1.13.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:5713f62f781eebd8d597eb3f88b8bf9274e79eeabf63afb4a737abc6c84ad37b"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5d72782f39716b2b3509cd7c33cdc08c96f2f4d2b06d51e52fb45a19ca0c86a1"}, + {file = "scipy-1.13.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:017367484ce5498445aade74b1d5ab377acdc65e27095155e448c88497755a5d"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:949ae67db5fa78a86e8fa644b9a6b07252f449dcf74247108c50e1d20d2b4627"}, + {file = "scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de3ade0e53bc1f21358aa74ff4830235d716211d7d077e340c7349bc3542e884"}, + {file = "scipy-1.13.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2ac65fb503dad64218c228e2dc2d0a0193f7904747db43014645ae139c8fad16"}, + {file = "scipy-1.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:cdd7dacfb95fea358916410ec61bbc20440f7860333aee6d882bb8046264e949"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:436bbb42a94a8aeef855d755ce5a465479c721e9d684de76bf61a62e7c2b81d5"}, + {file = "scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:8335549ebbca860c52bf3d02f80784e91a004b71b059e3eea9678ba994796a24"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d533654b7d221a6a97304ab63c41c96473ff04459e404b83275b60aa8f4b7004"}, + {file = "scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637e98dcf185ba7f8e663e122ebf908c4702420477ae52a04f9908707456ba4d"}, + {file = "scipy-1.13.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a014c2b3697bde71724244f63de2476925596c24285c7a637364761f8710891c"}, + {file = "scipy-1.13.1-cp39-cp39-win_amd64.whl", hash = "sha256:392e4ec766654852c25ebad4f64e4e584cf19820b980bc04960bca0b0cd6eaa2"}, + {file = "scipy-1.13.1.tar.gz", hash = "sha256:095a87a0312b08dfd6a6155cbbd310a8c51800fc931b8c0b84003014b874ed3c"}, ] 
[package.dependencies] -numpy = ">=1.19.5,<1.27.0" +numpy = ">=1.22.4,<2.3" [package.extras] -dev = ["click", "doit (>=0.36.0)", "flake8", "mypy", "pycodestyle", "pydevtool", "rich-click", "typing_extensions"] -doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-design (>=0.2.0)"] -test = ["asv", "gmpy2", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] +dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pydevtool", "rich-click", "ruff", "types-psutil", "typing_extensions"] +doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"] +test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] [[package]] name = "scipy" @@ -4933,30 +4873,31 @@ optree = ["optree (>=0.11.0)"] [[package]] name = "tox" -version = "4.20.0" +version = "4.21.0" description = "tox is a generic virtualenv management and test command line tool" optional = false python-versions = ">=3.8" files = [ - {file = "tox-4.20.0-py3-none-any.whl", hash = "sha256:21a8005e3d3fe5658a8e36b8ca3ed13a4230429063c5cc2a2fdac6ee5aa0de34"}, - {file = "tox-4.20.0.tar.gz", hash = "sha256:5b78a49b6eaaeab3ae4186415e7c97d524f762ae967c63562687c3e5f0ec23d5"}, + {file = "tox-4.21.0-py3-none-any.whl", hash = "sha256:693ac51378255d34ad7aab6dd2ce9ab6a1cf1924eb930183fde850ad503b681d"}, + {file = "tox-4.21.0.tar.gz", hash = "sha256:e64dd9847ff3a7ec90368be412d7efe61a39caf043222ffbe9ad638ea435f6f6"}, ] [package.dependencies] cachetools = ">=5.5" chardet = ">=5.2" colorama = ">=0.4.6" -filelock = ">=3.15.4" +filelock = ">=3.16.1" packaging = ">=24.1" -platformdirs = ">=4.2.2" +platformdirs = ">=4.3.6" pluggy = ">=1.5" 
-pyproject-api = ">=1.7.1" +pyproject-api = ">=1.8" tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} -virtualenv = ">=20.26.3" +typing-extensions = {version = ">=4.12.2", markers = "python_version < \"3.11\""} +virtualenv = ">=20.26.6" [package.extras] -docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-argparse-cli (>=1.17)", "sphinx-autodoc-typehints (>=2.4)", "sphinx-copybutton (>=0.5.2)", "sphinx-inline-tabs (>=2023.4.21)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=24.8)"] -testing = ["build[virtualenv] (>=1.2.2)", "covdefaults (>=2.3)", "detect-test-pollution (>=1.2)", "devpi-process (>=1)", "diff-cover (>=9.1.1)", "distlib (>=0.3.8)", "flaky (>=3.8.1)", "hatch-vcs (>=0.4)", "hatchling (>=1.25)", "psutil (>=6)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-xdist (>=3.6.1)", "re-assert (>=1.1)", "setuptools (>=74.1.2)", "time-machine (>=2.15)", "wheel (>=0.44)"] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-argparse-cli (>=1.18.2)", "sphinx-autodoc-typehints (>=2.4.4)", "sphinx-copybutton (>=0.5.2)", "sphinx-inline-tabs (>=2023.4.21)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=24.8)"] +testing = ["build[virtualenv] (>=1.2.2)", "covdefaults (>=2.3)", "detect-test-pollution (>=1.2)", "devpi-process (>=1.0.2)", "diff-cover (>=9.2)", "distlib (>=0.3.8)", "flaky (>=3.8.1)", "hatch-vcs (>=0.4)", "hatchling (>=1.25)", "psutil (>=6)", "pytest (>=8.3.3)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-xdist (>=3.6.1)", "re-assert (>=1.1)", "setuptools (>=75.1)", "time-machine (>=2.15)", "wheel (>=0.44)"] [[package]] name = "tqdm" @@ -5453,5 +5394,5 @@ kg-creation-tools = ["pygraphviz", "pygraphviz"] [metadata] lock-version = "2.0" -python-versions = "^3.8.1" -content-hash = "660e13e30d6c69da33af3f771daf387d5af1c5c29c1c44de3b22d1354aca97b5" +python-versions = "^3.9.0" +content-hash = "ecb73179848945e83c04273f59675703c9874be1db45ed9b810890c37b12dbfb" diff --git a/pyproject.toml 
b/pyproject.toml index b06b30a4..3aacbea8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ [tool.poetry] name = "neo4j-graphrag" -version = "0.6.3" +version = "0.7.0" description = "Python package to allow easy integration to Neo4j's GraphRAG features" authors = ["Neo4j, Inc "] license = "Apache License, Version 2.0" @@ -28,7 +28,7 @@ include = "neo4j_graphrag" from = "src" [tool.poetry.dependencies] -python = "^3.8.1" +python = "^3.9.0" neo4j = "^5.17.0" pydantic = "^2.6.3" weaviate-client = {version = "^4.6.1", optional = true} diff --git a/src/neo4j_graphrag/experimental/components/kg_writer.py b/src/neo4j_graphrag/experimental/components/kg_writer.py index c0424c72..dcae6d91 100644 --- a/src/neo4j_graphrag/experimental/components/kg_writer.py +++ b/src/neo4j_graphrag/experimental/components/kg_writer.py @@ -18,11 +18,15 @@ import inspect import logging from abc import abstractmethod -from typing import Any, Dict, Literal, Optional, Tuple +from typing import Any, Generator, Literal, Optional import neo4j from pydantic import validate_call +from neo4j_graphrag.experimental.components.entity_relation_extractor import ( + CHUNK_NODE_LABEL, + DOCUMENT_NODE_LABEL, +) from neo4j_graphrag.experimental.components.types import ( Neo4jGraph, Neo4jNode, @@ -34,14 +38,25 @@ logger = logging.getLogger(__name__) +def batched(rows: list[Any], batch_size: int) -> Generator[list[Any], None, None]: + index = 0 + for i in range(0, len(rows), batch_size): + start = i + end = min(start + batch_size, len(rows)) + batch = rows[start:end] + yield batch + index += 1 + + class KGWriterModel(DataModel): """Data model for the output of the Knowledge Graph writer. Attributes: - status (Literal["SUCCESS", "FAILURE"]): Whether or not the write operation was successful. + status (Literal["SUCCESS", "FAILURE"]): Whether the write operation was successful. 
""" status: Literal["SUCCESS", "FAILURE"] + metadata: Optional[dict[str, Any]] = None class KGWriter(Component): @@ -91,90 +106,85 @@ def __init__( self, driver: neo4j.driver, neo4j_database: Optional[str] = None, + batch_size: int = 1000, max_concurrency: int = 5, ): self.driver = driver self.neo4j_database = neo4j_database + self.batch_size = batch_size self.max_concurrency = max_concurrency def _db_setup(self) -> None: # create index on __Entity__.id + # used when creating the relationships self.driver.execute_query( - "CREATE INDEX __entity__id IF NOT EXISTS FOR (n:__Entity__) ON (n.id)" + "CREATE INDEX __entity__id IF NOT EXISTS FOR (n:__KGBuilder__) ON (n.id)" ) async def _async_db_setup(self) -> None: # create index on __Entity__.id + # used when creating the relationships await self.driver.execute_query( - "CREATE INDEX __entity__id IF NOT EXISTS FOR (n:__Entity__) ON (n.id)" + "CREATE INDEX __entity__id IF NOT EXISTS FOR (n:__KGBuilder__) ON (n.id)" ) - def _get_node_query(self, node: Neo4jNode) -> Tuple[str, Dict[str, Any]]: - # Create the initial node - parameters = { - "id": node.id, - "properties": node.properties or {}, - "embeddings": node.embedding_properties, - } - query = UPSERT_NODE_QUERY.format(label=node.label) - return query, parameters - - def _upsert_node(self, node: Neo4jNode) -> None: + @staticmethod + def _nodes_to_rows(nodes: list[Neo4jNode]) -> list[dict[str, Any]]: + rows = [] + for node in nodes: + labels = [node.label] + if node.label not in (CHUNK_NODE_LABEL, DOCUMENT_NODE_LABEL): + labels.append("__Entity__") + row = node.model_dump() + row["labels"] = labels + rows.append(row) + return rows + + def _upsert_nodes(self, nodes: list[Neo4jNode]) -> None: """Upserts a single node into the Neo4j database." Args: - node (Neo4jNode): The node to upsert into the database. + nodes (list[Neo4jNode]): The nodes batch to upsert into the database. 
""" - query, parameters = self._get_node_query(node) - self.driver.execute_query(query, parameters_=parameters) + parameters = {"rows": self._nodes_to_rows(nodes)} + self.driver.execute_query(UPSERT_NODE_QUERY, parameters_=parameters) - async def _async_upsert_node( + async def _async_upsert_nodes( self, - node: Neo4jNode, + nodes: list[Neo4jNode], sem: asyncio.Semaphore, ) -> None: """Asynchronously upserts a single node into the Neo4j database." Args: - node (Neo4jNode): The node to upsert into the database. + nodes (list[Neo4jNode]): The nodes batch to upsert into the database. """ async with sem: - query, parameters = self._get_node_query(node) - await self.driver.execute_query(query, parameters_=parameters) - - def _get_rel_query(self, rel: Neo4jRelationship) -> Tuple[str, Dict[str, Any]]: - # Create the initial relationship - parameters = { - "start_node_id": rel.start_node_id, - "end_node_id": rel.end_node_id, - "properties": rel.properties or {}, - "embeddings": rel.embedding_properties, - } - query = UPSERT_RELATIONSHIP_QUERY.format( - type=rel.type, - ) - return query, parameters + parameters = {"rows": self._nodes_to_rows(nodes)} + await self.driver.execute_query(UPSERT_NODE_QUERY, parameters_=parameters) - def _upsert_relationship(self, rel: Neo4jRelationship) -> None: + def _upsert_relationships(self, rels: list[Neo4jRelationship]) -> None: """Upserts a single relationship into the Neo4j database. Args: - rel (Neo4jRelationship): The relationship to upsert into the database. + rels (list[Neo4jRelationship]): The relationships batch to upsert into the database. 
""" - query, parameters = self._get_rel_query(rel) - self.driver.execute_query(query, parameters_=parameters) + parameters = {"rows": [rel.model_dump() for rel in rels]} + self.driver.execute_query(UPSERT_RELATIONSHIP_QUERY, parameters_=parameters) - async def _async_upsert_relationship( - self, rel: Neo4jRelationship, sem: asyncio.Semaphore + async def _async_upsert_relationships( + self, rels: list[Neo4jRelationship], sem: asyncio.Semaphore ) -> None: """Asynchronously upserts a single relationship into the Neo4j database. Args: - rel (Neo4jRelationship): The relationship to upsert into the database. + rels (list[Neo4jRelationship]): The relationships batch to upsert into the database. """ async with sem: - query, parameters = self._get_rel_query(rel) - await self.driver.execute_query(query, parameters_=parameters) + parameters = {"rows": [rel.model_dump() for rel in rels]} + await self.driver.execute_query( + UPSERT_RELATIONSHIP_QUERY, parameters_=parameters + ) @validate_call async def run(self, graph: Neo4jGraph) -> KGWriterModel: @@ -188,25 +198,32 @@ async def run(self, graph: Neo4jGraph) -> KGWriterModel: await self._async_db_setup() sem = asyncio.Semaphore(self.max_concurrency) node_tasks = [ - self._async_upsert_node(node, sem) for node in graph.nodes + self._async_upsert_nodes(batch, sem) + for batch in batched(graph.nodes, self.batch_size) ] await asyncio.gather(*node_tasks) rel_tasks = [ - self._async_upsert_relationship(rel, sem) - for rel in graph.relationships + self._async_upsert_relationships(batch, sem) + for batch in batched(graph.relationships, self.batch_size) ] await asyncio.gather(*rel_tasks) else: self._db_setup() - for node in graph.nodes: - self._upsert_node(node) + for batch in batched(graph.nodes, self.batch_size): + self._upsert_nodes(batch) - for rel in graph.relationships: - self._upsert_relationship(rel) + for batch in batched(graph.relationships, self.batch_size): + self._upsert_relationships(batch) - return 
KGWriterModel(status="SUCCESS") + return KGWriterModel( + status="SUCCESS", + metadata={ + "node_count": len(graph.nodes), + "relationship_count": len(graph.relationships), + }, + ) except neo4j.exceptions.ClientError as e: logger.exception(e) - return KGWriterModel(status="FAILURE") + return KGWriterModel(status="FAILURE", metadata={"error": str(e)}) diff --git a/src/neo4j_graphrag/experimental/components/pdf_loader.py b/src/neo4j_graphrag/experimental/components/pdf_loader.py index f17e3b02..2cd3fca1 100644 --- a/src/neo4j_graphrag/experimental/components/pdf_loader.py +++ b/src/neo4j_graphrag/experimental/components/pdf_loader.py @@ -25,7 +25,7 @@ from fsspec.implementations.local import LocalFileSystem from neo4j_graphrag.exceptions import PdfLoaderError -from neo4j_graphrag.experimental.pipeline import Component, DataModel +from neo4j_graphrag.experimental.pipeline.component import Component, DataModel class DocumentInfo(DataModel): diff --git a/src/neo4j_graphrag/experimental/components/resolver.py b/src/neo4j_graphrag/experimental/components/resolver.py new file mode 100644 index 00000000..4e07a1d6 --- /dev/null +++ b/src/neo4j_graphrag/experimental/components/resolver.py @@ -0,0 +1,142 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import abc +from typing import Any, Optional, Union + +import neo4j + +from neo4j_graphrag.experimental.components.types import ResolutionStats +from neo4j_graphrag.experimental.pipeline import Component +from neo4j_graphrag.utils import execute_query + + +class EntityResolver(Component, abc.ABC): + """Entity resolution base class + + Args: + driver (neo4j.driver): The Neo4j driver to connect to the database. + filter_query (Optional[str]): Cypher query to select the entities to resolve. By default, all nodes with __Entity__ label are used + """ + + def __init__( + self, + driver: Union[neo4j.Driver, neo4j.AsyncDriver], + filter_query: Optional[str] = None, + ) -> None: + self.driver = driver + self.filter_query = filter_query + + @abc.abstractmethod + async def run(self, *args: Any, **kwargs: Any) -> ResolutionStats: + pass + + +class SinglePropertyExactMatchResolver(EntityResolver): + """Resolve entities with same label and exact same property (default is "name"). + + Args: + driver (neo4j.driver): The Neo4j driver to connect to the database. + filter_query (Optional[str]): To reduce the resolution scope, add a Cypher WHERE clause. + resolve_property (str): The property that will be compared (default: "name"). If values match exactly, entities are merged. + neo4j_database (Optional[str]): The name of the Neo4j database to write to. Defaults to 'neo4j' if not provided. + + Example: + + .. 
code-block:: python + + from neo4j import AsyncGraphDatabase + from neo4j_graphrag.experimental.components.resolver import SinglePropertyExactMatchResolver + + URI = "neo4j://localhost:7687" + AUTH = ("neo4j", "password") + DATABASE = "neo4j" + + driver = AsyncGraphDatabase.driver(URI, auth=AUTH, database=DATABASE) + resolver = SinglePropertyExactMatchResolver(driver=driver, neo4j_database=DATABASE) + await resolver.run() # no expected parameters + + """ + + def __init__( + self, + driver: Union[neo4j.Driver, neo4j.AsyncDriver], + filter_query: Optional[str] = None, + resolve_property: str = "name", + neo4j_database: Optional[str] = None, + ) -> None: + super().__init__(driver, filter_query) + self.resolve_property = resolve_property + self.database = neo4j_database + + async def run(self) -> ResolutionStats: + """Resolve entities based on the following rule: + For each entity label, entities with the same 'resolve_property' value + (exact match) are grouped into a single node: + + - Properties: the property from the first node will remain if already set, otherwise the first property in list will be written. + - Relationships: merge relationships with same type and target node. + + See apoc.refactor.mergeNodes documentation for more details. 
+ """ + match_query = "MATCH (entity:__Entity__) " + if self.filter_query: + match_query += self.filter_query + stat_query = f"{match_query} RETURN count(entity) as c" + records = await execute_query( + self.driver, + stat_query, + database_=self.database, + ) + number_of_nodes_to_resolve = records[0].get("c") + if number_of_nodes_to_resolve == 0: + return ResolutionStats( + number_of_nodes_to_resolve=0, + ) + merge_nodes_query = ( + f"{match_query} " + f"WITH entity, entity.{self.resolve_property} as prop " + # keep only entities for which the resolve_property (name) is not null + "WITH entity, prop WHERE prop IS NOT NULL " + # will check the property for each of the entity labels, + # except the reserved ones __Entity__ and __KGBuilder__ + "UNWIND labels(entity) as lab " + "WITH lab, prop, entity WHERE NOT lab IN ['__Entity__', '__KGBuilder__'] " + # aggregate based on property value and label + # collect all entities with exact same property and label + # in the 'entities' list + "WITH prop, lab, collect(entity) AS entities " + # merge all entities into a single node + # * merge relationships: if the merged entities have a relationship of same + # type to the same target node, these relationships are merged + # otherwise relationships are just attached to the newly created node + # * properties: if the two entities have the same property key with + # different values, only one of them is kept in the created node + "CALL apoc.refactor.mergeNodes(entities,{ " + " properties:'discard', " + " mergeRels:true " + "}) " + "YIELD node " + "RETURN count(node) as c " + ) + records = await execute_query( + self.driver, + merge_nodes_query, + database_=self.database, + ) + number_of_created_nodes = records[0].get("c") + return ResolutionStats( + number_of_nodes_to_resolve=number_of_nodes_to_resolve, + number_of_created_nodes=number_of_created_nodes, + ) diff --git a/src/neo4j_graphrag/experimental/components/schema.py b/src/neo4j_graphrag/experimental/components/schema.py 
index 20919d8a..c82a7d5f 100644 --- a/src/neo4j_graphrag/experimental/components/schema.py +++ b/src/neo4j_graphrag/experimental/components/schema.py @@ -19,7 +19,7 @@ from pydantic import BaseModel, ValidationError, model_validator, validate_call from neo4j_graphrag.exceptions import SchemaValidationError -from neo4j_graphrag.experimental.pipeline import Component, DataModel +from neo4j_graphrag.experimental.pipeline.component import Component, DataModel class SchemaProperty(BaseModel): diff --git a/src/neo4j_graphrag/experimental/components/types.py b/src/neo4j_graphrag/experimental/components/types.py index 1a93a321..2591c890 100644 --- a/src/neo4j_graphrag/experimental/components/types.py +++ b/src/neo4j_graphrag/experimental/components/types.py @@ -51,13 +51,13 @@ class Neo4jNode(BaseModel): Attributes: id (str): The ID of the node. label (str): The label of the node. - properties (Optional[dict[str, Any]]): A dictionary of properties attached to the node. + properties (dict[str, Any]): A dictionary of properties attached to the node. embedding_properties (Optional[dict[str, list[float]]]): A list of embedding properties attached to the node. """ id: str label: str - properties: Optional[dict[str, Any]] = None + properties: dict[str, Any] = {} embedding_properties: Optional[dict[str, list[float]]] = None @field_validator("properties", "embedding_properties") @@ -77,14 +77,14 @@ class Neo4jRelationship(BaseModel): start_node_id (str): The ID of the start node. end_node_id (str): The ID of the end node. type (str): The relationship type. - properties (Optional[dict[str, Any]]): A dictionary of properties attached to the relationship. + properties (dict[str, Any]): A dictionary of properties attached to the relationship. embedding_properties (Optional[dict[str, list[float]]]): A list of embedding properties attached to the relationship. 
""" start_node_id: str end_node_id: str type: str - properties: Optional[dict[str, Any]] = None + properties: dict[str, Any] = {} embedding_properties: Optional[dict[str, list[float]]] = None @@ -98,3 +98,8 @@ class Neo4jGraph(DataModel): nodes: list[Neo4jNode] = [] relationships: list[Neo4jRelationship] = [] + + +class ResolutionStats(DataModel): + number_of_nodes_to_resolve: int + number_of_created_nodes: Optional[int] = None diff --git a/src/neo4j_graphrag/experimental/pipeline/component.py b/src/neo4j_graphrag/experimental/pipeline/component.py index b6a5963e..84cd5bc0 100644 --- a/src/neo4j_graphrag/experimental/pipeline/component.py +++ b/src/neo4j_graphrag/experimental/pipeline/component.py @@ -73,6 +73,7 @@ class Component(abc.ABC, metaclass=ComponentMeta): # these variables are filled by the metaclass # added here for the type checker + # DO NOT CHANGE component_inputs: dict[str, dict[str, str | bool]] component_outputs: dict[str, dict[str, str | bool]] diff --git a/src/neo4j_graphrag/experimental/pipeline/kg_builder.py b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py new file mode 100644 index 00000000..2ba0c7c0 --- /dev/null +++ b/src/neo4j_graphrag/experimental/pipeline/kg_builder.py @@ -0,0 +1,244 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import Any, List, Optional, Union + +import neo4j +from pydantic import BaseModel, ConfigDict, Field + +from neo4j_graphrag.experimental.components.entity_relation_extractor import ( + LLMEntityRelationExtractor, + OnError, +) +from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter +from neo4j_graphrag.experimental.components.pdf_loader import PdfLoader +from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, +) +from neo4j_graphrag.experimental.components.schema import ( + SchemaBuilder, + SchemaEntity, + SchemaRelation, +) +from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import ( + FixedSizeSplitter, +) +from neo4j_graphrag.experimental.pipeline.exceptions import PipelineDefinitionError +from neo4j_graphrag.experimental.pipeline.pipeline import Pipeline, PipelineResult +from neo4j_graphrag.generation.prompts import ERExtractionTemplate +from neo4j_graphrag.llm.base import LLMInterface + + +class SimpleKGPipelineConfig(BaseModel): + llm: LLMInterface + driver: neo4j.Driver + from_pdf: bool + entities: list[SchemaEntity] = Field(default_factory=list) + relations: list[SchemaRelation] = Field(default_factory=list) + potential_schema: list[tuple[str, str, str]] = Field(default_factory=list) + pdf_loader: Any = None + kg_writer: Any = None + text_splitter: Any = None + on_error: OnError = OnError.RAISE + prompt_template: Union[ERExtractionTemplate, str] = ERExtractionTemplate() + perform_entity_resolution: bool = True + + model_config = ConfigDict(arbitrary_types_allowed=True) + + +class SimpleKGPipeline: + """ + A class to simplify the process of building a knowledge graph from text documents. + It abstracts away the complexity of setting up the pipeline and its components. + + Args: + llm (LLMInterface): An instance of an LLM to use for entity and relation extraction. 
+ driver (neo4j.Driver): A Neo4j driver instance for database connection. + entities (Optional[List[str]]): A list of entity labels as strings. + relations (Optional[List[str]]): A list of relation labels as strings. + potential_schema (Optional[List[tuple]]): A list of potential schema relationships. + from_pdf (bool): Determines whether to include the PdfLoader in the pipeline. + If True, expects `file_path` input in `run` methods. + If False, expects `text` input in `run` methods. + text_splitter (Optional[Any]): A text splitter component. Defaults to FixedSizeSplitter(). + pdf_loader (Optional[Any]): A PDF loader component. Defaults to PdfLoader(). + kg_writer (Optional[Any]): A knowledge graph writer component. Defaults to Neo4jWriter(). + on_error (str): Error handling strategy. Defaults to "RAISE". Possible values: "RAISE" or "IGNORE". + perform_entity_resolution (bool): Merge entities with same label and name. Default: True + """ + + def __init__( + self, + llm: LLMInterface, + driver: neo4j.Driver, + entities: Optional[List[str]] = None, + relations: Optional[List[str]] = None, + potential_schema: Optional[List[tuple[str, str, str]]] = None, + from_pdf: bool = True, + text_splitter: Optional[Any] = None, + pdf_loader: Optional[Any] = None, + kg_writer: Optional[Any] = None, + on_error: str = "RAISE", + prompt_template: Union[ERExtractionTemplate, str] = ERExtractionTemplate(), + perform_entity_resolution: bool = True, + ): + self.entities = [SchemaEntity(label=label) for label in entities or []] + self.relations = [SchemaRelation(label=label) for label in relations or []] + self.potential_schema = potential_schema if potential_schema is not None else [] + + try: + on_error_enum = OnError(on_error) + except ValueError: + raise PipelineDefinitionError( + f"Invalid value for on_error: {on_error}. Expected 'RAISE' or 'CONTINUE'." 
+ ) + + config = SimpleKGPipelineConfig( + llm=llm, + driver=driver, + entities=self.entities, + relations=self.relations, + potential_schema=self.potential_schema, + from_pdf=from_pdf, + pdf_loader=pdf_loader, + kg_writer=kg_writer, + text_splitter=text_splitter, + on_error=on_error_enum, + prompt_template=prompt_template, + perform_entity_resolution=perform_entity_resolution, + ) + + self.from_pdf = config.from_pdf + self.llm = config.llm + self.driver = config.driver + self.text_splitter = config.text_splitter or FixedSizeSplitter() + self.on_error = config.on_error + self.pdf_loader = config.pdf_loader if pdf_loader is not None else PdfLoader() + self.kg_writer = ( + config.kg_writer if kg_writer is not None else Neo4jWriter(driver) + ) + self.prompt_template = config.prompt_template + self.perform_entity_resolution = config.perform_entity_resolution + + self.pipeline = self._build_pipeline() + + def _build_pipeline(self) -> Pipeline: + pipe = Pipeline() + + pipe.add_component(self.text_splitter, "splitter") + pipe.add_component(SchemaBuilder(), "schema") + pipe.add_component( + LLMEntityRelationExtractor( + llm=self.llm, + on_error=self.on_error, + prompt_template=self.prompt_template, + ), + "extractor", + ) + pipe.add_component(self.kg_writer, "writer") + + if self.from_pdf: + pipe.add_component(self.pdf_loader, "pdf_loader") + + pipe.connect( + "pdf_loader", + "splitter", + input_config={"text": "pdf_loader.text"}, + ) + + pipe.connect( + "schema", + "extractor", + input_config={ + "schema": "schema", + "document_info": "pdf_loader.document_info", + }, + ) + else: + pipe.connect( + "schema", + "extractor", + input_config={ + "schema": "schema", + }, + ) + + pipe.connect( + "splitter", + "extractor", + input_config={"chunks": "splitter"}, + ) + + # Connect extractor to writer + pipe.connect( + "extractor", + "writer", + input_config={"graph": "extractor"}, + ) + + if self.perform_entity_resolution: + pipe.add_component( + 
SinglePropertyExactMatchResolver(self.driver), "resolver" + ) + pipe.connect("writer", "resolver", {}) + + return pipe + + async def run_async( + self, file_path: Optional[str] = None, text: Optional[str] = None + ) -> PipelineResult: + """ + Asynchronously runs the knowledge graph building process. + + Args: + file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. + text (Optional[str]): The text content to process. Required if `from_pdf` is False. + + Returns: + PipelineResult: The result of the pipeline execution. + """ + pipe_inputs = self._prepare_inputs(file_path=file_path, text=text) + return await self.pipeline.run(pipe_inputs) + + def _prepare_inputs( + self, file_path: Optional[str], text: Optional[str] + ) -> dict[str, Any]: + if self.from_pdf: + if file_path is None or text is not None: + raise PipelineDefinitionError( + "Expected 'file_path' argument when 'from_pdf' is True." + ) + else: + if text is None or file_path is not None: + raise PipelineDefinitionError( + "Expected 'text' argument when 'from_pdf' is False." 
+ ) + + pipe_inputs: dict[str, Any] = { + "schema": { + "entities": self.entities, + "relations": self.relations, + "potential_schema": self.potential_schema, + }, + } + + if self.from_pdf: + pipe_inputs["pdf_loader"] = {"filepath": file_path} + else: + pipe_inputs["splitter"] = {"text": text} + + return pipe_inputs diff --git a/src/neo4j_graphrag/experimental/pipeline/pipeline.py b/src/neo4j_graphrag/experimental/pipeline/pipeline.py index c3598121..3d004eb8 100644 --- a/src/neo4j_graphrag/experimental/pipeline/pipeline.py +++ b/src/neo4j_graphrag/experimental/pipeline/pipeline.py @@ -195,8 +195,9 @@ async def check_dependencies_complete(self, task: TaskPipelineNode) -> None: for d in dependencies: d_status = await self.get_status_for_component(d.start) if d_status != RunStatus.DONE: - logger.warning( - f"Missing dependency {d.start} for {task.name} (status: {d_status})" + logger.debug( + f"Missing dependency {d.start} for {task.name} (status: {d_status}). " + "Will try again when dependency is complete." 
) raise PipelineMissingDependencyError() @@ -605,6 +606,7 @@ def validate_parameter_mapping_for_task(self, task: TaskPipelineNode) -> bool: async def run(self, data: dict[str, Any]) -> PipelineResult: logger.debug("Starting pipeline") start_time = default_timer() + self.invalidate() self.validate_input_data(data) orchestrator = Orchestrator(self) await orchestrator.run(data) diff --git a/src/neo4j_graphrag/neo4j_queries.py b/src/neo4j_graphrag/neo4j_queries.py index 144ef178..fd819e5c 100644 --- a/src/neo4j_graphrag/neo4j_queries.py +++ b/src/neo4j_graphrag/neo4j_queries.py @@ -42,27 +42,31 @@ ) UPSERT_NODE_QUERY = ( - "MERGE (n:__Entity__ {{id: $id}}) " - "WITH n SET n:`{label}`, n += $properties " - "WITH n CALL {{ " - "WITH n WITH n WHERE $embeddings IS NOT NULL " - "UNWIND keys($embeddings) as emb " - "CALL db.create.setNodeVectorProperty(n, emb, $embeddings[emb]) " - "}} " + "UNWIND $rows AS row " + "CREATE (n:__KGBuilder__ {id: row.id}) " + "SET n += row.properties " + "WITH n, row CALL apoc.create.addLabels(n, row.labels) YIELD node " + "WITH node as n, row CALL { " + "WITH n, row WITH n, row WHERE row.embedding_properties IS NOT NULL " + "UNWIND keys(row.embedding_properties) as emb " + "CALL db.create.setNodeVectorProperty(n, emb, row.embedding_properties[emb]) " + "RETURN count(*) as nbEmb " + "} " "RETURN elementId(n)" ) UPSERT_RELATIONSHIP_QUERY = ( - "MATCH (start:__Entity__ {{ id: $start_node_id }}) " - "MATCH (end:__Entity__ {{ id: $end_node_id }}) " - "MERGE (start)-[r:`{type}`]->(end) " - "WITH r SET r += $properties " - "WITH r CALL {{ " - "WITH r WITH r WHERE $embeddings IS NOT NULL " - "UNWIND keys($embeddings) as emb " - "CALL db.create.setRelationshipVectorProperty(r, emb, $embeddings[emb]) " - "}} " - "RETURN elementId(r)" + "UNWIND $rows as row " + "MATCH (start:__KGBuilder__ {id: row.start_node_id}) " + "MATCH (end:__KGBuilder__ {id: row.end_node_id}) " + "WITH start, end, row " + "CALL apoc.merge.relationship(start, row.type, {}, 
row.properties, end, row.properties) YIELD rel " + "WITH rel, row CALL { " + "WITH rel, row WITH rel, row WHERE row.embedding_properties IS NOT NULL " + "UNWIND keys(row.embedding_properties) as emb " + "CALL db.create.setRelationshipVectorProperty(rel, emb, row.embedding_properties[emb]) " + "} " + "RETURN elementId(rel)" ) UPSERT_VECTOR_ON_NODE_QUERY = ( diff --git a/src/neo4j_graphrag/utils.py b/src/neo4j_graphrag/utils.py index e86f7588..5f1b322f 100644 --- a/src/neo4j_graphrag/utils.py +++ b/src/neo4j_graphrag/utils.py @@ -14,7 +14,10 @@ # limitations under the License. from __future__ import annotations -from typing import Optional +import inspect +from typing import Any, Optional, Union + +import neo4j def validate_search_query_input( @@ -22,3 +25,15 @@ def validate_search_query_input( ) -> None: if not (bool(query_vector) ^ bool(query_text)): raise ValueError("You must provide exactly one of query_vector or query_text.") + + +async def execute_query( + driver: Union[neo4j.Driver, neo4j.AsyncDriver], query: str, **kwargs: Any +) -> list[neo4j.Record]: + if inspect.iscoroutinefunction(driver.execute_query): + records, _, _ = await driver.execute_query(query, **kwargs) + return records # type: ignore[no-any-return] + # ignoring type because mypy complains about coroutine + # but we're sure at this stage we do not have a coroutine anymore + records, _, _ = driver.execute_query(query, **kwargs) # type: ignore[misc] + return records # type: ignore[no-any-return] diff --git a/tests/e2e/data/documents/harry_potter_part1.txt b/tests/e2e/data/documents/harry_potter_part1.txt new file mode 100644 index 00000000..0b6a6621 --- /dev/null +++ b/tests/e2e/data/documents/harry_potter_part1.txt @@ -0,0 +1,8 @@ +At Malfoy Manor, Snape tells Voldemort the date that Harry’s friends are planning to +move him from the house on Privet Drive to a new safe location, so that Voldemort +can capture Harry en route. 
+ +As Harry packs to leave Privet Drive, he reads two obituaries for Dumbledore, both +of which make him think that he didn’t know Dumbledore as well as he should have. +Downstairs, he bids good-bye to the Dursleys for the final time, as the threat of +Voldemort forces them to go into hiding themselves. diff --git a/tests/e2e/data/documents/harry_potter_part2.txt b/tests/e2e/data/documents/harry_potter_part2.txt new file mode 100644 index 00000000..8a5e8d53 --- /dev/null +++ b/tests/e2e/data/documents/harry_potter_part2.txt @@ -0,0 +1,6 @@ +The Order of the Phoenix, led by Alastor “Mad-Eye” Moody, arrives to take Harry to +his new home at the Weasleys’ house, the Burrow. Six of Harry’s friends take +Polyjuice Potion to disguise themselves as Harry and act as decoys, and they all fly +off in different directions. The Death Eaters, alerted to their departure by Snape, +attack Harry and his friends. Voldemort chases Harry down, but Harry’s wand fends +Voldemort off, seemingly without Harry’s help. diff --git a/tests/e2e/test_entity_resolver_component_e2e.py b/tests/e2e/test_entity_resolver_component_e2e.py new file mode 100644 index 00000000..49c8af4c --- /dev/null +++ b/tests/e2e/test_entity_resolver_component_e2e.py @@ -0,0 +1,181 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import neo4j +import pytest +from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, +) + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("setup_neo4j_for_kg_construction") +async def test_resolver_single_node(driver: neo4j.Driver) -> None: + driver.execute_query("MATCH (n) DETACH DELETE n") + driver.execute_query( + """ + CREATE (d:Document {id: "0", path: "path"}) + CREATE (c:Chunk {id: "0:0"}) + CREATE (c)-[:FROM_DOCUMENT]->(d) + CREATE (alice:__Entity__:Person {id: "0:0:1", name: "Alice"}) + CREATE (alice)-[:FROM_CHUNK]->(c) + """ + ) + resolver = SinglePropertyExactMatchResolver(driver) + res = await resolver.run() + # __Entity__ nodes attached to a chunk + assert res.number_of_nodes_to_resolve == 1 + # Alice + assert res.number_of_created_nodes == 1 + + records, _, _ = driver.execute_query( + "MATCH path=(:Person {name: 'Alice'}) RETURN path" + ) + assert len(records) == 1 + path = records[0].get("path") + assert path.start_node.get("name") == "Alice" + assert path.start_node.labels == frozenset({"__Entity__", "Person"}) + assert path.start_node.get("id") == "0:0:1" + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("setup_neo4j_for_kg_construction") +async def test_resolver_two_nodes_and_relationships(driver: neo4j.Driver) -> None: + driver.execute_query("MATCH (n) DETACH DELETE n") + driver.execute_query( + """ + CREATE (d:Document {id: "0", path: "path"}) + CREATE (c:Chunk {id: "0:0"}) + CREATE (c)-[:FROM_DOCUMENT]->(d) + CREATE (alice1:__Entity__:Person {id: "0:0:1", name: "Alice"}) + CREATE (alice2:__Entity__:Person {id: "0:0:2", name: "Alice"}) + CREATE (sweden:__Entity__:Country {id: "0:0:3", name: "Sweden"}) + CREATE (alice1)-[:LIVES_IN]->(sweden) + CREATE (alice1)-[:FROM_CHUNK]->(c) + CREATE (alice2)-[:FROM_CHUNK]->(c) + CREATE (sweden)-[:FROM_CHUNK]->(c) + """ + ) + resolver = SinglePropertyExactMatchResolver(driver) + res = await resolver.run() + # __Entity__ nodes attached to a chunk + assert 
res.number_of_nodes_to_resolve == 3 + # Alice and Sweden + assert res.number_of_created_nodes == 2 + + # check the domain graph + records, _, _ = driver.execute_query( + "MATCH path=(:Person {name: 'Alice'})-[:LIVES_IN]->(:Country {name: 'Sweden'}) RETURN path" + ) + assert len(records) == 1 + path = records[0].get("path") + assert path.start_node.get("name") == "Alice" + assert path.start_node.labels == frozenset({"__Entity__", "Person"}) + assert path.end_node.get("name") == "Sweden" + assert path.end_node.labels == frozenset({"__Entity__", "Country"}) + assert len(path.relationships) == 1 + assert path.relationships[0].type == "LIVES_IN" + + # check the lexical graph + records, _, _ = driver.execute_query( + "MATCH path=(:Person {name: 'Alice'})-[:FROM_CHUNK]->(:Chunk) RETURN path" + ) + assert len(records) == 1 + path = records[0].get("path") + assert path.start_node.get("name") == "Alice" + assert path.end_node.labels == frozenset({"Chunk"}) + assert path.end_node.get("id") == "0:0" + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("setup_neo4j_for_kg_construction") +async def test_resolver_same_name_different_labels(driver: neo4j.Driver) -> None: + driver.execute_query("MATCH (n) DETACH DELETE n") + driver.execute_query( + """ + CREATE (d:Document {id: "0", path: "path"}) + CREATE (c:Chunk {id: "0:0"}) + CREATE (c)-[:FROM_DOCUMENT]->(d) + CREATE (alice1:__Entity__:Person {id: "0:0:1", name: "Alice"}) + CREATE (alice2:__Entity__:Human {id: "0:0:2", name: "Alice"}) + CREATE (alice1)-[:FROM_CHUNK]->(c) + CREATE (alice2)-[:FROM_CHUNK]->(c) + CREATE (sweden)-[:FROM_CHUNK]->(c) + """ + ) + resolver = SinglePropertyExactMatchResolver(driver) + res = await resolver.run() + # __Entity__ nodes attached to a chunk + assert res.number_of_nodes_to_resolve == 2 + # Alice Person and Alice Human + assert res.number_of_created_nodes == 2 + + records, _, _ = driver.execute_query("MATCH (alice {name: 'Alice'}) RETURN alice") + assert len(records) == 2 + + 
+@pytest.mark.asyncio +@pytest.mark.usefixtures("setup_neo4j_for_kg_construction") +async def test_resolver_custom_property(driver: neo4j.Driver) -> None: + driver.execute_query("MATCH (n) DETACH DELETE n") + driver.execute_query( + """ + CREATE (d:Document {id: "0", path: "path"}) + CREATE (c:Chunk {id: "0:0"}) + CREATE (c)-[:FROM_DOCUMENT]->(d) + CREATE (alice:__Entity__:Person {id: "0:0:1", name: "Alice"}) + CREATE (alicia:__Entity__:Person {id: "0:0:1", name: "Alicia"}) + CREATE (alice)-[:FROM_CHUNK]->(c) + CREATE (alicia)-[:FROM_CHUNK]->(c) + """ + ) + resolver = SinglePropertyExactMatchResolver(driver, resolve_property="id") + res = await resolver.run() + # __Entity__ nodes attached to a chunk + assert res.number_of_nodes_to_resolve == 2 + # Alice + assert res.number_of_created_nodes == 1 + + records, _, _ = driver.execute_query("MATCH (person:Person) RETURN person") + assert len(records) == 1 + assert records[0].get("person").get("name") == "Alice" + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("setup_neo4j_for_kg_construction") +async def test_resolver_custom_filter(driver: neo4j.Driver) -> None: + driver.execute_query("MATCH (n) DETACH DELETE n") + driver.execute_query( + """ + CREATE (d:Document {id: "0", path: "path"}) + CREATE (c:Chunk {id: "0:0"}) + CREATE (c)-[:FROM_DOCUMENT]->(d) + CREATE (alice1:__Entity__:Person {id: "0:0:1", name: "Alice"}) + CREATE (alice2:__Entity__:Person {id: "0:0:2", name: "Alice"}) + CREATE (sweden:__Entity__:Country {id: "0:0:3", name: "Sweden"}) + CREATE (alice1)-[:LIVES_IN]->(sweden) + CREATE (alice1)-[:FROM_CHUNK]->(c) + CREATE (alice2)-[:FROM_CHUNK]->(c) + CREATE (sweden)-[:FROM_CHUNK]->(c) + """ + ) + resolver = SinglePropertyExactMatchResolver( + driver, filter_query="WHERE not entity:Person" + ) + res = await resolver.run() + # __Entity__ nodes attached to a chunk without a Person label + # so only Country here + assert res.number_of_nodes_to_resolve == 1 + # Sweden + assert res.number_of_created_nodes == 1 diff 
--git a/tests/e2e/test_kg_builder_pipeline_e2e.py b/tests/e2e/test_kg_builder_pipeline_e2e.py index 10fa6041..1713098b 100644 --- a/tests/e2e/test_kg_builder_pipeline_e2e.py +++ b/tests/e2e/test_kg_builder_pipeline_e2e.py @@ -28,6 +28,9 @@ OnError, ) from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter +from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, +) from neo4j_graphrag.experimental.components.schema import ( SchemaBuilder, SchemaEntity, @@ -63,7 +66,7 @@ def schema_builder() -> SchemaBuilder: @pytest.fixture def text_splitter() -> FixedSizeSplitter: - return FixedSizeSplitter(chunk_size=500, chunk_overlap=100) + return FixedSizeSplitter(chunk_size=500, chunk_overlap=10) @pytest.fixture @@ -84,6 +87,11 @@ def kg_writer(driver: neo4j.Driver) -> Neo4jWriter: return Neo4jWriter(driver) +@pytest.fixture +def entity_resolver(driver: neo4j.Driver) -> SinglePropertyExactMatchResolver: + return SinglePropertyExactMatchResolver(driver) + + @pytest.fixture def kg_builder_pipeline( text_splitter: FixedSizeSplitter, @@ -91,29 +99,16 @@ def kg_builder_pipeline( schema_builder: SchemaBuilder, entity_relation_extractor: LLMEntityRelationExtractor, kg_writer: Neo4jWriter, + entity_resolver: SinglePropertyExactMatchResolver, ) -> Pipeline: pipe = Pipeline() # define the components - pipe.add_component( - text_splitter, - "splitter", - ) - pipe.add_component( - chunk_embedder, - "embedder", - ) - pipe.add_component( - schema_builder, - "schema", - ) - pipe.add_component( - entity_relation_extractor, - "extractor", - ) - pipe.add_component( - kg_writer, - "writer", - ) + pipe.add_component(text_splitter, "splitter") + pipe.add_component(chunk_embedder, "embedder") + pipe.add_component(schema_builder, "schema") + pipe.add_component(entity_relation_extractor, "extractor") + pipe.add_component(kg_writer, "writer") + pipe.add_component(entity_resolver, "resolver") # define the execution order of component # and how 
the output of previous components must be used pipe.connect("splitter", "embedder", input_config={"text_chunks": "splitter"}) @@ -125,6 +120,7 @@ def kg_builder_pipeline( "writer", input_config={"graph": "extractor"}, ) + pipe.connect("writer", "resolver", {}) return pipe @@ -135,6 +131,24 @@ def harry_potter_text() -> str: return text +@pytest.fixture +def harry_potter_text_part1() -> str: + with open( + os.path.join(BASE_DIR, "data/documents/harry_potter_part1.txt"), "r" + ) as f: + text = f.read() + return text + + +@pytest.fixture +def harry_potter_text_part2() -> str: + with open( + os.path.join(BASE_DIR, "data/documents/harry_potter_part2.txt"), "r" + ) as f: + text = f.read() + return text + + @pytest.mark.asyncio @pytest.mark.usefixtures("setup_neo4j_for_kg_construction") async def test_pipeline_builder_happy_path( @@ -190,7 +204,6 @@ async def test_pipeline_builder_happy_path( }""" ), LLMResponse(content='{"nodes": [], "relationships": []}'), - LLMResponse(content='{"nodes": [], "relationships": []}'), ] # user input: @@ -252,25 +265,25 @@ async def test_pipeline_builder_happy_path( } res = await kg_builder_pipeline.run(pipe_inputs) # llm must have been called for each chunk - assert llm.ainvoke.call_count == 3 + assert llm.ainvoke.call_count == 2 # result must be success assert isinstance(res, PipelineResult) assert res.run_id is not None - assert res.result == {"writer": {"status": "SUCCESS"}} + assert "resolver" in res.result # check component's results chunks = await kg_builder_pipeline.store.get_result_for_component( res.run_id, "splitter" ) - assert len(chunks["chunks"]) == 3 + assert len(chunks["chunks"]) == 2 graph = await kg_builder_pipeline.store.get_result_for_component( res.run_id, "extractor" ) - # 3 entities + 3 chunks + 1 document + # 3 entities + 2 chunks + 1 document nodes = graph["nodes"] - assert len(nodes) == 7 + assert len(nodes) == 6 label_counts = dict(Counter([n["label"] for n in nodes])) assert label_counts == { - "Chunk": 3, + 
"Chunk": 2, "Document": 1, "Person": 2, "Organization": 1, @@ -279,23 +292,23 @@ async def test_pipeline_builder_happy_path( # + 3 rels between entities and their chunk # + 2 "NEXT_CHUNK" rels relationships = graph["relationships"] - assert len(relationships) == 10 + assert len(relationships) == 8 type_counts = dict(Counter([r["type"] for r in relationships])) assert type_counts == { "FROM_CHUNK": 3, - "FROM_DOCUMENT": 3, + "FROM_DOCUMENT": 2, "KNOWS": 1, "LED_BY": 1, - "NEXT_CHUNK": 2, + "NEXT_CHUNK": 1, } # then check content of neo4j db created_nodes = driver.execute_query("MATCH (n) RETURN n") - assert len(created_nodes.records) == 7 + assert len(created_nodes.records) == 6 created_rels = driver.execute_query("MATCH ()-[r]->() RETURN r") - assert len(created_rels.records) == 10 + assert len(created_rels.records) == 8 created_chunks = driver.execute_query("MATCH (n:Chunk) RETURN n").records - assert len(created_chunks) == 3 + assert len(created_chunks) == 2 for c in created_chunks: node = c.get("n") assert node.get("embedding") == [1, 2, 3] @@ -358,7 +371,6 @@ async def test_pipeline_builder_failing_chunk_raise( }""" ), LLMResponse(content="invalid json"), - LLMResponse(content='{"nodes": [], "relationships": []}'), ] # user input: @@ -437,7 +449,6 @@ async def test_pipeline_builder_failing_chunk_do_not_raise( ] }""" ), - LLMResponse(content='{"nodes": [], "relationships": []}'), ] # user input: @@ -458,37 +469,157 @@ async def test_pipeline_builder_failing_chunk_do_not_raise( ).component.on_error = OnError.IGNORE # type: ignore[attr-defined, unused-ignore] res = await kg_builder_pipeline.run(pipe_inputs) # llm must have been called for each chunk - assert llm.ainvoke.call_count == 3 + assert llm.ainvoke.call_count == 2 # result must be success assert isinstance(res, PipelineResult) assert res.run_id is not None - assert res.result == {"writer": {"status": "SUCCESS"}} + assert res.result == { + "resolver": {"number_of_created_nodes": 3, 
"number_of_nodes_to_resolve": 3} + } # check component's results chunks = await kg_builder_pipeline.store.get_result_for_component( res.run_id, "splitter" ) - assert len(chunks["chunks"]) == 3 + assert len(chunks["chunks"]) == 2 graph = await kg_builder_pipeline.store.get_result_for_component( res.run_id, "extractor" ) - # 3 entities + 3 chunks + # 3 entities + 2 chunks nodes = graph["nodes"] - assert len(nodes) == 6 + assert len(nodes) == 5 label_counts = dict(Counter([n["label"] for n in nodes])) assert label_counts == { - "Chunk": 3, + "Chunk": 2, "Person": 2, "Organization": 1, } # 2 relationships between entities # + 3 rels between entities and their chunk - # + 2 "NEXT_CHUNK" rels + # + 1 "NEXT_CHUNK" rels relationships = graph["relationships"] - assert len(relationships) == 7 + assert len(relationships) == 6 type_counts = dict(Counter([r["type"] for r in relationships])) - assert type_counts == {"FROM_CHUNK": 3, "KNOWS": 1, "LED_BY": 1, "NEXT_CHUNK": 2} + assert type_counts == {"FROM_CHUNK": 3, "KNOWS": 1, "LED_BY": 1, "NEXT_CHUNK": 1} # then check content of neo4j db created_nodes = driver.execute_query("MATCH (n) RETURN n") - assert len(created_nodes.records) == 6 + assert len(created_nodes.records) == 5 created_rels = driver.execute_query("MATCH ()-[r]->() RETURN r") - assert len(created_rels.records) == 7 + assert len(created_rels.records) == 6 + + +@pytest.mark.asyncio +@pytest.mark.usefixtures("setup_neo4j_for_kg_construction") +async def test_pipeline_builder_two_documents( + harry_potter_text_part1: str, + harry_potter_text_part2: str, + embedder: MagicMock, + llm: MagicMock, + driver: neo4j.Driver, + kg_builder_pipeline: Pipeline, +) -> None: + """Run same pipeline on two documents. Check entity resolution. 
+ + First document: + 2 chunks, entities Harry and The Order of the Phoenix, 1 relationship + Second document: + 1 chunk, entities Harry and Alastor Mad-Eye Moody, 1 relationship + + Should create: + 1 document node + 3 chunk nodes + 3 entities (1 Harry + the other two) + ==> 7 nodes + 3 relationships for lexical graph + 3 relationships for the entity graph + ==> 6 relationships + """ + driver.execute_query("MATCH (n) DETACH DELETE n") + embedder.embed_query.return_value = [1, 2, 3] + llm.ainvoke.side_effect = [ + LLMResponse( + content="""{ + "nodes": [ + { + "id": "0", + "label": "Person", + "properties": { + "name": "Harry" + } + }, + { + "id": "1", + "label": "Organization", + "properties": { + "name": "The Order of the Phoenix" + } + } + ], + "relationships": [ + { + "type": "MEMBER_OF", + "start_node_id": "0", + "end_node_id": "1" + } + ] + }""" + ), + LLMResponse( + content="""{ + "nodes": [ + { + "id": "10", + "label": "Person", + "properties": { + "name": "Harry" + } + }, + { + "id": "11", + "label": "Person", + "properties": { + "name": "Alastor Mad-Eye Moody" + } + } + ], + "relationships": [ + { + "type": "KNOWS", + "start_node_id": "10", + "end_node_id": "11" + } + ] + }""" + ), + LLMResponse(content='{"nodes": [], "relationships": []}'), + ] + + # user input: + # the initial text + # and the list of entities and relations we are looking for + pipe_inputs_1 = { + "splitter": {"text": harry_potter_text_part1}, + # note: schema not used in this test because + # we are mocking the LLM + "schema": { + "entities": [], + "relations": [], + "potential_schema": [], + }, + } + pipe_inputs_2 = { + "splitter": {"text": harry_potter_text_part2}, + # note: schema not used in this test because + # we are mocking the LLM + "schema": { + "entities": [], + "relations": [], + "potential_schema": [], + }, + } + await kg_builder_pipeline.run(pipe_inputs_1) + await kg_builder_pipeline.run(pipe_inputs_2) + created_nodes = driver.execute_query("MATCH (n:__Entity__) RETURN 
n") + assert len(created_nodes.records) == 3 + created_rels = driver.execute_query( + "MATCH (:__Entity__)-[r]->(:__Entity__) RETURN r" + ) + assert len(created_rels.records) == 2 diff --git a/tests/e2e/test_kg_writer_component_e2e.py b/tests/e2e/test_kg_writer_component_e2e.py index c7f495cf..6388dccc 100644 --- a/tests/e2e/test_kg_writer_component_e2e.py +++ b/tests/e2e/test_kg_writer_component_e2e.py @@ -28,26 +28,39 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: start_node = Neo4jNode( id="1", - label="Document", + label="MyLabel", properties={"chunk": 1}, embedding_properties={"vectorProperty": [1.0, 2.0, 3.0]}, ) end_node = Neo4jNode( id="2", - label="Document", - properties={"chunk": 2}, - embedding_properties={"vectorProperty": [1.0, 2.0, 3.0]}, + label="MyLabel", + properties={}, + embedding_properties=None, ) relationship = Neo4jRelationship( - start_node_id="1", end_node_id="2", type="NEXT_CHUNK" + start_node_id="1", end_node_id="2", type="MY_RELATIONSHIP" + ) + node_with_two_embeddings = Neo4jNode( + id="3", + label="MyLabel", + properties={"chunk": 1}, + embedding_properties={ + "vectorProperty": [1.0, 2.0, 3.0], + "otherVectorProperty": [10.0, 20.0, 30.0], + }, + ) + graph = Neo4jGraph( + nodes=[start_node, end_node, node_with_two_embeddings], + relationships=[relationship], ) - graph = Neo4jGraph(nodes=[start_node, end_node], relationships=[relationship]) neo4j_writer = Neo4jWriter(driver=driver) - await neo4j_writer.run(graph=graph) + res = await neo4j_writer.run(graph=graph) + assert res.status == "SUCCESS" query = """ - MATCH (a:Document {id: '1'})-[r:NEXT_CHUNK]->(b:Document {id: '2'}) + MATCH (a:MyLabel {id: '1'})-[r:MY_RELATIONSHIP]->(b:MyLabel {id: '2'}) RETURN a, r, b """ record = driver.execute_query(query).records[0] @@ -56,11 +69,10 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: node_a = record["a"] assert start_node.label in list(node_a.labels) assert start_node.id == str(node_a.get("id")) - if 
start_node.properties: - for key, val in start_node.properties.items(): - assert key in node_a.keys() - assert val == node_a.get(key) - if start_node.embedding_properties: + for key, val in start_node.properties.items(): + assert key in node_a.keys() + assert val == node_a.get(key) + if start_node.embedding_properties: # for mypy for key, val in start_node.embedding_properties.items(): assert key in node_a.keys() assert node_a.get(key) == [1.0, 2.0, 3.0] @@ -68,17 +80,23 @@ async def test_kg_writer(driver: neo4j.Driver) -> None: node_b = record["b"] assert end_node.label in list(node_b.labels) assert end_node.id == str(node_b.get("id")) - if end_node.properties: - for key, val in end_node.properties.items(): - assert key in node_b.keys() - assert val == node_b.get(key) - if end_node.embedding_properties: - for key, val in end_node.embedding_properties.items(): - assert key in node_b.keys() - assert node_b.get(key) == [1.0, 2.0, 3.0] + for key, val in end_node.properties.items(): + assert key in node_b.keys() + assert val == node_b.get(key) rel = record["r"] assert rel.type == relationship.type - assert relationship.start_node_id and relationship.end_node_id in [ - str(node.get("id")) for node in rel.nodes - ] + assert relationship.start_node_id == rel.start_node.get("id") + assert relationship.end_node_id == rel.end_node.get("id") + + query = """ + MATCH (c:MyLabel {id: '3'}) + RETURN c + """ + records = driver.execute_query(query).records + assert len(records) == 1 + node_c = records[0]["c"] + if node_with_two_embeddings.embedding_properties: # for mypy + for key, val in node_with_two_embeddings.embedding_properties.items(): + assert key in node_c.keys() + assert val == node_c.get(key) diff --git a/tests/unit/experimental/components/test_kg_writer.py b/tests/unit/experimental/components/test_kg_writer.py index ee1f49ff..94271d5a 100644 --- a/tests/unit/experimental/components/test_kg_writer.py +++ b/tests/unit/experimental/components/test_kg_writer.py @@ -18,7 
+18,7 @@ from unittest.mock import MagicMock, Mock import pytest -from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter +from neo4j_graphrag.experimental.components.kg_writer import Neo4jWriter, batched from neo4j_graphrag.experimental.components.types import ( Neo4jGraph, Neo4jNode, @@ -27,17 +27,41 @@ from neo4j_graphrag.neo4j_queries import UPSERT_NODE_QUERY, UPSERT_RELATIONSHIP_QUERY +def test_batched() -> None: + assert list(batched([1, 2, 3, 4], batch_size=2)) == [ + [1, 2], + [3, 4], + ] + assert list(batched([1, 2, 3], batch_size=2)) == [ + [1, 2], + [3], + ] + assert list(batched([1, 2, 3], batch_size=4)) == [ + [1, 2, 3], + ] + + @mock.patch( "neo4j_graphrag.experimental.components.kg_writer.Neo4jWriter._db_setup", return_value=None, ) -def test_upsert_node(driver: MagicMock) -> None: +def test_upsert_nodes(driver: MagicMock) -> None: neo4j_writer = Neo4jWriter(driver=driver) node = Neo4jNode(id="1", label="Label", properties={"key": "value"}) - neo4j_writer._upsert_node(node=node) + neo4j_writer._upsert_nodes(nodes=[node]) driver.execute_query.assert_called_once_with( - UPSERT_NODE_QUERY.format(label="Label"), - parameters_={"id": "1", "properties": {"key": "value"}, "embeddings": None}, + UPSERT_NODE_QUERY, + parameters_={ + "rows": [ + { + "label": "Label", + "labels": ["Label", "__Entity__"], + "id": "1", + "properties": {"key": "value"}, + "embedding_properties": None, + } + ] + }, ) @@ -45,7 +69,7 @@ def test_upsert_node(driver: MagicMock) -> None: "neo4j_graphrag.experimental.components.kg_writer.Neo4jWriter._db_setup", return_value=None, ) -def test_upsert_node_with_embedding( +def test_upsert_nodes_with_embedding( driver: MagicMock, ) -> None: neo4j_writer = Neo4jWriter(driver=driver) @@ -56,13 +80,19 @@ def test_upsert_node_with_embedding( embedding_properties={"embeddingProp": [1.0, 2.0, 3.0]}, ) driver.execute_query.return_value.records = [{"elementId(n)": 1}] - neo4j_writer._upsert_node(node=node) + 
neo4j_writer._upsert_nodes(nodes=[node]) driver.execute_query.assert_any_call( - UPSERT_NODE_QUERY.format(label="Label"), + UPSERT_NODE_QUERY, parameters_={ - "id": "1", - "properties": {"key": "value"}, - "embeddings": {"embeddingProp": [1.0, 2.0, 3.0]}, + "rows": [ + { + "label": "Label", + "labels": ["Label", "__Entity__"], + "id": "1", + "properties": {"key": "value"}, + "embedding_properties": {"embeddingProp": [1.0, 2.0, 3.0]}, + } + ] }, ) @@ -79,18 +109,20 @@ def test_upsert_relationship(driver: MagicMock) -> None: type="RELATIONSHIP", properties={"key": "value"}, ) - neo4j_writer._upsert_relationship(rel=rel) + neo4j_writer._upsert_relationships(rels=[rel]) parameters = { - "start_node_id": "1", - "end_node_id": "2", - "properties": {"key": "value"}, - "embeddings": None, + "rows": [ + { + "type": "RELATIONSHIP", + "start_node_id": "1", + "end_node_id": "2", + "properties": {"key": "value"}, + "embedding_properties": None, + } + ] } driver.execute_query.assert_called_once_with( - UPSERT_RELATIONSHIP_QUERY.format( - type="RELATIONSHIP", - properties="{key: $key}", - ), + UPSERT_RELATIONSHIP_QUERY, parameters_=parameters, ) @@ -109,18 +141,20 @@ def test_upsert_relationship_with_embedding(_: Mock, driver: MagicMock) -> None: embedding_properties={"embeddingProp": [1.0, 2.0, 3.0]}, ) driver.execute_query.return_value.records = [{"elementId(r)": "rel_elem_id"}] - neo4j_writer._upsert_relationship(rel=rel) + neo4j_writer._upsert_relationships(rels=[rel]) parameters = { - "start_node_id": "1", - "end_node_id": "2", - "properties": {"key": "value"}, - "embeddings": {"embeddingProp": [1.0, 2.0, 3.0]}, + "rows": [ + { + "type": "RELATIONSHIP", + "start_node_id": "1", + "end_node_id": "2", + "properties": {"key": "value"}, + "embedding_properties": {"embeddingProp": [1.0, 2.0, 3.0]}, + } + ] } driver.execute_query.assert_any_call( - UPSERT_RELATIONSHIP_QUERY.format( - type="RELATIONSHIP", - properties="{key: $key}", - ), + UPSERT_RELATIONSHIP_QUERY, 
parameters_=parameters, ) @@ -137,17 +171,32 @@ async def test_run(_: Mock, driver: MagicMock) -> None: graph = Neo4jGraph(nodes=[node], relationships=[rel]) await neo4j_writer.run(graph=graph) driver.execute_query.assert_any_call( - UPSERT_NODE_QUERY.format(label="Label"), - parameters_={"id": "1", "properties": {}, "embeddings": None}, + UPSERT_NODE_QUERY, + parameters_={ + "rows": [ + { + "label": "Label", + "labels": ["Label", "__Entity__"], + "id": "1", + "properties": {}, + "embedding_properties": None, + } + ] + }, ) parameters_ = { - "start_node_id": "1", - "end_node_id": "2", - "properties": {}, - "embeddings": None, + "rows": [ + { + "type": "RELATIONSHIP", + "start_node_id": "1", + "end_node_id": "2", + "properties": {}, + "embedding_properties": None, + } + ] } driver.execute_query.assert_any_call( - UPSERT_RELATIONSHIP_QUERY.format(type="RELATIONSHIP", properties="{}"), + UPSERT_RELATIONSHIP_QUERY, parameters_=parameters_, ) @@ -164,16 +213,31 @@ async def test_run_async_driver(_: Mock, async_driver: MagicMock) -> None: graph = Neo4jGraph(nodes=[node], relationships=[rel]) await neo4j_writer.run(graph=graph) async_driver.execute_query.assert_any_call( - UPSERT_NODE_QUERY.format(label="Label"), - parameters_={"id": "1", "properties": {}, "embeddings": None}, + UPSERT_NODE_QUERY, + parameters_={ + "rows": [ + { + "label": "Label", + "labels": ["Label", "__Entity__"], + "id": "1", + "properties": {}, + "embedding_properties": None, + } + ] + }, ) parameters_ = { - "start_node_id": "1", - "end_node_id": "2", - "properties": {}, - "embeddings": None, + "rows": [ + { + "type": "RELATIONSHIP", + "start_node_id": "1", + "end_node_id": "2", + "properties": {}, + "embedding_properties": None, + } + ] } async_driver.execute_query.assert_any_call( - UPSERT_RELATIONSHIP_QUERY.format(type="RELATIONSHIP", properties="{}"), + UPSERT_RELATIONSHIP_QUERY, parameters_=parameters_, ) diff --git a/tests/unit/experimental/components/test_resolver.py 
b/tests/unit/experimental/components/test_resolver.py new file mode 100644 index 00000000..558b23f5 --- /dev/null +++ b/tests/unit/experimental/components/test_resolver.py @@ -0,0 +1,59 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest.mock import MagicMock, call + +import neo4j +import pytest +from neo4j_graphrag.experimental.components.resolver import ( + SinglePropertyExactMatchResolver, +) +from neo4j_graphrag.experimental.components.types import ResolutionStats + + +@pytest.mark.asyncio +async def test_simple_resolver(driver: MagicMock) -> None: + driver.execute_query.side_effect = [ + ([neo4j.Record({"c": 2})], None, None), + ([neo4j.Record({"c": 1})], None, None), + ] + resolver = SinglePropertyExactMatchResolver(driver=driver) + res = await resolver.run() + assert isinstance(res, ResolutionStats) + assert res.number_of_nodes_to_resolve == 2 + assert res.number_of_created_nodes == 1 + assert driver.execute_query.call_count == 2 + driver.execute_query.assert_has_calls( + [call("MATCH (entity:__Entity__) RETURN count(entity) as c", database_=None)] + ) + + +@pytest.mark.asyncio +async def test_simple_resolver_custom_filter(driver: MagicMock) -> None: + driver.execute_query.side_effect = [ + ([neo4j.Record({"c": 2})], None, None), + ([neo4j.Record({"c": 1})], None, None), + ] + resolver = SinglePropertyExactMatchResolver( + driver=driver, filter_query="WHERE not entity:Resolved" 
+ ) + await resolver.run() + driver.execute_query.assert_has_calls( + [ + call( + "MATCH (entity:__Entity__) WHERE not entity:Resolved RETURN count(entity) as c", + database_=None, + ) + ] + ) diff --git a/tests/unit/experimental/pipeline/test_kg_builder.py b/tests/unit/experimental/pipeline/test_kg_builder.py new file mode 100644 index 00000000..f509be6b --- /dev/null +++ b/tests/unit/experimental/pipeline/test_kg_builder.py @@ -0,0 +1,256 @@ +# Copyright (c) "Neo4j" +# Neo4j Sweden AB [https://neo4j.com] +# # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# # +# https://www.apache.org/licenses/LICENSE-2.0 +# # +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from unittest.mock import MagicMock, patch + +import neo4j +import pytest +from neo4j_graphrag.experimental.components.entity_relation_extractor import OnError +from neo4j_graphrag.experimental.components.schema import SchemaEntity, SchemaRelation +from neo4j_graphrag.experimental.pipeline.exceptions import PipelineDefinitionError +from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline +from neo4j_graphrag.experimental.pipeline.pipeline import PipelineResult +from neo4j_graphrag.llm.base import LLMInterface + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_init_with_text() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + from_pdf=False, + ) + + assert kg_builder.llm == llm + assert kg_builder.driver == driver + assert kg_builder.from_pdf is False + assert kg_builder.entities == [] + assert kg_builder.relations == [] + assert kg_builder.potential_schema == [] + assert "pdf_loader" not in kg_builder.pipeline + + text_input = "May thy knife chip and shatter." 
+ + with patch.object( + kg_builder.pipeline, + "run", + return_value=PipelineResult(run_id="test_run", result=None), + ) as mock_run: + await kg_builder.run_async(text=text_input) + mock_run.assert_called_once() + pipe_inputs = mock_run.call_args[0][0] + assert pipe_inputs["splitter"]["text"] == text_input + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_init_with_file_path() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + from_pdf=True, + ) + + assert kg_builder.llm == llm + assert kg_builder.driver == driver + assert kg_builder.from_pdf is True + assert kg_builder.entities == [] + assert kg_builder.relations == [] + assert kg_builder.potential_schema == [] + assert "pdf_loader" in kg_builder.pipeline + + file_path = "path/to/test.pdf" + + with patch.object( + kg_builder.pipeline, + "run", + return_value=PipelineResult(run_id="test_run", result=None), + ) as mock_run: + await kg_builder.run_async(file_path=file_path) + mock_run.assert_called_once() + pipe_inputs = mock_run.call_args[0][0] + assert pipe_inputs["pdf_loader"]["filepath"] == file_path + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_run_with_both_inputs() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + from_pdf=True, + ) + + text_input = "May thy knife chip and shatter." + file_path = "path/to/test.pdf" + + with pytest.raises(PipelineDefinitionError) as exc_info: + await kg_builder.run_async(file_path=file_path, text=text_input) + + assert "Expected 'file_path' argument when 'from_pdf' is True." in str( + exc_info.value + ) or "Expected 'text' argument when 'from_pdf' is False." 
in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_run_with_no_inputs() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + from_pdf=True, # or False + ) + + with pytest.raises(PipelineDefinitionError) as exc_info: + await kg_builder.run_async() + + assert "Expected 'file_path' argument when 'from_pdf' is True." in str( + exc_info.value + ) or "Expected 'text' argument when 'from_pdf' is False." in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_document_info_with_file() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + from_pdf=True, + ) + + file_path = "path/to/test.pdf" + + with patch.object( + kg_builder.pipeline, + "run", + return_value=PipelineResult(run_id="test_run", result=None), + ) as mock_run: + await kg_builder.run_async(file_path=file_path) + + pipe_inputs = mock_run.call_args[0][0] + assert "pdf_loader" in pipe_inputs + assert pipe_inputs["pdf_loader"] == {"filepath": file_path} + assert "extractor" not in pipe_inputs + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_document_info_with_text() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + from_pdf=False, + ) + + text_input = "May thy knife chip and shatter." 
+ + with patch.object( + kg_builder.pipeline, + "run", + return_value=PipelineResult(run_id="test_run", result=None), + ) as mock_run: + await kg_builder.run_async(text=text_input) + + pipe_inputs = mock_run.call_args[0][0] + assert "splitter" in pipe_inputs + assert pipe_inputs["splitter"] == {"text": text_input} + + +@pytest.mark.asyncio +async def test_knowledge_graph_builder_with_entities_and_file() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + entities = ["Document", "Section"] + relations = ["CONTAINS"] + potential_schema = [("Document", "CONTAINS", "Section")] + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + entities=entities, + relations=relations, + potential_schema=potential_schema, + from_pdf=True, + ) + + internal_entities = [SchemaEntity(label=label) for label in entities] + internal_relations = [SchemaRelation(label=label) for label in relations] + assert kg_builder.entities == internal_entities + assert kg_builder.relations == internal_relations + assert kg_builder.potential_schema == potential_schema + + file_path = "path/to/test.pdf" + + with patch.object( + kg_builder.pipeline, + "run", + return_value=PipelineResult(run_id="test_run", result=None), + ) as mock_run: + await kg_builder.run_async(file_path=file_path) + pipe_inputs = mock_run.call_args[0][0] + assert pipe_inputs["schema"]["entities"] == internal_entities + assert pipe_inputs["schema"]["relations"] == internal_relations + assert pipe_inputs["schema"]["potential_schema"] == potential_schema + + +def test_simple_kg_pipeline_on_error_conversion() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, + driver=driver, + on_error="RAISE", + ) + + assert kg_builder.on_error == OnError.RAISE + + +def test_simple_kg_pipeline_on_error_invalid_value() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + with 
pytest.raises(PipelineDefinitionError) as exc_info: + SimpleKGPipeline( + llm=llm, + driver=driver, + on_error="IGNORE", + ) + + assert "Expected 'RAISE' or 'CONTINUE'" in str(exc_info.value) + + +def test_simple_kg_pipeline_no_entity_resolution() -> None: + llm = MagicMock(spec=LLMInterface) + driver = MagicMock(spec=neo4j.Driver) + + kg_builder = SimpleKGPipeline( + llm=llm, driver=driver, on_error="CONTINUE", perform_entity_resolution=False + ) + + assert "resolver" not in kg_builder.pipeline