From 62076c30dd00278573538991293059b130ec46ab Mon Sep 17 00:00:00 2001 From: abhijeet Date: Sun, 11 Feb 2024 12:38:44 -0500 Subject: [PATCH 1/5] checking text property first in neo4j to avoid duplicate nodes --- .../langchain_community/vectorstores/neo4j_vector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py index bb8f1b9b30136..125a4d5f59b8b 100644 --- a/libs/community/langchain_community/vectorstores/neo4j_vector.py +++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py @@ -501,12 +501,12 @@ def add_embeddings( import_query = ( "UNWIND $data AS row " "CALL { WITH row " - f"MERGE (c:`{self.node_label}` {{id: row.id}}) " + f"MERGE (c:`{self.node_label}` {{{self.text_node_property}: row.text}}) " "WITH c, row " f"CALL db.create.setVectorProperty(c, " f"'{self.embedding_node_property}', row.embedding) " "YIELD node " - f"SET c.`{self.text_node_property}` = row.text " + "SET c.id = row.id " "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS" ) From c12017d9c582b2e208aa7195190873b37a046c86 Mon Sep 17 00:00:00 2001 From: abhijeet Date: Mon, 12 Feb 2024 17:35:02 -0500 Subject: [PATCH 2/5] adding hash function as id to not have duplicates. adding relationships with parent or child nodes --- .../vectorstores/neo4j_vector.py | 88 +++++++++++-------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py index 125a4d5f59b8b..e80e6ad4c8245 100644 --- a/libs/community/langchain_community/vectorstores/neo4j_vector.py +++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py @@ -14,6 +14,7 @@ Tuple, Type, ) +import hashlib from langchain_core.documents import Document from langchain_core.embeddings import Embeddings @@ -481,47 +482,64 @@ def add_embeddings( texts: Iterable[str], embeddings: List[List[float]], metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, + parent_ids: Optional[List[str]] = None, # New argument for parent IDs **kwargs: Any, - ) -> List[str]: - """Add embeddings to the vectorstore. + ) -> List[str]: + """Add embeddings to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + embeddings: List of list of embedding vectors. + metadatas: List of metadatas associated with the texts. + parent_ids: Optionally, a list of parent identifiers for the texts. + kwargs: Additional vectorstore-specific parameters. + """ + + # Get child ids from the kwargs + child_ids = kwargs.get("child_ids", []) + parent_ids = kwargs.get("parent_ids", []) + relationship = kwargs.get("relationship", "HAS_CHILD") + + # Generate deterministic IDs based on hash of texts + ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() for text in texts] + + # Initialize metadata for each text if not provided + if not metadatas: + metadatas = [{} for _ in texts] + + # Define the import query with dynamic relationship type + import_query = """ + UNWIND $data AS row + MERGE (c:`{node_label}` {{id: row.id}}) + ON CREATE SET c.{text_property} = row.text, c.{embedding_property} = row.embedding, c += row.metadata + WITH c, row + FOREACH (parentId IN (CASE WHEN $parent_ids IS NOT NULL THEN $parent_ids ELSE [] END) | + MERGE (p:`{node_label}` {{id: parentId}}) + MERGE (p)-[:HAS_CHILD]->(c) + ) + WITH c + FOREACH (childId IN (CASE WHEN $child_ids IS NOT NULL THEN $child_ids ELSE [] END) | + MERGE (ch:`{node_label}` {{id: childId}}) + MERGE (c)-[:{relationship}]->(ch) + ) + """.format(node_label=self.node_label, text_property=self.text_node_property, embedding_property=self.embedding_node_property, relationship=relationship) - Args: - texts: Iterable of strings to add to the vectorstore. - embeddings: List of list of embedding vectors. - metadatas: List of metadatas associated with the texts. - kwargs: vectorstore specific parameters - """ - if ids is None: - ids = [str(uuid.uuid1()) for _ in texts] - if not metadatas: - metadatas = [{} for _ in texts] - - import_query = ( - "UNWIND $data AS row " - "CALL { WITH row " - f"MERGE (c:`{self.node_label}` {{{self.text_node_property}: row.text}}) " - "WITH c, row " - f"CALL db.create.setVectorProperty(c, " - f"'{self.embedding_node_property}', row.embedding) " - "YIELD node " - "SET c.id = row.id " - "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS" - ) + parameters = { + "data": [ + {"text": text, "metadata": metadata, "embedding": embedding, "id": id} + for text, metadata, embedding, id in zip( + texts, metadatas, embeddings, ids + ) + ], + "parent_ids": parent_ids, + "child_ids": child_ids, + } - parameters = { - "data": [ - {"text": text, "metadata": metadata, "embedding": embedding, "id": id} - for text, metadata, embedding, id in zip( - texts, metadatas, embeddings, ids - ) - ] - } + self.query(import_query, params=parameters) - self.query(import_query, params=parameters) + return ids - return ids def add_texts( self, From 63ce9dfe223942e8c50279272ff9aaabe7ac2189 Mon Sep 17 00:00:00 2001 From: abhijeet Date: Mon, 12 Feb 2024 17:38:48 -0500 Subject: [PATCH 3/5] updated --- .../vectorstores/neo4j_vector.py | 107 ++++++++++-------- 1 file changed, 58 insertions(+), 49 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py index e80e6ad4c8245..d916129fdf5f4 100644 --- a/libs/community/langchain_community/vectorstores/neo4j_vector.py +++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py @@ -71,7 +71,8 @@ def check_if_not_null(props: List[str], values: List[Any]) -> None: """Check if the values are not None or empty string""" for prop, value in zip(props, values): if not value: - raise ValueError(f"Parameter `{prop}` must not be None or empty string") + raise ValueError( + f"Parameter `{prop}` must not be None or empty string") def sort_by_index_name( @@ -193,7 +194,8 @@ def __init__( password = get_from_env("password", "NEO4J_PASSWORD", password) database = get_from_env("database", "NEO4J_DATABASE", database) - self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password)) + self._driver = neo4j.GraphDatabase.driver( + url, auth=(username, password)) self._database = database self.schema = "" # Verify connection @@ -288,7 +290,8 @@ def verify_version(self) -> None: """ version = self.query("CALL dbms.components()")[0]["versions"][0] if "aura" in version: - version_tuple = tuple(map(int, version.split("-")[0].split("."))) + (0,) + version_tuple = tuple( + map(int, version.split("-")[0].split("."))) + (0,) else: version_tuple = tuple(map(int, version.split("."))) @@ -326,7 +329,8 @@ def retrieve_existing_index(self) -> Optional[int]: }, ) # sort by index_name - index_information = sort_by_index_name(index_information, self.index_name) + index_information = sort_by_index_name( + index_information, self.index_name) try: self.index_name = index_information[0]["name"] self.node_label = index_information[0]["labelsOrTypes"][0] @@ -365,7 +369,8 @@ def retrieve_existing_fts_index( }, ) # sort by index_name - index_information = sort_by_index_name(index_information, self.index_name) + index_information = sort_by_index_name( + index_information, self.index_name) try: self.keyword_index_name = index_information[0]["name"] self.text_node_property = index_information[0]["properties"][0] @@ -482,33 +487,34 @@ def add_embeddings( texts: Iterable[str], embeddings: List[List[float]], metadatas: Optional[List[dict]] = None, - parent_ids: Optional[List[str]] = None, # New argument for parent IDs + ids: Optional[List[str]] = None, **kwargs: Any, - ) -> List[str]: - """Add embeddings to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - embeddings: List of list of embedding vectors. - metadatas: List of metadatas associated with the texts. - parent_ids: Optionally, a list of parent identifiers for the texts. - kwargs: Additional vectorstore-specific parameters. - """ - - # Get child ids from the kwargs - child_ids = kwargs.get("child_ids", []) - parent_ids = kwargs.get("parent_ids", []) - relationship = kwargs.get("relationship", "HAS_CHILD") - - # Generate deterministic IDs based on hash of texts - ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() for text in texts] - - # Initialize metadata for each text if not provided - if not metadatas: - metadatas = [{} for _ in texts] - - # Define the import query with dynamic relationship type - import_query = """ + ) -> List[str]: + """Add embeddings to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + embeddings: List of list of embedding vectors. + metadatas: List of metadatas associated with the texts. + parent_ids: Optionally, a list of parent identifiers for the texts. + kwargs: Additional vectorstore-specific parameters. + """ + + # Get child ids from the kwargs + child_ids = kwargs.get("child_ids", []) + parent_ids = kwargs.get("parent_ids", []) + relationship = kwargs.get("relationship", "HAS_CHILD") + + # Generate deterministic IDs based on hash of texts + ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() + for text in texts] + + # Initialize metadata for each text if not provided + if not metadatas: + metadatas = [{} for _ in texts] + + # Define the import query with dynamic relationship type + import_query = """ UNWIND $data AS row MERGE (c:`{node_label}` {{id: row.id}}) ON CREATE SET c.{text_property} = row.text, c.{embedding_property} = row.embedding, c += row.metadata @@ -524,22 +530,21 @@ def add_embeddings( ) """.format(node_label=self.node_label, text_property=self.text_node_property, embedding_property=self.embedding_node_property, relationship=relationship) + parameters = { + "data": [ + {"text": text, "metadata": metadata, + "embedding": embedding, "id": id} + for text, metadata, embedding, id in zip( + texts, metadatas, embeddings, ids + ) + ], + "parent_ids": parent_ids, + "child_ids": child_ids, + } - parameters = { - "data": [ - {"text": text, "metadata": metadata, "embedding": embedding, "id": id} - for text, metadata, embedding, id in zip( - texts, metadatas, embeddings, ids - ) - ], - "parent_ids": parent_ids, - "child_ids": child_ids, - } - - self.query(import_query, params=parameters) - - return ids + self.query(import_query, params=parameters) + return ids def add_texts( self, @@ -634,7 +639,8 @@ def similarity_search_with_score_by_vector( self.retrieval_query if self.retrieval_query else default_retrieval ) - read_query = _get_search_index_query(self.search_type) + retrieval_query + read_query = _get_search_index_query( + self.search_type) + retrieval_query parameters = { "index": self.index_name, "k": k, @@ -922,7 +928,8 @@ def from_existing_graph( ) # FTS index for Hybrid search if search_type == SearchType.HYBRID: - fts_node_label = store.retrieve_existing_fts_index(text_node_properties) + fts_node_label = store.retrieve_existing_fts_index( + text_node_properties) # If the FTS index doesn't exist yet if not fts_node_label: store.create_new_keyword_index(text_node_properties) @@ -942,8 +949,10 @@ def from_existing_graph( "k IN $props | str + '\\n' + k + ':' + coalesce(n[k], '')) AS text " "LIMIT 1000" ) - data = store.query(fetch_query, params={"props": text_node_properties}) - text_embeddings = embedding.embed_documents([el["text"] for el in data]) + data = store.query(fetch_query, params={ + "props": text_node_properties}) + text_embeddings = embedding.embed_documents( + [el["text"] for el in data]) params = { "data": [ From e380adaf2b7d66b89dfa6f195ac80ec0b6ede0c5 Mon Sep 17 00:00:00 2001 From: abhijeet Date: Tue, 13 Feb 2024 14:46:47 -0500 Subject: [PATCH 4/5] fixing params --- libs/community/langchain_community/vectorstores/neo4j_vector.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py index d916129fdf5f4..68ab811e74456 100644 --- a/libs/community/langchain_community/vectorstores/neo4j_vector.py +++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py @@ -496,7 +496,6 @@ def add_embeddings( texts: Iterable of strings to add to the vectorstore. embeddings: List of list of embedding vectors. metadatas: List of metadatas associated with the texts. - parent_ids: Optionally, a list of parent identifiers for the texts. kwargs: Additional vectorstore-specific parameters. """ From e6d059a9b984ba0b2bcf85efe623a255264d8598 Mon Sep 17 00:00:00 2001 From: abhijeet Date: Tue, 13 Feb 2024 14:50:56 -0500 Subject: [PATCH 5/5] reverting changes --- .../vectorstores/neo4j_vector.py | 76 +++++++------------ 1 file changed, 26 insertions(+), 50 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py index 68ab811e74456..93a01e2daaa31 100644 --- a/libs/community/langchain_community/vectorstores/neo4j_vector.py +++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py @@ -4,6 +4,7 @@ import logging import os import uuid +import hashlib from typing import ( Any, Callable, @@ -14,7 +15,6 @@ Tuple, Type, ) -import hashlib from langchain_core.documents import Document from langchain_core.embeddings import Embeddings @@ -71,8 +71,7 @@ def check_if_not_null(props: List[str], values: List[Any]) -> None: """Check if the values are not None or empty string""" for prop, value in zip(props, values): if not value: - raise ValueError( - f"Parameter `{prop}` must not be None or empty string") + raise ValueError(f"Parameter `{prop}` must not be None or empty string") def sort_by_index_name( @@ -194,8 +193,7 @@ def __init__( password = get_from_env("password", "NEO4J_PASSWORD", password) database = get_from_env("database", "NEO4J_DATABASE", database) - self._driver = neo4j.GraphDatabase.driver( - url, auth=(username, password)) + self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password)) self._database = database self.schema = "" # Verify connection @@ -290,8 +288,7 @@ def verify_version(self) -> None: """ version = self.query("CALL dbms.components()")[0]["versions"][0] if "aura" in version: - version_tuple = tuple( - map(int, version.split("-")[0].split("."))) + (0,) + version_tuple = tuple(map(int, version.split("-")[0].split("."))) + (0,) else: version_tuple = tuple(map(int, version.split("."))) @@ -329,8 +326,7 @@ def retrieve_existing_index(self) -> Optional[int]: }, ) # sort by index_name - index_information = sort_by_index_name( - index_information, self.index_name) + index_information = sort_by_index_name(index_information, self.index_name) try: self.index_name = index_information[0]["name"] self.node_label = index_information[0]["labelsOrTypes"][0] @@ -369,8 +365,7 @@ def retrieve_existing_fts_index( }, ) # sort by index_name - index_information = sort_by_index_name( - index_information, self.index_name) + index_information = sort_by_index_name(index_information, self.index_name) try: self.keyword_index_name = index_information[0]["name"] self.text_node_property = index_information[0]["properties"][0] @@ -496,49 +491,34 @@ def add_embeddings( texts: Iterable of strings to add to the vectorstore. embeddings: List of list of embedding vectors. metadatas: List of metadatas associated with the texts. - kwargs: Additional vectorstore-specific parameters. + kwargs: vectorstore specific parameters """ - - # Get child ids from the kwargs - child_ids = kwargs.get("child_ids", []) - parent_ids = kwargs.get("parent_ids", []) - relationship = kwargs.get("relationship", "HAS_CHILD") - - # Generate deterministic IDs based on hash of texts - ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() + if ids is None: + ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() for text in texts] - # Initialize metadata for each text if not provided if not metadatas: metadatas = [{} for _ in texts] - # Define the import query with dynamic relationship type - import_query = """ - UNWIND $data AS row - MERGE (c:`{node_label}` {{id: row.id}}) - ON CREATE SET c.{text_property} = row.text, c.{embedding_property} = row.embedding, c += row.metadata - WITH c, row - FOREACH (parentId IN (CASE WHEN $parent_ids IS NOT NULL THEN $parent_ids ELSE [] END) | - MERGE (p:`{node_label}` {{id: parentId}}) - MERGE (p)-[:HAS_CHILD]->(c) - ) - WITH c - FOREACH (childId IN (CASE WHEN $child_ids IS NOT NULL THEN $child_ids ELSE [] END) | - MERGE (ch:`{node_label}` {{id: childId}}) - MERGE (c)-[:{relationship}]->(ch) - ) - """.format(node_label=self.node_label, text_property=self.text_node_property, embedding_property=self.embedding_node_property, relationship=relationship) + import_query = ( + "UNWIND $data AS row " + "CALL { WITH row " + f"MERGE (c:`{self.node_label}` {{id: row.id}}) " + "WITH c, row " + f"CALL db.create.setVectorProperty(c, " + f"'{self.embedding_node_property}', row.embedding) " + "YIELD node " + f"SET c.`{self.text_node_property}` = row.text " + "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS" + ) parameters = { "data": [ - {"text": text, "metadata": metadata, - "embedding": embedding, "id": id} + {"text": text, "metadata": metadata, "embedding": embedding, "id": id} for text, metadata, embedding, id in zip( texts, metadatas, embeddings, ids ) - ], - "parent_ids": parent_ids, - "child_ids": child_ids, + ] } self.query(import_query, params=parameters) @@ -638,8 +618,7 @@ def similarity_search_with_score_by_vector( self.retrieval_query if self.retrieval_query else default_retrieval ) - read_query = _get_search_index_query( - self.search_type) + retrieval_query + read_query = _get_search_index_query(self.search_type) + retrieval_query parameters = { "index": self.index_name, "k": k, @@ -927,8 +906,7 @@ def from_existing_graph( ) # FTS index for Hybrid search if search_type == SearchType.HYBRID: - fts_node_label = store.retrieve_existing_fts_index( - text_node_properties) + fts_node_label = store.retrieve_existing_fts_index(text_node_properties) # If the FTS index doesn't exist yet if not fts_node_label: store.create_new_keyword_index(text_node_properties) @@ -948,10 +926,8 @@ def from_existing_graph( "k IN $props | str + '\\n' + k + ':' + coalesce(n[k], '')) AS text " "LIMIT 1000" ) - data = store.query(fetch_query, params={ - "props": text_node_properties}) - text_embeddings = embedding.embed_documents( - [el["text"] for el in data]) + data = store.query(fetch_query, params={"props": text_node_properties}) + text_embeddings = embedding.embed_documents([el["text"] for el in data]) params = { "data": [