From 62076c30dd00278573538991293059b130ec46ab Mon Sep 17 00:00:00 2001
From: abhijeet <abhijeetmalamkar@gmail.com>
Date: Sun, 11 Feb 2024 12:38:44 -0500
Subject: [PATCH 1/5] neo4j: MERGE nodes on the text property to avoid
 duplicate nodes

---
 .../langchain_community/vectorstores/neo4j_vector.py          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py
index bb8f1b9b30136..125a4d5f59b8b 100644
--- a/libs/community/langchain_community/vectorstores/neo4j_vector.py
+++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py
@@ -501,12 +501,12 @@ def add_embeddings(
         import_query = (
             "UNWIND $data AS row "
             "CALL { WITH row "
-            f"MERGE (c:`{self.node_label}` {{id: row.id}}) "
+            f"MERGE (c:`{self.node_label}` {{{self.text_node_property}: row.text}}) "
             "WITH c, row "
             f"CALL db.create.setVectorProperty(c, "
             f"'{self.embedding_node_property}', row.embedding) "
             "YIELD node "
-            f"SET c.`{self.text_node_property}` = row.text "
+            "SET c.id = row.id "
             "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS"
         )
 

From c12017d9c582b2e208aa7195190873b37a046c86 Mon Sep 17 00:00:00 2001
From: abhijeet <abhijeetmalamkar@gmail.com>
Date: Mon, 12 Feb 2024 17:35:02 -0500
Subject: [PATCH 2/5] use SHA-256 hash of text as node id to avoid duplicates;
 add relationships to parent and child nodes

---
 .../vectorstores/neo4j_vector.py              | 88 +++++++++++--------
 1 file changed, 53 insertions(+), 35 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py
index 125a4d5f59b8b..e80e6ad4c8245 100644
--- a/libs/community/langchain_community/vectorstores/neo4j_vector.py
+++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py
@@ -14,6 +14,7 @@
     Tuple,
     Type,
 )
+import hashlib
 
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
@@ -481,47 +482,64 @@ def add_embeddings(
         texts: Iterable[str],
         embeddings: List[List[float]],
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        parent_ids: Optional[List[str]] = None,  # New argument for parent IDs
         **kwargs: Any,
-    ) -> List[str]:
-        """Add embeddings to the vectorstore.
+        ) -> List[str]:
+            """Add embeddings to the vectorstore.
+
+            Args:
+                texts: Iterable of strings to add to the vectorstore.
+                embeddings: List of list of embedding vectors.
+                metadatas: List of metadatas associated with the texts.
+                parent_ids: Optionally, a list of parent identifiers for the texts.
+                kwargs: Additional vectorstore-specific parameters.
+            """
+
+            # Get child ids from the kwargs
+            child_ids = kwargs.get("child_ids", [])
+            parent_ids = kwargs.get("parent_ids", [])
+            relationship = kwargs.get("relationship", "HAS_CHILD")
+
+            # Generate deterministic IDs based on hash of texts
+            ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() for text in texts]
+
+            # Initialize metadata for each text if not provided
+            if not metadatas:
+                metadatas = [{} for _ in texts]
+
+            # Define the import query with dynamic relationship type
+            import_query = """
+            UNWIND $data AS row
+            MERGE (c:`{node_label}` {{id: row.id}})
+            ON CREATE SET c.{text_property} = row.text, c.{embedding_property} = row.embedding, c += row.metadata
+            WITH c, row
+            FOREACH (parentId IN (CASE WHEN $parent_ids IS NOT NULL THEN $parent_ids ELSE [] END) |
+                MERGE (p:`{node_label}` {{id: parentId}})
+                MERGE (p)-[:HAS_CHILD]->(c)
+            )
+            WITH c
+            FOREACH (childId IN (CASE WHEN $child_ids IS NOT NULL THEN $child_ids ELSE [] END) |
+                MERGE (ch:`{node_label}` {{id: childId}})
+                MERGE (c)-[:{relationship}]->(ch)
+            )
+            """.format(node_label=self.node_label, text_property=self.text_node_property, embedding_property=self.embedding_node_property, relationship=relationship)
 
-        Args:
-            texts: Iterable of strings to add to the vectorstore.
-            embeddings: List of list of embedding vectors.
-            metadatas: List of metadatas associated with the texts.
-            kwargs: vectorstore specific parameters
-        """
-        if ids is None:
-            ids = [str(uuid.uuid1()) for _ in texts]
 
-        if not metadatas:
-            metadatas = [{} for _ in texts]
-
-        import_query = (
-            "UNWIND $data AS row "
-            "CALL { WITH row "
-            f"MERGE (c:`{self.node_label}` {{{self.text_node_property}: row.text}}) "
-            "WITH c, row "
-            f"CALL db.create.setVectorProperty(c, "
-            f"'{self.embedding_node_property}', row.embedding) "
-            "YIELD node "
-            "SET c.id = row.id "
-            "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS"
-        )
+            parameters = {
+                "data": [
+                    {"text": text, "metadata": metadata, "embedding": embedding, "id": id}
+                    for text, metadata, embedding, id in zip(
+                        texts, metadatas, embeddings, ids
+                    )
+                ],
+                "parent_ids": parent_ids,
+                "child_ids": child_ids,
+            }
 
-        parameters = {
-            "data": [
-                {"text": text, "metadata": metadata, "embedding": embedding, "id": id}
-                for text, metadata, embedding, id in zip(
-                    texts, metadatas, embeddings, ids
-                )
-            ]
-        }
+            self.query(import_query, params=parameters)
 
-        self.query(import_query, params=parameters)
+            return ids
 
-        return ids
 
     def add_texts(
         self,

From 63ce9dfe223942e8c50279272ff9aaabe7ac2189 Mon Sep 17 00:00:00 2001
From: abhijeet <abhijeetmalamkar@gmail.com>
Date: Mon, 12 Feb 2024 17:38:48 -0500
Subject: [PATCH 3/5] restore ids parameter, fix add_embeddings indentation,
 and re-wrap long lines

---
 .../vectorstores/neo4j_vector.py              | 107 ++++++++++--------
 1 file changed, 58 insertions(+), 49 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py
index e80e6ad4c8245..d916129fdf5f4 100644
--- a/libs/community/langchain_community/vectorstores/neo4j_vector.py
+++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py
@@ -71,7 +71,8 @@ def check_if_not_null(props: List[str], values: List[Any]) -> None:
     """Check if the values are not None or empty string"""
     for prop, value in zip(props, values):
         if not value:
-            raise ValueError(f"Parameter `{prop}` must not be None or empty string")
+            raise ValueError(
+                f"Parameter `{prop}` must not be None or empty string")
 
 
 def sort_by_index_name(
@@ -193,7 +194,8 @@ def __init__(
         password = get_from_env("password", "NEO4J_PASSWORD", password)
         database = get_from_env("database", "NEO4J_DATABASE", database)
 
-        self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password))
+        self._driver = neo4j.GraphDatabase.driver(
+            url, auth=(username, password))
         self._database = database
         self.schema = ""
         # Verify connection
@@ -288,7 +290,8 @@ def verify_version(self) -> None:
         """
         version = self.query("CALL dbms.components()")[0]["versions"][0]
         if "aura" in version:
-            version_tuple = tuple(map(int, version.split("-")[0].split("."))) + (0,)
+            version_tuple = tuple(
+                map(int, version.split("-")[0].split("."))) + (0,)
         else:
             version_tuple = tuple(map(int, version.split(".")))
 
@@ -326,7 +329,8 @@ def retrieve_existing_index(self) -> Optional[int]:
             },
         )
         # sort by index_name
-        index_information = sort_by_index_name(index_information, self.index_name)
+        index_information = sort_by_index_name(
+            index_information, self.index_name)
         try:
             self.index_name = index_information[0]["name"]
             self.node_label = index_information[0]["labelsOrTypes"][0]
@@ -365,7 +369,8 @@ def retrieve_existing_fts_index(
             },
         )
         # sort by index_name
-        index_information = sort_by_index_name(index_information, self.index_name)
+        index_information = sort_by_index_name(
+            index_information, self.index_name)
         try:
             self.keyword_index_name = index_information[0]["name"]
             self.text_node_property = index_information[0]["properties"][0]
@@ -482,33 +487,34 @@ def add_embeddings(
         texts: Iterable[str],
         embeddings: List[List[float]],
         metadatas: Optional[List[dict]] = None,
-        parent_ids: Optional[List[str]] = None,  # New argument for parent IDs
+        ids: Optional[List[str]] = None,
         **kwargs: Any,
-        ) -> List[str]:
-            """Add embeddings to the vectorstore.
-
-            Args:
-                texts: Iterable of strings to add to the vectorstore.
-                embeddings: List of list of embedding vectors.
-                metadatas: List of metadatas associated with the texts.
-                parent_ids: Optionally, a list of parent identifiers for the texts.
-                kwargs: Additional vectorstore-specific parameters.
-            """
-
-            # Get child ids from the kwargs
-            child_ids = kwargs.get("child_ids", [])
-            parent_ids = kwargs.get("parent_ids", [])
-            relationship = kwargs.get("relationship", "HAS_CHILD")
-
-            # Generate deterministic IDs based on hash of texts
-            ids = [hashlib.sha256(text.encode('utf-8')).hexdigest() for text in texts]
-
-            # Initialize metadata for each text if not provided
-            if not metadatas:
-                metadatas = [{} for _ in texts]
-
-            # Define the import query with dynamic relationship type
-            import_query = """
+    ) -> List[str]:
+        """Add embeddings to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            embeddings: List of list of embedding vectors.
+            metadatas: List of metadatas associated with the texts.
+            parent_ids: Optionally, a list of parent identifiers for the texts.
+            kwargs: Additional vectorstore-specific parameters.
+        """
+
+        # Get child ids from the kwargs
+        child_ids = kwargs.get("child_ids", [])
+        parent_ids = kwargs.get("parent_ids", [])
+        relationship = kwargs.get("relationship", "HAS_CHILD")
+
+        # Generate deterministic IDs based on hash of texts
+        ids = [hashlib.sha256(text.encode('utf-8')).hexdigest()
+               for text in texts]
+
+        # Initialize metadata for each text if not provided
+        if not metadatas:
+            metadatas = [{} for _ in texts]
+
+        # Define the import query with dynamic relationship type
+        import_query = """
             UNWIND $data AS row
             MERGE (c:`{node_label}` {{id: row.id}})
             ON CREATE SET c.{text_property} = row.text, c.{embedding_property} = row.embedding, c += row.metadata
@@ -524,22 +530,21 @@ def add_embeddings(
             )
             """.format(node_label=self.node_label, text_property=self.text_node_property, embedding_property=self.embedding_node_property, relationship=relationship)
 
+        parameters = {
+            "data": [
+                {"text": text, "metadata": metadata,
+                    "embedding": embedding, "id": id}
+                for text, metadata, embedding, id in zip(
+                    texts, metadatas, embeddings, ids
+                )
+            ],
+            "parent_ids": parent_ids,
+            "child_ids": child_ids,
+        }
 
-            parameters = {
-                "data": [
-                    {"text": text, "metadata": metadata, "embedding": embedding, "id": id}
-                    for text, metadata, embedding, id in zip(
-                        texts, metadatas, embeddings, ids
-                    )
-                ],
-                "parent_ids": parent_ids,
-                "child_ids": child_ids,
-            }
-
-            self.query(import_query, params=parameters)
-
-            return ids
+        self.query(import_query, params=parameters)
 
+        return ids
 
     def add_texts(
         self,
@@ -634,7 +639,8 @@ def similarity_search_with_score_by_vector(
             self.retrieval_query if self.retrieval_query else default_retrieval
         )
 
-        read_query = _get_search_index_query(self.search_type) + retrieval_query
+        read_query = _get_search_index_query(
+            self.search_type) + retrieval_query
         parameters = {
             "index": self.index_name,
             "k": k,
@@ -922,7 +928,8 @@ def from_existing_graph(
             )
         # FTS index for Hybrid search
         if search_type == SearchType.HYBRID:
-            fts_node_label = store.retrieve_existing_fts_index(text_node_properties)
+            fts_node_label = store.retrieve_existing_fts_index(
+                text_node_properties)
             # If the FTS index doesn't exist yet
             if not fts_node_label:
                 store.create_new_keyword_index(text_node_properties)
@@ -942,8 +949,10 @@ def from_existing_graph(
                 "k IN $props | str + '\\n' + k + ':' + coalesce(n[k], '')) AS text "
                 "LIMIT 1000"
             )
-            data = store.query(fetch_query, params={"props": text_node_properties})
-            text_embeddings = embedding.embed_documents([el["text"] for el in data])
+            data = store.query(fetch_query, params={
+                               "props": text_node_properties})
+            text_embeddings = embedding.embed_documents(
+                [el["text"] for el in data])
 
             params = {
                 "data": [

From e380adaf2b7d66b89dfa6f195ac80ec0b6ede0c5 Mon Sep 17 00:00:00 2001
From: abhijeet <abhijeetmalamkar@gmail.com>
Date: Tue, 13 Feb 2024 14:46:47 -0500
Subject: [PATCH 4/5] remove stale parent_ids entry from add_embeddings docstring

---
 libs/community/langchain_community/vectorstores/neo4j_vector.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py
index d916129fdf5f4..68ab811e74456 100644
--- a/libs/community/langchain_community/vectorstores/neo4j_vector.py
+++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py
@@ -496,7 +496,6 @@ def add_embeddings(
             texts: Iterable of strings to add to the vectorstore.
             embeddings: List of list of embedding vectors.
             metadatas: List of metadatas associated with the texts.
-            parent_ids: Optionally, a list of parent identifiers for the texts.
             kwargs: Additional vectorstore-specific parameters.
         """
 

From e6d059a9b984ba0b2bcf85efe623a255264d8598 Mon Sep 17 00:00:00 2001
From: abhijeet <abhijeetmalamkar@gmail.com>
Date: Tue, 13 Feb 2024 14:50:56 -0500
Subject: [PATCH 5/5] revert relationship and formatting changes; keep SHA-256
 default ids

---
 .../vectorstores/neo4j_vector.py              | 76 +++++++------------
 1 file changed, 26 insertions(+), 50 deletions(-)

diff --git a/libs/community/langchain_community/vectorstores/neo4j_vector.py b/libs/community/langchain_community/vectorstores/neo4j_vector.py
index 68ab811e74456..93a01e2daaa31 100644
--- a/libs/community/langchain_community/vectorstores/neo4j_vector.py
+++ b/libs/community/langchain_community/vectorstores/neo4j_vector.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import uuid
+import hashlib
 from typing import (
     Any,
     Callable,
@@ -14,7 +15,6 @@
     Tuple,
     Type,
 )
-import hashlib
 
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
@@ -71,8 +71,7 @@ def check_if_not_null(props: List[str], values: List[Any]) -> None:
     """Check if the values are not None or empty string"""
     for prop, value in zip(props, values):
         if not value:
-            raise ValueError(
-                f"Parameter `{prop}` must not be None or empty string")
+            raise ValueError(f"Parameter `{prop}` must not be None or empty string")
 
 
 def sort_by_index_name(
@@ -194,8 +193,7 @@ def __init__(
         password = get_from_env("password", "NEO4J_PASSWORD", password)
         database = get_from_env("database", "NEO4J_DATABASE", database)
 
-        self._driver = neo4j.GraphDatabase.driver(
-            url, auth=(username, password))
+        self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password))
         self._database = database
         self.schema = ""
         # Verify connection
@@ -290,8 +288,7 @@ def verify_version(self) -> None:
         """
         version = self.query("CALL dbms.components()")[0]["versions"][0]
         if "aura" in version:
-            version_tuple = tuple(
-                map(int, version.split("-")[0].split("."))) + (0,)
+            version_tuple = tuple(map(int, version.split("-")[0].split("."))) + (0,)
         else:
             version_tuple = tuple(map(int, version.split(".")))
 
@@ -329,8 +326,7 @@ def retrieve_existing_index(self) -> Optional[int]:
             },
         )
         # sort by index_name
-        index_information = sort_by_index_name(
-            index_information, self.index_name)
+        index_information = sort_by_index_name(index_information, self.index_name)
         try:
             self.index_name = index_information[0]["name"]
             self.node_label = index_information[0]["labelsOrTypes"][0]
@@ -369,8 +365,7 @@ def retrieve_existing_fts_index(
             },
         )
         # sort by index_name
-        index_information = sort_by_index_name(
-            index_information, self.index_name)
+        index_information = sort_by_index_name(index_information, self.index_name)
         try:
             self.keyword_index_name = index_information[0]["name"]
             self.text_node_property = index_information[0]["properties"][0]
@@ -496,49 +491,34 @@ def add_embeddings(
             texts: Iterable of strings to add to the vectorstore.
             embeddings: List of list of embedding vectors.
             metadatas: List of metadatas associated with the texts.
-            kwargs: Additional vectorstore-specific parameters.
+            kwargs: vectorstore specific parameters
         """
-
-        # Get child ids from the kwargs
-        child_ids = kwargs.get("child_ids", [])
-        parent_ids = kwargs.get("parent_ids", [])
-        relationship = kwargs.get("relationship", "HAS_CHILD")
-
-        # Generate deterministic IDs based on hash of texts
-        ids = [hashlib.sha256(text.encode('utf-8')).hexdigest()
+        if ids is None:
+            ids = [hashlib.sha256(text.encode('utf-8')).hexdigest()
                for text in texts]
 
-        # Initialize metadata for each text if not provided
         if not metadatas:
             metadatas = [{} for _ in texts]
 
-        # Define the import query with dynamic relationship type
-        import_query = """
-            UNWIND $data AS row
-            MERGE (c:`{node_label}` {{id: row.id}})
-            ON CREATE SET c.{text_property} = row.text, c.{embedding_property} = row.embedding, c += row.metadata
-            WITH c, row
-            FOREACH (parentId IN (CASE WHEN $parent_ids IS NOT NULL THEN $parent_ids ELSE [] END) |
-                MERGE (p:`{node_label}` {{id: parentId}})
-                MERGE (p)-[:HAS_CHILD]->(c)
-            )
-            WITH c
-            FOREACH (childId IN (CASE WHEN $child_ids IS NOT NULL THEN $child_ids ELSE [] END) |
-                MERGE (ch:`{node_label}` {{id: childId}})
-                MERGE (c)-[:{relationship}]->(ch)
-            )
-            """.format(node_label=self.node_label, text_property=self.text_node_property, embedding_property=self.embedding_node_property, relationship=relationship)
+        import_query = (
+            "UNWIND $data AS row "
+            "CALL { WITH row "
+            f"MERGE (c:`{self.node_label}` {{id: row.id}}) "
+            "WITH c, row "
+            f"CALL db.create.setVectorProperty(c, "
+            f"'{self.embedding_node_property}', row.embedding) "
+            "YIELD node "
+            f"SET c.`{self.text_node_property}` = row.text "
+            "SET c += row.metadata } IN TRANSACTIONS OF 1000 ROWS"
+        )
 
         parameters = {
             "data": [
-                {"text": text, "metadata": metadata,
-                    "embedding": embedding, "id": id}
+                {"text": text, "metadata": metadata, "embedding": embedding, "id": id}
                 for text, metadata, embedding, id in zip(
                     texts, metadatas, embeddings, ids
                 )
-            ],
-            "parent_ids": parent_ids,
-            "child_ids": child_ids,
+            ]
         }
 
         self.query(import_query, params=parameters)
@@ -638,8 +618,7 @@ def similarity_search_with_score_by_vector(
             self.retrieval_query if self.retrieval_query else default_retrieval
         )
 
-        read_query = _get_search_index_query(
-            self.search_type) + retrieval_query
+        read_query = _get_search_index_query(self.search_type) + retrieval_query
         parameters = {
             "index": self.index_name,
             "k": k,
@@ -927,8 +906,7 @@ def from_existing_graph(
             )
         # FTS index for Hybrid search
         if search_type == SearchType.HYBRID:
-            fts_node_label = store.retrieve_existing_fts_index(
-                text_node_properties)
+            fts_node_label = store.retrieve_existing_fts_index(text_node_properties)
             # If the FTS index doesn't exist yet
             if not fts_node_label:
                 store.create_new_keyword_index(text_node_properties)
@@ -948,10 +926,8 @@ def from_existing_graph(
                 "k IN $props | str + '\\n' + k + ':' + coalesce(n[k], '')) AS text "
                 "LIMIT 1000"
             )
-            data = store.query(fetch_query, params={
-                               "props": text_node_properties})
-            text_embeddings = embedding.embed_documents(
-                [el["text"] for el in data])
+            data = store.query(fetch_query, params={"props": text_node_properties})
+            text_embeddings = embedding.embed_documents([el["text"] for el in data])
 
             params = {
                 "data": [