From f265d654ee6fafd1f33c808b1207d14958296614 Mon Sep 17 00:00:00 2001 From: Martin Kolb Date: Tue, 2 Apr 2024 17:32:13 +0200 Subject: [PATCH] Improve insert performance Instead of inserting every single vector into the database using the "execute" API, we now use "executemany" for bulk-insertion. --- .../vectorstores/hanavector.py | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/hanavector.py b/libs/community/langchain_community/vectorstores/hanavector.py index cc8822c888e20..8188c8429e66c 100644 --- a/libs/community/langchain_community/vectorstores/hanavector.py +++ b/libs/community/langchain_community/vectorstores/hanavector.py @@ -216,30 +216,32 @@ def add_texts( # type: ignore[override] if embeddings is None: embeddings = self.embedding.embed_documents(list(texts)) - cur = self.connection.cursor() - try: - # Insert data into the table - for i, text in enumerate(texts): - # Use provided values by default or fallback - metadata = metadatas[i] if metadatas else {} - embedding = ( - embeddings[i] - if embeddings - else self.embedding.embed_documents([text])[0] - ) - sql_str = ( - f'INSERT INTO "{self.table_name}" ("{self.content_column}", ' - f'"{self.metadata_column}", "{self.vector_column}") ' - f"VALUES (?, ?, TO_REAL_VECTOR (?));" - ) - cur.execute( - sql_str, - ( - text, - json.dumps(HanaDB._sanitize_metadata_keys(metadata)), - f"[{','.join(map(str, embedding))}]", - ), + sql_params = [] + for i, text in enumerate(texts): + # Use provided values by default or fallback + metadata = metadatas[i] if metadatas else {} + embedding = ( + embeddings[i] + if embeddings + else self.embedding.embed_documents([text])[0] + ) + sql_params.append( + ( + text, + json.dumps(HanaDB._sanitize_metadata_keys(metadata)), + f"[{','.join(map(str, embedding))}]", ) + ) + + # Insert data into the table + try: + cur = self.connection.cursor() + sql_str = ( + f'INSERT INTO "{self.table_name}" ("{self.content_column}", ' + f'"{self.metadata_column}", "{self.vector_column}") ' + f"VALUES (?, ?, TO_REAL_VECTOR (?));" + ) + cur.executemany(sql_str, sql_params) finally: cur.close() return []