From 11c1867032ebb7b245bce6ed917aedf054d21970 Mon Sep 17 00:00:00 2001
From: hsm207
Date: Wed, 7 Feb 2024 15:15:06 +0000
Subject: [PATCH 1/2] allow return subset of properties

---
Review notes (this section is ignored by `git am`):
* Line breaks and indentation were restored from a flattened copy of this
  series. The indentation of the context lines in vectorstores.py is a best
  guess -- TODO confirm against the tree, or apply with
  `git am --ignore-whitespace` if a hunk is rejected.
* `_perform_search` makes sure the text key is part of `return_properties`
  whenever the caller supplies that keyword. It now does so on a new list
  instead of appending to the caller's list, so a list object reused across
  calls is left unmodified.
* The blob ids on the `index` lines predate the edit above.

 langchain_weaviate/vectorstores.py           |  6 ++
 tests/integration_tests/test_vectorstores.py | 65 ++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/langchain_weaviate/vectorstores.py b/langchain_weaviate/vectorstores.py
index 6011f87..4e6c87f 100644
--- a/langchain_weaviate/vectorstores.py
+++ b/langchain_weaviate/vectorstores.py
@@ -230,6 +230,12 @@ def _perform_search(
         else:
             kwargs["return_metadata"] = ["score"]
 
+        if (
+            "return_properties" in kwargs
+            and self._text_key not in kwargs["return_properties"]
+        ):
+            kwargs["return_properties"] = kwargs["return_properties"] + [self._text_key]
+
         with self._tenant_context(tenant) as collection:
             try:
                 if search_method == "hybrid":
diff --git a/tests/integration_tests/test_vectorstores.py b/tests/integration_tests/test_vectorstores.py
index 4f7ee50..00a572b 100644
--- a/tests/integration_tests/test_vectorstores.py
+++ b/tests/integration_tests/test_vectorstores.py
@@ -622,3 +622,68 @@ def test_embedding_property(weaviate_client, embedding_openai):
     )
 
     assert type(docsearch.embeddings) == OpenAIEmbeddings
+
+
+def test_documents_with_many_properties(weaviate_client, embedding_openai):
+    data = [
+        {
+            "aliases": ["Big Tech Co", "Tech Giant"],
+            "categoryid": "101",
+            "name": "Tech Innovations Drive Market Surge",
+            "page_content": "The latest product launch by Big Tech Co "
+            "has exceeded expectations, "
+            "pushing its stock to record highs and invigorating the tech sector.",
+            "ticker": "BTCH",
+        },
+        {
+            "aliases": ["Global Energy Leader", "Energy Corp"],
+            "categoryid": "102",
+            "name": "Energy Corp Announces Renewable Initiative",
+            "page_content": "In a bold move towards sustainability, "
+            "Energy Corp has unveiled plans "
+            "to significantly increase its investment in renewable energy sources, "
+            "sparking investor interest.",
+            "ticker": "GEL",
+        },
+        {
+            "aliases": ["Pharma Pioneer", "Healthcare Innovator"],
+            "categoryid": "103",
+            "name": "Breakthrough Drug Approval",
+            "page_content": "Pharma Pioneer's latest drug has received FDA approval, "
+            "setting the stage "
+            "for a major shift in treatment options and a positive outlook for the "
+            "company's stock.",
+            "ticker": "PPHI",
+        },
+    ]
+
+    uuids = [uuid.uuid4().hex for _ in range(3)]
+    properties = set(data[0].keys())
+
+    index_name = f"TestIndex_{uuid.uuid4().hex}"
+    text_key = "page_content"
+
+    # since text_key is a separate field in a LangChain Document,
+    # we remove it from the properties
+    properties.remove(text_key)
+
+    docsearch = WeaviateVectorStore(
+        client=weaviate_client,
+        index_name=index_name,
+        text_key=text_key,
+        embedding=embedding_openai,
+    )
+
+    texts = [doc["page_content"] for doc in data]
+    metadatas = [{k: doc[k] for k in doc if k != "page_content"} for doc in data]
+    doc_ids = docsearch.add_texts(texts, metadatas=metadatas, uuids=uuids)
+
+    weaviate_client.collections.get(index_name).query.fetch_object_by_id(doc_ids[0])
+
+    # by default, all the properties are returned
+    doc = docsearch.similarity_search("foo", k=1)[0]
+    assert set(doc.metadata.keys()) == properties
+
+    # you can also specify which properties to return
+    doc = docsearch.similarity_search("foo", k=1, return_properties=["ticker"])[0]
+    assert set(doc.metadata.keys()) == {"ticker"}

From 6c68c9aed3dd352ebb3b6a62e67a6fc1981316b4 Mon Sep 17 00:00:00 2001
From: hsm207
Date: Wed, 7 Feb 2024 15:15:46 +0000
Subject: [PATCH 2/2] support returning doc's uuid

---
Review notes (this section is ignored by `git am`):
* `return_uuids` is popped from kwargs ahead of the tenant-scoped query
  block; when it is truthy, each returned document's metadata gains a
  "uuid" entry holding `str(obj.uuid)`.
* The test comment used to say that returning uuids "requires a different
  method", but the test calls the same `similarity_search` with
  `return_uuids=True`; the comment now says so.
* Context-line indentation in vectorstores.py is a best guess (see the notes
  on patch 1/2), and the first hunk's context carries the non-mutating
  `return_properties` line introduced there.

 langchain_weaviate/vectorstores.py           | 3 +++
 tests/integration_tests/test_vectorstores.py | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/langchain_weaviate/vectorstores.py b/langchain_weaviate/vectorstores.py
index 4e6c87f..f2b2f9f 100644
--- a/langchain_weaviate/vectorstores.py
+++ b/langchain_weaviate/vectorstores.py
@@ -236,6 +236,8 @@ def _perform_search(
         ):
             kwargs["return_properties"] = kwargs["return_properties"] + [self._text_key]
 
+        return_uuids = kwargs.pop("return_uuids", False)
+
         with self._tenant_context(tenant) as collection:
             try:
                 if search_method == "hybrid":
@@ -262,6 +264,7 @@ def _perform_search(
                 **obj.properties,
                 **filtered_metadata,
                 **({"vector": obj.vector["default"]} if obj.vector else {}),
+                **({"uuid": str(obj.uuid)} if return_uuids else {}),
             }
             doc = Document(page_content=text, metadata=merged_props)
             if not return_score:
diff --git a/tests/integration_tests/test_vectorstores.py b/tests/integration_tests/test_vectorstores.py
index 00a572b..1115821 100644
--- a/tests/integration_tests/test_vectorstores.py
+++ b/tests/integration_tests/test_vectorstores.py
@@ -687,3 +687,9 @@ def test_documents_with_many_properties(weaviate_client, embedding_openai):
     # you can also specify which properties to return
     doc = docsearch.similarity_search("foo", k=1, return_properties=["ticker"])[0]
     assert set(doc.metadata.keys()) == {"ticker"}
+
+    # pass return_uuids=True to include each document's uuid in its metadata
+    doc = docsearch.similarity_search(
+        "foo", k=1, return_uuids=True, return_properties=["ticker", "categoryid"]
+    )[0]
+    assert set(doc.metadata.keys()) == {"uuid", "ticker", "categoryid"}