From 5378522257e70cd88002b2ac38d051e1fa022a5c Mon Sep 17 00:00:00 2001 From: Shukri Date: Wed, 7 Feb 2024 16:22:23 +0100 Subject: [PATCH] Add support for accessing document id (#87) Fixes langchain-ai/langchain#13238 --- langchain_weaviate/vectorstores.py | 9 +++ tests/integration_tests/test_vectorstores.py | 71 ++++++++++++++++++++ 2 files changed, 80 insertions(+) diff --git a/langchain_weaviate/vectorstores.py b/langchain_weaviate/vectorstores.py index 6011f87..f2b2f9f 100644 --- a/langchain_weaviate/vectorstores.py +++ b/langchain_weaviate/vectorstores.py @@ -230,6 +230,14 @@ def _perform_search( else: kwargs["return_metadata"] = ["score"] + if ( + "return_properties" in kwargs + and self._text_key not in kwargs["return_properties"] + ): + kwargs["return_properties"].append(self._text_key) + + return_uuids = kwargs.pop("return_uuids", False) + with self._tenant_context(tenant) as collection: try: if search_method == "hybrid": @@ -256,6 +264,7 @@ def _perform_search( **obj.properties, **filtered_metadata, **({"vector": obj.vector["default"]} if obj.vector else {}), + **({"uuid": str(obj.uuid)} if return_uuids else {}), } doc = Document(page_content=text, metadata=merged_props) if not return_score: diff --git a/tests/integration_tests/test_vectorstores.py b/tests/integration_tests/test_vectorstores.py index 4f7ee50..1115821 100644 --- a/tests/integration_tests/test_vectorstores.py +++ b/tests/integration_tests/test_vectorstores.py @@ -622,3 +622,74 @@ def test_embedding_property(weaviate_client, embedding_openai): ) assert type(docsearch.embeddings) == OpenAIEmbeddings + + +def test_documents_with_many_properties(weaviate_client, embedding_openai): + data = [ + { + "aliases": ["Big Tech Co", "Tech Giant"], + "categoryid": "101", + "name": "Tech Innovations Drive Market Surge", + "page_content": "The latest product launch by Big Tech Co " + "has exceeded expectations, " + "pushing its stock to record highs and invigorating the tech sector.", + "ticker": "BTCH", + 
}, + { + "aliases": ["Global Energy Leader", "Energy Corp"], + "categoryid": "102", + "name": "Energy Corp Announces Renewable Initiative", + "page_content": "In a bold move towards sustainability, " + "Energy Corp has unveiled plans " + "to significantly increase its investment in renewable energy sources, " + "sparking investor interest.", + "ticker": "GEL", + }, + { + "aliases": ["Pharma Pioneer", "Healthcare Innovator"], + "categoryid": "103", + "name": "Breakthrough Drug Approval", + "page_content": "Pharma Pioneer's latest drug has received FDA approval, " + "setting the stage " + "for a major shift in treatment options and a positive outlook for the " + "company's stock.", + "ticker": "PPHI", + }, + ] + + uuids = [uuid.uuid4().hex for _ in range(3)] + properties = set(data[0].keys()) + + index_name = f"TestIndex_{uuid.uuid4().hex}" + text_key = "page_content" + + # since text_key is a separate field in a LangChain Document, + # we remove it from the properties + properties.remove(text_key) + + docsearch = WeaviateVectorStore( + client=weaviate_client, + index_name=index_name, + text_key=text_key, + embedding=embedding_openai, + ) + + texts = [doc["page_content"] for doc in data] + metadatas = [{k: doc[k] for k in doc if k != "page_content"} for doc in data] + doc_ids = docsearch.add_texts(texts, metadatas=metadatas, uuids=uuids) + + weaviate_client.collections.get(index_name).query.fetch_object_by_id(doc_ids[0]) + + # by default, all the properties are returned + doc = docsearch.similarity_search("foo", k=1)[0] + assert set(doc.metadata.keys()) == properties + + # you can also specify which properties to return + doc = docsearch.similarity_search("foo", k=1, return_properties=["ticker"])[0] + assert set(doc.metadata.keys()) == {"ticker"} + + # uuids are only included in the metadata when return_uuids=True is passed + doc = docsearch.similarity_search( + "foo", k=1, return_uuids=True, return_properties=["ticker", "categoryid"] + )[0] + assert set(doc.metadata.keys()) == {"uuid", 
"ticker", "categoryid"}