Skip to content

Commit

Permalink
Add support for accessing document id (#87)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsm207 authored Feb 7, 2024
1 parent 2a5e71b commit 5378522
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
9 changes: 9 additions & 0 deletions langchain_weaviate/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,14 @@ def _perform_search(
else:
kwargs["return_metadata"] = ["score"]

if (
"return_properties" in kwargs
and self._text_key not in kwargs["return_properties"]
):
kwargs["return_properties"].append(self._text_key)

return_uuids = kwargs.pop("return_uuids", False)

with self._tenant_context(tenant) as collection:
try:
if search_method == "hybrid":
Expand All @@ -256,6 +264,7 @@ def _perform_search(
**obj.properties,
**filtered_metadata,
**({"vector": obj.vector["default"]} if obj.vector else {}),
**({"uuid": str(obj.uuid)} if return_uuids else {}),
}
doc = Document(page_content=text, metadata=merged_props)
if not return_score:
Expand Down
71 changes: 71 additions & 0 deletions tests/integration_tests/test_vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,3 +622,74 @@ def test_embedding_property(weaviate_client, embedding_openai):
)

assert type(docsearch.embeddings) == OpenAIEmbeddings


def test_documents_with_many_properties(weaviate_client, embedding_openai):
    """Check which metadata keys a search returns for multi-property documents.

    Three records with several properties each are stored, then queried three
    ways: with defaults, with an explicit ``return_properties`` list, and with
    ``return_uuids=True`` combined with ``return_properties``.
    """
    records = [
        {
            "aliases": ["Big Tech Co", "Tech Giant"],
            "categoryid": "101",
            "name": "Tech Innovations Drive Market Surge",
            "page_content": "The latest product launch by Big Tech Co "
            "has exceeded expectations, "
            "pushing its stock to record highs and invigorating the tech sector.",
            "ticker": "BTCH",
        },
        {
            "aliases": ["Global Energy Leader", "Energy Corp"],
            "categoryid": "102",
            "name": "Energy Corp Announces Renewable Initiative",
            "page_content": "In a bold move towards sustainability, "
            "Energy Corp has unveiled plans "
            "to significantly increase its investment in renewable energy sources, "
            "sparking investor interest.",
            "ticker": "GEL",
        },
        {
            "aliases": ["Pharma Pioneer", "Healthcare Innovator"],
            "categoryid": "103",
            "name": "Breakthrough Drug Approval",
            "page_content": "Pharma Pioneer's latest drug has received FDA approval, "
            "setting the stage "
            "for a major shift in treatment options and a positive outlook for the "
            "company's stock.",
            "ticker": "PPHI",
        },
    ]

    object_ids = [uuid.uuid4().hex for _ in range(3)]
    index_name = f"TestIndex_{uuid.uuid4().hex}"
    text_key = "page_content"

    # The text key becomes Document.page_content rather than a metadata entry,
    # so it is excluded from the set of metadata keys we expect back.
    expected_keys = set(records[0]) - {text_key}

    store = WeaviateVectorStore(
        client=weaviate_client,
        index_name=index_name,
        text_key=text_key,
        embedding=embedding_openai,
    )

    texts = []
    metadatas = []
    for record in records:
        texts.append(record["page_content"])
        metadatas.append(
            {key: value for key, value in record.items() if key != "page_content"}
        )
    stored_ids = store.add_texts(texts, metadatas=metadatas, uuids=object_ids)

    # The return value is discarded, so this only confirms that looking up the
    # first stored id through the raw client does not raise.
    collection = weaviate_client.collections.get(index_name)
    collection.query.fetch_object_by_id(stored_ids[0])

    # Without return_properties, every non-text property shows up in metadata.
    hit = store.similarity_search("foo", k=1)[0]
    assert set(hit.metadata.keys()) == expected_keys

    # return_properties narrows the metadata to the requested properties.
    hit = store.similarity_search("foo", k=1, return_properties=["ticker"])[0]
    assert set(hit.metadata.keys()) == {"ticker"}

    # return_uuids=True adds a "uuid" entry next to the requested properties.
    hit = store.similarity_search(
        "foo", k=1, return_uuids=True, return_properties=["ticker", "categoryid"]
    )[0]
    assert set(hit.metadata.keys()) == {"uuid", "ticker", "categoryid"}

0 comments on commit 5378522

Please sign in to comment.