Skip to content

Commit

Permalink
Merge branch 'main' into feat-vertexai-streaming
Browse files Browse the repository at this point in the history
  • Loading branch information
Amnah199 authored Aug 21, 2024
2 parents 7acaf5a + b3a9723 commit 66bbcee
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@


# Serverless index spec targeting AWS us-east-1.
# NOTE(review): presumably the fallback used when the caller supplies no spec — confirm at the usage site.
DEFAULT_STARTER_PLAN_SPEC = {"serverless": {"region": "us-east-1", "cloud": "aws"}}
# Scalar types accepted as metadata values; passed as the class tuple to isinstance().
METADATA_SUPPORTED_TYPES = str, int, bool, float  # List[str] is supported and checked separately


class PineconeDocumentStore:
Expand Down Expand Up @@ -295,6 +296,37 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li

return documents

@staticmethod
def _discard_invalid_meta(document: Document) -> Document:
    """
    Remove metadata fields with unsupported types from the document.

    A metadata value is kept when it is an instance of one of
    `METADATA_SUPPORTED_TYPES` (str, int, bool, float) or a list whose items
    are all strings (an empty list qualifies). Every other field is dropped,
    and a single warning naming the dropped keys is logged.

    The document is modified in place: `document.meta` is replaced by the
    filtered dictionary. The same instance is also returned for convenience,
    so callers may either use the return value or rely on the mutation.

    :param document: The document whose metadata should be sanitized.
    :returns: The same document instance, with only supported metadata left.
    """

    def valid_type(value: Any) -> bool:
        return isinstance(value, METADATA_SUPPORTED_TYPES) or (
            isinstance(value, list) and all(isinstance(i, str) for i in value)
        )

    # Nothing to filter for missing or empty metadata.
    if not document.meta:
        return document

    new_meta = {key: value for key, value in document.meta.items() if valid_type(value)}
    discarded_keys = [key for key in document.meta if key not in new_meta]

    if discarded_keys:
        # The listed types must mirror METADATA_SUPPORTED_TYPES plus the List[str] special case.
        msg = (
            f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. "
            "Only str, int, bool, float, and List[str] are supported. "
            "The values of these fields will be discarded."
        )
        logger.warning(msg)

    document.meta = new_meta

    return document

def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
documents_for_pinecone = []
for document in documents:
Expand All @@ -305,6 +337,10 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li
"A dummy embedding will be used, but this can affect the search results. "
)
embedding = self._dummy_vector

if document.meta:
self._discard_invalid_meta(document)

doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)}

# we save content/dataframe as metadata
Expand Down
36 changes: 36 additions & 0 deletions integrations/pinecone/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,42 @@ def test_convert_dict_spec_to_pinecone_object_fail():
PineconeDocumentStore._convert_dict_spec_to_pinecone_object(dict_spec)


def test_discard_invalid_meta_invalid():
    """Metadata fields with unsupported value types are dropped; supported ones are preserved."""
    supported_fields = {
        "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
        "page_number": 1,
        "split_id": 0,
        "split_idx_start": 0,
    }
    # A list of dicts is neither a supported scalar nor a list of strings, so it must be removed.
    split_overlap = [
        {"doc_id": "68ed48ba830048c5d7815874ed2de794722e6d10866b6c55349a914fd9a0df65", "range": (0, 20)}
    ]
    doc = Document(
        content="The moonlight shimmered ",
        meta={**supported_fields, "_split_overlap": split_overlap},
    )

    cleaned = PineconeDocumentStore._discard_invalid_meta(doc)

    for field, expected in supported_fields.items():
        assert cleaned.meta[field] == expected
    assert "_split_overlap" not in cleaned.meta


def test_discard_invalid_meta_valid():
    """Metadata consisting only of supported value types passes through unchanged."""
    expected_meta = {
        "source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
        "page_number": 1,
    }
    doc = Document(content="The moonlight shimmered ", meta=dict(expected_meta))

    cleaned = PineconeDocumentStore._discard_invalid_meta(doc)

    for field, expected in expected_meta.items():
        assert cleaned.meta[field] == expected


@pytest.mark.integration
@pytest.mark.skipif("PINECONE_API_KEY" not in os.environ, reason="PINECONE_API_KEY not set")
def test_serverless_index_creation_from_scratch(sleep_time):
Expand Down

0 comments on commit 66bbcee

Please sign in to comment.