Skip to content

Commit

Permalink
PR comments
Browse files (browse the repository at this point in the history)
  • Loading branch information
davidsbatista committed Aug 21, 2024
1 parent 2c71970 commit 1ed5843
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 9 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,7 +26,7 @@


DEFAULT_STARTER_PLAN_SPEC = {"serverless": {"region": "us-east-1", "cloud": "aws"}}
METADATA_SUPPORTED_TYPES = str, int, bool # List[str] is supported and checked separately
METADATA_SUPPORTED_TYPES = str, int, bool, float # List[str] is supported and checked separately


class PineconeDocumentStore:
Expand Down Expand Up @@ -297,18 +297,24 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li
return documents

@staticmethod
def check_metadata(document: Document):
def _discard_invalid_meta(document: Document):
"""
Remove metadata fields with unsupported types from the document.
"""

def valid_type(value: Any):
return isinstance(value, METADATA_SUPPORTED_TYPES) or (
isinstance(value, list) and all(isinstance(i, str) for i in value)
)

if document.meta:
discarded_keys = []
new_meta = {}
for key, value in document.meta.items():
if not valid_type(value):
discarded_keys.append(key)
document.meta[key] = "IGNORED"
else:
new_meta[key] = value

if discarded_keys:
msg = (
Expand All @@ -317,6 +323,8 @@ def valid_type(value: Any):
)
logger.warning(msg)

document.meta = new_meta

return document

def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]:
Expand All @@ -331,7 +339,7 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li
embedding = self._dummy_vector

if document.meta:
self.check_metadata(document)
self._discard_invalid_meta(document)

doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)}

Expand Down
10 changes: 5 additions & 5 deletions integrations/pinecone/tests/test_document_store.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_convert_dict_spec_to_pinecone_object_fail():
PineconeDocumentStore._convert_dict_spec_to_pinecone_object(dict_spec)


def test_check_metadata_invalid():
def test_validate_metadata_invalid():
invalid_metadata_doc = Document(
content="The moonlight shimmered ",
meta={
Expand All @@ -155,24 +155,24 @@ def test_check_metadata_invalid():
],
},
)
pinecone_doc = PineconeDocumentStore.check_metadata(invalid_metadata_doc)
pinecone_doc = PineconeDocumentStore._discard_invalid_meta(invalid_metadata_doc)

assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
assert pinecone_doc.meta["page_number"] == 1
assert pinecone_doc.meta["split_id"] == 0
assert pinecone_doc.meta["split_idx_start"] == 0
assert pinecone_doc.meta["_split_overlap"] == "IGNORED"
assert "_split_overlap" not in pinecone_doc.meta


def test_check_metadata_valid():
def test_validate_metadata_valid():
valid_metadata_doc = Document(
content="The moonlight shimmered ",
meta={
"source_id": "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0",
"page_number": 1,
},
)
pinecone_doc = PineconeDocumentStore.check_metadata(valid_metadata_doc)
pinecone_doc = PineconeDocumentStore._discard_invalid_meta(valid_metadata_doc)

assert pinecone_doc.meta["source_id"] == "62049ba1d1e1d5ebb1f6230b0b00c5356b8706c56e0b9c36b1dfc86084cd75f0"
assert pinecone_doc.meta["page_number"] == 1
Expand Down

0 comments on commit 1ed5843

Please sign in to comment.