diff --git a/integrations/astra/src/haystack_integrations/document_stores/astra/document_store.py b/integrations/astra/src/haystack_integrations/document_stores/astra/document_store.py
index 9c0d1ee1d..a894b94c1 100644
--- a/integrations/astra/src/haystack_integrations/document_stores/astra/document_store.py
+++ b/integrations/astra/src/haystack_integrations/document_stores/astra/document_store.py
@@ -210,12 +210,15 @@ def _convert_input_document(document: Union[dict, Document]):
         documents_to_write = [_convert_input_document(doc) for doc in documents]
 
         duplicate_documents = []
-        new_documents = []
+        new_documents: List[Document] = []
         i = 0
         while i < len(documents_to_write):
             doc = documents_to_write[i]
+            # check to see if this ID already exists in our new_documents array
+            exists = [d for d in new_documents if d["_id"] == doc["_id"]]
+            # check to see if this ID is already in the DB
             response = self.index.find_documents({"filter": {"_id": doc["_id"]}})
-            if response:
+            if response or exists:
                 if policy == DuplicatePolicy.FAIL:
                     msg = f"ID '{doc['_id']}' already exists."
                     raise DuplicateDocumentError(msg)
diff --git a/integrations/astra/tests/test_document_store.py b/integrations/astra/tests/test_document_store.py
index 7669fa8e1..f1fad4f5d 100644
--- a/integrations/astra/tests/test_document_store.py
+++ b/integrations/astra/tests/test_document_store.py
@@ -74,6 +74,13 @@ def test_write_documents(self, document_store: AstraDocumentStore):
         assert document_store.write_documents(documents=[doc1], policy=DuplicatePolicy.OVERWRITE) == 1
         self.assert_documents_are_equal(document_store.filter_documents(), [doc1])
 
+    def test_write_documents_skip_duplicates(self, document_store: AstraDocumentStore):
+        docs = [
+            Document(id="1", content="test doc 1"),
+            Document(id="1", content="test doc 2"),
+        ]
+        assert document_store.write_documents(docs, policy=DuplicatePolicy.SKIP) == 1
+
     def test_delete_documents_non_existing_document(self, document_store: AstraDocumentStore):
         """
         Test delete_documents() doesn't delete any Document when called with non existing id.