diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 188dd9e6e..54cbd5559 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -175,9 +175,10 @@ def _create_documents( docs = [Document(content=texts_per_page[page], meta=meta_per_page[page]) for page in texts_per_page.keys()] elif document_creation_mode == "one-doc-per-element": - for el in elements: + for index, el in enumerate(elements): metadata = copy.deepcopy(meta) metadata["file_path"] = str(filepath) + metadata["element_index"] = index if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) if hasattr(el, "category"): diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index ca590ab2f..e03e2e58e 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -123,7 +123,6 @@ def test_run_one_doc_per_page_with_meta(self, samples_path): ) documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] - assert len(documents) == 4 for i, doc in enumerate(documents, start=1): assert doc.meta["file_path"] == str(pdf_path) @@ -142,6 +141,7 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] assert len(documents) > 4 + first_element_index = 0 for doc in documents: assert doc.meta["file_path"] == str(pdf_path) assert "page_number" in doc.meta @@ -150,6 +150,8 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): assert "category" in doc.meta assert "custom_meta" in doc.meta assert doc.meta["custom_meta"] == "foobar" + assert doc.meta["element_index"] == first_element_index + first_element_index += 1 @pytest.mark.integration def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):