Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat(unstructured): add element index as metadata #382

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,10 @@ def _create_documents(
docs = [Document(content=texts_per_page[page], meta=meta_per_page[page]) for page in texts_per_page.keys()]

elif document_creation_mode == "one-doc-per-element":
for el in elements:
for index, el in enumerate(elements):
metadata = copy.deepcopy(meta)
metadata["file_path"] = str(filepath)
metadata["element_index"] = index
if hasattr(el, "metadata"):
metadata.update(el.metadata.to_dict())
if hasattr(el, "category"):
Expand Down
4 changes: 3 additions & 1 deletion integrations/unstructured/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def test_run_one_doc_per_page_with_meta(self, samples_path):
)

documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"]

assert len(documents) == 4
for i, doc in enumerate(documents, start=1):
assert doc.meta["file_path"] == str(pdf_path)
Expand All @@ -142,6 +141,7 @@ def test_run_one_doc_per_element_with_meta(self, samples_path):
documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"]

assert len(documents) > 4
first_element_index = 0
for doc in documents:
assert doc.meta["file_path"] == str(pdf_path)
assert "page_number" in doc.meta
Expand All @@ -150,6 +150,8 @@ def test_run_one_doc_per_element_with_meta(self, samples_path):
assert "category" in doc.meta
assert "custom_meta" in doc.meta
assert doc.meta["custom_meta"] == "foobar"
assert doc.meta["element_index"] == first_element_index
first_element_index += 1

@pytest.mark.integration
def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):
Expand Down