Skip to content

Commit

Permalink
Rename the "name" meta key to "file_path" and fix meta copying to use deepcopy
Browse files: browse the repository at this point in the history
  • Loading branch information
lambda-science committed Jan 23, 2024
1 parent 0f81c16 commit a7d9b74
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 13 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import copy
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
Expand Down Expand Up @@ -149,7 +150,7 @@ def _create_documents(

if document_creation_mode == "one-doc-per-file":
text = separator.join([str(el) for el in elements])
metadata = meta.copy()
metadata = copy.deepcopy(meta)
metadata["file_path"] = str(filepath)
docs = [Document(content=text, meta=metadata)]

Expand All @@ -158,7 +159,7 @@ def _create_documents(
meta_per_page: defaultdict[int, dict] = defaultdict(dict)
for el in elements:
metadata = copy.deepcopy(meta)
metadata["name"] = str(filepath)
metadata["file_path"] = str(filepath)
if hasattr(el, "metadata"):
metadata.update(el.metadata.to_dict())
page_number = int(metadata.get("page_number", 1))
Expand All @@ -170,8 +171,8 @@ def _create_documents(

elif document_creation_mode == "one-doc-per-element":
for el in elements:
metadata = meta.copy()
metadata["name"] = str(filepath)
metadata = copy.deepcopy(meta)
metadata["file_path"] = str(filepath)
if hasattr(el, "metadata"):
metadata.update(el.metadata.to_dict())
if hasattr(el, "category"):
Expand Down
18 changes: 9 additions & 9 deletions integrations/unstructured/tests/test_converter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_run_one_doc_per_file(self, samples_path):
documents = local_converter.run([pdf_path])["documents"]

assert len(documents) == 1
assert documents[0].meta == {"name": str(pdf_path)}
assert documents[0].meta == {"file_path": str(pdf_path)}

@pytest.mark.integration
def test_run_one_doc_per_page(self, samples_path):
Expand All @@ -77,7 +77,7 @@ def test_run_one_doc_per_page(self, samples_path):

assert len(documents) == 4
for i, doc in enumerate(documents, start=1):
assert doc.meta["name"] == str(pdf_path)
assert doc.meta["file_path"] == str(pdf_path)
assert doc.meta["page_number"] == i

@pytest.mark.integration
Expand All @@ -92,7 +92,7 @@ def test_run_one_doc_per_element(self, samples_path):

assert len(documents) > 4
for doc in documents:
assert doc.meta["name"] == str(pdf_path)
assert doc.meta["file_path"] == str(pdf_path)
assert "page_number" in doc.meta

# elements have a category attribute that is saved in the document meta
Expand All @@ -109,10 +109,10 @@ def test_run_one_doc_per_file_with_meta(self, samples_path):
documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"]

assert len(documents) == 1
assert documents[0].meta["name"] == str(pdf_path)
assert documents[0].meta["file_path"] == str(pdf_path)
assert "custom_meta" in documents[0].meta
assert documents[0].meta["custom_meta"] == "foobar"
assert documents[0].meta == {"name": str(pdf_path), "custom_meta": "foobar"}
assert documents[0].meta == {"file_path": str(pdf_path), "custom_meta": "foobar"}

@pytest.mark.integration
def test_run_one_doc_per_page_with_meta(self, samples_path):
Expand All @@ -126,7 +126,7 @@ def test_run_one_doc_per_page_with_meta(self, samples_path):

assert len(documents) == 4
for i, doc in enumerate(documents, start=1):
assert doc.meta["name"] == str(pdf_path)
assert doc.meta["file_path"] == str(pdf_path)
assert doc.meta["page_number"] == i
assert "custom_meta" in doc.meta
assert doc.meta["custom_meta"] == "foobar"
Expand All @@ -143,7 +143,7 @@ def test_run_one_doc_per_element_with_meta(self, samples_path):

assert len(documents) > 4
for doc in documents:
assert doc.meta["name"] == str(pdf_path)
assert doc.meta["file_path"] == str(pdf_path)
assert "page_number" in doc.meta

# elements have a category attribute that is saved in the document meta
Expand All @@ -163,7 +163,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path):

assert len(documents) > 4
for doc in documents:
assert "name" in doc.meta
assert "file_path" in doc.meta
assert "page_number" in doc.meta
# elements have a category attribute that is saved in the document meta
assert "category" in doc.meta
Expand All @@ -182,7 +182,7 @@ def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path):

assert len(documents) > 4
for doc in documents:
assert "name" in doc.meta
assert "file_path" in doc.meta
assert "page_number" in doc.meta
# elements have a category attribute that is saved in the document meta
assert "category" in doc.meta
Expand Down

0 comments on commit a7d9b74

Please sign in to comment.