Skip to content

Commit

Permalink
Fix metadata page number bug: give each document its own (shallow) copy of the meta dict via `meta.copy()` instead of sharing one dict
Browse files (browse the repository at this point in the history)
  • Loading branch information
lambda-science committed Jan 19, 2024
1 parent 3bc51a7 commit 86eec4d
Showing 1 changed file with 4 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,6 @@ def run(
meta=metadata,
)
documents.extend(docs_for_file)

return {"documents": documents}

def _create_documents(
Expand All @@ -140,7 +139,7 @@ def _create_documents(
elements: List[Element],
document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"],
separator: str,
meta: Optional[Dict[str, Any]] = None,
meta: Dict[str, Any],
) -> List[Document]:
"""
Create Haystack Documents from the elements returned by Unstructured.
Expand All @@ -149,15 +148,15 @@ def _create_documents(

if document_creation_mode == "one-doc-per-file":
text = separator.join([str(el) for el in elements])
metadata = meta
metadata = meta.copy()
metadata["name"] = str(filepath)
docs = [Document(content=text, meta=metadata)]

elif document_creation_mode == "one-doc-per-page":
texts_per_page: defaultdict[int, str] = defaultdict(str)
meta_per_page: defaultdict[int, dict] = defaultdict(dict)
for el in elements:
metadata = meta
metadata = meta.copy()
metadata["name"] = str(filepath)
if hasattr(el, "metadata"):
metadata.update(el.metadata.to_dict())
Expand All @@ -170,15 +169,14 @@ def _create_documents(

elif document_creation_mode == "one-doc-per-element":
for el in elements:
metadata = meta
metadata = meta.copy()
metadata["name"] = str(filepath)
if hasattr(el, "metadata"):
metadata.update(el.metadata.to_dict())
if hasattr(el, "category"):
metadata["category"] = el.category
doc = Document(content=str(el), meta=metadata)
docs.append(doc)

return docs

def _partition_file_into_elements(self, filepath: Path) -> List[Element]:
Expand Down

0 comments on commit 86eec4d

Please sign in to comment.