From a7d9b74e98b443599ff86b9903ad3447316eadfb Mon Sep 17 00:00:00 2001 From: Corentin Meyer Date: Tue, 23 Jan 2024 15:25:14 +0100 Subject: [PATCH] Renaming "name" meta to "file_path" and deepcopy fix --- .../converters/unstructured/converter.py | 9 +++++---- .../unstructured/tests/test_converter.py | 18 +++++++++--------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py index 94bc19082..83b4457c8 100644 --- a/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py +++ b/integrations/unstructured/src/haystack_integrations/components/converters/unstructured/converter.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging import os +import copy from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union @@ -149,7 +150,7 @@ def _create_documents( if document_creation_mode == "one-doc-per-file": text = separator.join([str(el) for el in elements]) - metadata = meta.copy() + metadata = copy.deepcopy(meta) metadata["file_path"] = str(filepath) docs = [Document(content=text, meta=metadata)] @@ -158,7 +159,7 @@ def _create_documents( meta_per_page: defaultdict[int, dict] = defaultdict(dict) for el in elements: metadata = copy.deepcopy(meta) - metadata["name"] = str(filepath) + metadata["file_path"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) page_number = int(metadata.get("page_number", 1)) @@ -170,8 +171,8 @@ def _create_documents( elif document_creation_mode == "one-doc-per-element": for el in elements: - metadata = meta.copy() - metadata["name"] = str(filepath) + metadata = copy.deepcopy(meta) + metadata["file_path"] = str(filepath) if hasattr(el, "metadata"): metadata.update(el.metadata.to_dict()) if hasattr(el, "category"): diff --git a/integrations/unstructured/tests/test_converter.py b/integrations/unstructured/tests/test_converter.py index 2a9c332f9..d5266ac62 100644 --- a/integrations/unstructured/tests/test_converter.py +++ b/integrations/unstructured/tests/test_converter.py @@ -63,7 +63,7 @@ def test_run_one_doc_per_file(self, samples_path): documents = local_converter.run([pdf_path])["documents"] assert len(documents) == 1 - assert documents[0].meta == {"name": str(pdf_path)} + assert documents[0].meta == {"file_path": str(pdf_path)} @pytest.mark.integration def test_run_one_doc_per_page(self, samples_path): @@ -77,7 +77,7 @@ def test_run_one_doc_per_page(self, samples_path): assert len(documents) == 4 for i, doc in enumerate(documents, start=1): - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert doc.meta["page_number"] == i @pytest.mark.integration @@ -92,7 +92,7 @@ def test_run_one_doc_per_element(self, samples_path): assert len(documents) > 4 for doc in documents: - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta @@ -109,10 +109,10 @@ def test_run_one_doc_per_file_with_meta(self, samples_path): documents = local_converter.run(paths=[pdf_path], meta=meta)["documents"] assert len(documents) == 1 - assert documents[0].meta["name"] == str(pdf_path) + assert documents[0].meta["file_path"] == str(pdf_path) assert "custom_meta" in documents[0].meta assert documents[0].meta["custom_meta"] == "foobar" - assert documents[0].meta == {"name": str(pdf_path), "custom_meta": "foobar"} + assert documents[0].meta == {"file_path": str(pdf_path), "custom_meta": "foobar"} @pytest.mark.integration def test_run_one_doc_per_page_with_meta(self, samples_path): @@ -126,7 +126,7 @@ def test_run_one_doc_per_page_with_meta(self, samples_path): assert len(documents) == 4 for i, doc in enumerate(documents, start=1): - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert doc.meta["page_number"] == i assert "custom_meta" in doc.meta assert doc.meta["custom_meta"] == "foobar" @@ -143,7 +143,7 @@ def test_run_one_doc_per_element_with_meta(self, samples_path): assert len(documents) > 4 for doc in documents: - assert doc.meta["name"] == str(pdf_path) + assert doc.meta["file_path"] == str(pdf_path) assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta @@ -163,7 +163,7 @@ def test_run_one_doc_per_element_with_meta_list_two_files(self, samples_path): assert len(documents) > 4 for doc in documents: - assert "name" in doc.meta + assert "file_path" in doc.meta assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta assert "category" in doc.meta @@ -182,7 +182,7 @@ def test_run_one_doc_per_element_with_meta_list_folder(self, samples_path): assert len(documents) > 4 for doc in documents: - assert "name" in doc.meta + assert "file_path" in doc.meta assert "page_number" in doc.meta # elements have a category attribute that is saved in the document meta assert "category" in doc.meta