diff --git a/.github/workflows/readme_sync.yml b/.github/workflows/readme_sync.yml index b387dbb622..be797e5102 100644 --- a/.github/workflows/readme_sync.yml +++ b/.github/workflows/readme_sync.yml @@ -54,7 +54,7 @@ jobs: - name: Sync docs if: github.event_name == 'push' - uses: readmeio/rdme@v8 + uses: readmeio/rdme@v9 with: rdme: docs ./docs/pydoc/temp --key=${{ secrets.README_API_KEY }} --version=${{ steps.version-getter.outputs.version }} diff --git a/haystack/components/converters/azure.py b/haystack/components/converters/azure.py index edcc348d84..d55a6b8ac8 100644 --- a/haystack/components/converters/azure.py +++ b/haystack/components/converters/azure.py @@ -5,7 +5,6 @@ import copy import hashlib import os -import warnings from collections import defaultdict from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union @@ -61,7 +60,7 @@ def __init__( # pylint: disable=too-many-positional-arguments merge_multiple_column_headers: bool = True, page_layout: Literal["natural", "single_column"] = "natural", threshold_y: Optional[float] = 0.05, - store_full_path: bool = True, + store_full_path: bool = False, ): """ Creates an AzureOCRDocumentConverter component. @@ -143,12 +142,6 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D azure_output.append(result.to_dict()) merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py index 2b36d4cc67..248ce69620 100644 --- a/haystack/components/converters/csv.py +++ b/haystack/components/converters/csv.py @@ -4,7 +4,6 @@ import io import os -import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -36,7 +35,7 @@ class CSVToDocument: ``` """ - def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): + def __init__(self, encoding: str = "utf-8", store_full_path: bool = False): """ Creates a CSVToDocument component. @@ -94,13 +93,6 @@ def run( merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) - if not self.store_full_path and "file_path" in bytestream.meta: file_path = bytestream.meta.get("file_path") if file_path: # Ensure the value is not None for pylint diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py index 604b1a5173..b9d59bd564 100644 --- a/haystack/components/converters/docx.py +++ b/haystack/components/converters/docx.py @@ -5,7 +5,6 @@ import csv import io import os -import warnings from dataclasses import dataclass from enum import Enum from io import StringIO @@ -109,7 +108,7 @@ class DOCXToDocument: ``` """ - def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True): + def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False): """ Create a DOCXToDocument component. @@ -189,13 +188,6 @@ def run( ) continue - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) - docx_metadata = self._get_docx_metadata(document=docx_document) merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata} diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py index 4c226ab99f..10509e1fab 100644 --- a/haystack/components/converters/html.py +++ b/haystack/components/converters/html.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -35,7 +34,7 @@ class HTMLToDocument: ``` """ - def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True): + def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = False): """ Create an HTMLToDocument component. @@ -123,12 +122,6 @@ def run( merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) if not self.store_full_path and "file_path" in bytestream.meta: file_path = bytestream.meta.get("file_path") if file_path: # Ensure the value is not None for pylint diff --git a/haystack/components/converters/json.py b/haystack/components/converters/json.py index e6fc56460a..3a8c6f52f0 100644 --- a/haystack/components/converters/json.py +++ b/haystack/components/converters/json.py @@ -4,7 +4,6 @@ import json import os -import warnings from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union @@ -95,7 +94,7 @@ def __init__( jq_schema: Optional[str] = None, content_key: Optional[str] = None, extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None, - store_full_path: bool = True, + store_full_path: bool = False, ): """ Creates a JSONConverter component. @@ -280,12 +279,6 @@ def run( data = self._get_content_and_meta(bytestream) - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) for text, extra_meta in data: merged_metadata = {**bytestream.meta, **metadata, **extra_meta} diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py index 0bd722c704..2ffbe4b745 100644 --- a/haystack/components/converters/markdown.py +++ b/haystack/components/converters/markdown.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -40,7 +39,7 @@ class MarkdownToDocument: ``` """ - def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True): + def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False): """ Create a MarkdownToDocument component. @@ -112,13 +111,6 @@ def run( merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) - if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py index a66fde927e..8642447816 100644 --- a/haystack/components/converters/pdfminer.py +++ b/haystack/components/converters/pdfminer.py @@ -4,7 +4,6 @@ import io import os -import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -48,7 +47,7 @@ def __init__( # pylint: disable=too-many-positional-arguments boxes_flow: Optional[float] = 0.5, detect_vertical: bool = True, all_texts: bool = False, - store_full_path: bool = True, + store_full_path: bool = False, ) -> None: """ Create a PDFMinerToDocument component. @@ -172,12 +171,6 @@ def run( ) merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) diff --git a/haystack/components/converters/pptx.py b/haystack/components/converters/pptx.py index f671acabf4..7282cc5ddb 100644 --- a/haystack/components/converters/pptx.py +++ b/haystack/components/converters/pptx.py @@ -4,7 +4,6 @@ import io import os -import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -37,7 +36,7 @@ class PPTXToDocument: ``` """ - def __init__(self, store_full_path: bool = True): + def __init__(self, store_full_path: bool = False): """ Create an PPTXToDocument component. @@ -104,12 +103,6 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) diff --git a/haystack/components/converters/pypdf.py b/haystack/components/converters/pypdf.py index 2086873c4f..19a4e2e453 100644 --- a/haystack/components/converters/pypdf.py +++ b/haystack/components/converters/pypdf.py @@ -4,7 +4,6 @@ import io import os -import warnings from enum import Enum from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -79,7 +78,7 @@ def __init__( layout_mode_scale_weight: float = 1.25, layout_mode_strip_rotated: bool = True, layout_mode_font_height_weight: float = 1.0, - store_full_path: bool = True, + store_full_path: bool = False, ): """ Create an PyPDFToDocument component. @@ -220,12 +219,7 @@ def run( ) merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) document.meta = merged_metadata diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py index 7a8fdb36b6..980fb00911 100644 --- a/haystack/components/converters/tika.py +++ b/haystack/components/converters/tika.py @@ -4,7 +4,6 @@ import io import os -import warnings from html.parser import HTMLParser from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -75,7 +74,7 @@ class TikaDocumentConverter: ``` """ - def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True): + def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = False): """ Create a TikaDocumentConverter component. @@ -139,12 +138,6 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py index 8d7a12a4f5..0ebbda8dfc 100644 --- a/haystack/components/converters/txt.py +++ b/haystack/components/converters/txt.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -36,7 +35,7 @@ class TextFileToDocument: ``` """ - def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): + def __init__(self, encoding: str = "utf-8", store_full_path: bool = False): """ Creates a TextFileToDocument component. @@ -93,12 +92,6 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} - warnings.warn( - "The `store_full_path` parameter defaults to True, storing full file paths in metadata. " - "In the 2.9.0 release, the default value for `store_full_path` will change to False, " - "storing only file names to improve privacy.", - DeprecationWarning, - ) if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): merged_metadata["file_path"] = os.path.basename(file_path) diff --git a/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml b/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml new file mode 100644 index 0000000000..61a0110958 --- /dev/null +++ b/releasenotes/notes/update-store-full-path-default-value-129f701ba07b944b.yaml @@ -0,0 +1,4 @@ +--- +upgrade: + - | + Update default value of `store_full_path` to `False` in converters diff --git a/test/components/converters/test_azure_ocr_doc_converter.py b/test/components/converters/test_azure_ocr_doc_converter.py index f8db0d734c..eef16ab8d9 100644 --- a/test/components/converters/test_azure_ocr_doc_converter.py +++ b/test/components/converters/test_azure_ocr_doc_converter.py @@ -105,7 +105,7 @@ def test_to_dict(self, mock_resolve_value): "page_layout": "natural", "preceding_context_len": 3, "threshold_y": 0.05, - "store_full_path": True, + "store_full_path": False, }, } diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py index 3271c09c17..cfb955041b 100644 --- a/test/components/converters/test_csv_to_document.py +++ b/test/components/converters/test_csv_to_document.py @@ -5,6 +5,7 @@ from unittest.mock import patch import pandas as pd from pathlib import Path +import os import pytest @@ -35,9 +36,9 @@ def test_run(self, test_files_path): assert len(docs) == 3 assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content assert isinstance(docs[0].content, str) - assert docs[0].meta == bytestream.meta - assert docs[1].meta["file_path"] == str(files[1]) - assert docs[2].meta["file_path"] == str(files[2]) + assert docs[0].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"} + assert docs[1].meta["file_path"] == os.path.basename(files[1]) + assert docs[2].meta["file_path"] == os.path.basename(files[2]) def test_run_with_store_full_path_false(self, test_files_path): """ @@ -73,7 +74,7 @@ def test_run_error_handling(self, test_files_path, caplog): assert "non_existing_file.csv" in caplog.text docs = output["documents"] assert len(docs) == 2 - assert docs[0].meta["file_path"] == str(paths[0]) + assert docs[0].meta["file_path"] == os.path.basename(paths[0]) def test_encoding_override(self, test_files_path, caplog): """ diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py index c0ce370f3e..9b4ee3fe60 100644 --- a/test/components/converters/test_docx_file_to_document.py +++ b/test/components/converters/test_docx_file_to_document.py @@ -1,4 +1,5 @@ import json +import os import logging import pytest import csv @@ -32,7 +33,7 @@ def test_to_dict(self): data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv"}, } def test_to_dict_custom_parameters(self): @@ -40,28 +41,28 @@ def test_to_dict_custom_parameters(self): data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "markdown"}, + "init_parameters": {"store_full_path": False, "table_format": "markdown"}, } converter = DOCXToDocument(table_format="csv") data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv"}, } converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "markdown"}, + "init_parameters": {"store_full_path": False, "table_format": "markdown"}, } converter = DOCXToDocument(table_format=DOCXTableFormat.CSV) data = converter.to_dict() assert data == { "type": "haystack.components.converters.docx.DOCXToDocument", - "init_parameters": {"store_full_path": True, "table_format": "csv"}, + "init_parameters": {"store_full_path": False, "table_format": "csv"}, } def test_from_dict(self): @@ -119,7 +120,7 @@ def test_run(self, test_files_path, docx_converter): assert "History" in docs[0].content assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { - "file_path": str(paths[0]), + "file_path": os.path.basename(paths[0]), "docx": DOCXMetadata( author="Microsoft Office User", category="", @@ -151,7 +152,7 @@ def test_run_with_table(self, test_files_path): assert "Donald Trump" in docs[0].content ## :-) assert docs[0].meta.keys() == {"file_path", "docx"} assert docs[0].meta == { - "file_path": str(paths[0]), + "file_path": os.path.basename(paths[0]), "docx": DOCXMetadata( author="Saha, Anirban", category="", @@ -283,7 +284,7 @@ def test_run_with_additional_meta(self, test_files_path, docx_converter): output = docx_converter.run(sources=paths, meta={"language": "it", "author": "test_author"}) doc = output["documents"][0] assert doc.meta == { - "file_path": str(paths[0]), + "file_path": os.path.basename(paths[0]), "docx": DOCXMetadata( author="Microsoft Office User", category="", diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py index df76c8e892..890c927434 100644 --- a/test/components/converters/test_html_to_document.py +++ b/test/components/converters/test_html_to_document.py @@ -42,7 +42,7 @@ def test_run_with_store_full_path(self, test_files_path): """ Test if the component runs correctly when metadata is supplied by the user. """ - converter = HTMLToDocument() + converter = HTMLToDocument(store_full_path=True) sources = [test_files_path / "html" / "what_is_haystack.html"] results = converter.run(sources=sources) # store_full_path is True by default diff --git a/test/components/converters/test_json.py b/test/components/converters/test_json.py index d85a0e187a..f9dcf2fa0c 100644 --- a/test/components/converters/test_json.py +++ b/test/components/converters/test_json.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os from unittest.mock import patch from pathlib import Path import logging @@ -104,7 +105,7 @@ def test_to_dict(): "content_key": "motivation", "jq_schema": ".laureates[]", "extra_meta_fields": {"firstname", "surname"}, - "store_full_path": True, + "store_full_path": False, }, } @@ -145,11 +146,11 @@ def test_run(tmpdir): == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file)} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)} assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file)} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)} assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file)} + assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)} assert ( result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -254,11 +255,20 @@ def test_run_with_single_meta(tmpdir): == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][0].meta == { + "file_path": os.path.basename(first_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][1].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][2].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert ( result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -290,11 +300,20 @@ def test_run_with_meta_list(tmpdir): == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"} + assert result["documents"][0].meta == { + "file_path": os.path.basename(first_test_file), + "creation_date": "1945-05-25T00:00:00", + } assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"} + assert result["documents"][1].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1943-09-03T00:00:00", + } assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file), "creation_date": "1943-09-03T00:00:00"} + assert result["documents"][2].meta == { + "file_path": os.path.basename(second_test_file), + "creation_date": "1943-09-03T00:00:00", + } assert ( result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -329,11 +348,11 @@ def test_run_with_jq_schema_and_content_key(tmpdir): result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file)} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)} assert result["documents"][1].content == "for their discoveries of growth factors" - assert result["documents"][1].meta == {"file_path": str(second_test_file)} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)} assert result["documents"][2].content == "for their discoveries of growth factors" - assert result["documents"][2].meta == {"file_path": str(second_test_file)} + assert result["documents"][2].meta == {"file_path": os.path.basename(second_test_file)} assert ( result["documents"][3].content == "for his demonstrations of the existence of new " "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " @@ -361,16 +380,20 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir): result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fokin"} + assert result["documents"][0].meta == { + "file_path": os.path.basename(first_test_file), + "firstname": "Dario", + "surname": "Fokin", + } assert result["documents"][1].content == "for their discoveries of growth factors" assert result["documents"][1].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "firstname": "Stanley", "surname": "Cohen", } assert result["documents"][2].content == "for their discoveries of growth factors" assert result["documents"][2].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "firstname": "Rita", "surname": "Levi-Montalcini", } @@ -396,9 +419,9 @@ def test_run_with_content_key(tmpdir): assert len(result) == 1 assert len(result["documents"]) == 3 assert result["documents"][0].content == "literature" - assert result["documents"][0].meta == {"file_path": str(first_test_file)} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file)} assert result["documents"][1].content == "medicine" - assert result["documents"][1].meta == {"file_path": str(second_test_file)} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file)} assert result["documents"][2].content == "physics" assert result["documents"][2].meta == {} @@ -417,9 +440,9 @@ def test_run_with_content_key_and_extra_meta_fields(tmpdir): assert len(result) == 1 assert len(result["documents"]) == 3 assert result["documents"][0].content == "literature" - assert result["documents"][0].meta == {"file_path": str(first_test_file), "year": "1997"} + assert result["documents"][0].meta == {"file_path": os.path.basename(first_test_file), "year": "1997"} assert result["documents"][1].content == "medicine" - assert result["documents"][1].meta == {"file_path": str(second_test_file), "year": "1986"} + assert result["documents"][1].meta == {"file_path": os.path.basename(second_test_file), "year": "1986"} assert result["documents"][2].content == "physics" assert result["documents"][2].meta == {"year": "1938"} @@ -442,7 +465,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): == "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden" ) assert result["documents"][0].meta == { - "file_path": str(first_test_file), + "file_path": os.path.basename(first_test_file), "id": "674", "firstname": "Dario", "surname": "Fokin", @@ -450,7 +473,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): } assert result["documents"][1].content == "for their discoveries of growth factors" assert result["documents"][1].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "id": "434", "firstname": "Stanley", "surname": "Cohen", @@ -458,7 +481,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): } assert result["documents"][2].content == "for their discoveries of growth factors" assert result["documents"][2].meta == { - "file_path": str(second_test_file), + "file_path": os.path.basename(second_test_file), "id": "435", "firstname": "Rita", "surname": "Levi-Montalcini", diff --git a/test/components/converters/test_pptx_to_document.py b/test/components/converters/test_pptx_to_document.py index 135b1b08a8..595e1a1234 100644 --- a/test/components/converters/test_pptx_to_document.py +++ b/test/components/converters/test_pptx_to_document.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import logging +import os from haystack.dataclasses import ByteStream from haystack.components.converters.pptx import PPTXToDocument @@ -29,8 +30,8 @@ def test_run(self, test_files_path): "Sample Title Slide\nJane Doe\fTitle of First Slide\nThis is a bullet point\nThis is another bullet point" in docs[0].content ) - assert docs[0].meta["file_path"] == str(files[0]) - assert docs[1].meta == bytestream.meta + assert docs[0].meta["file_path"] == os.path.basename(files[0]) + assert docs[1].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"} def test_run_error_non_existent_file(self, caplog): sources = ["non_existing_file.pptx"] @@ -58,7 +59,7 @@ def test_run_with_meta(self, test_files_path): document = output["documents"][0] assert document.meta == { - "file_path": str(test_files_path / "pptx" / "sample_pptx.pptx"), + "file_path": os.path.basename(test_files_path / "pptx" / "sample_pptx.pptx"), "key": "value", "language": "it", } diff --git a/test/components/converters/test_pypdf_to_document.py b/test/components/converters/test_pypdf_to_document.py index e5a9964434..fa8f295db7 100644 --- a/test/components/converters/test_pypdf_to_document.py +++ b/test/components/converters/test_pypdf_to_document.py @@ -61,7 +61,7 @@ def test_to_dict(self, pypdf_component): "layout_mode_scale_weight": 1.25, "layout_mode_strip_rotated": True, "layout_mode_font_height_weight": 1.0, - "store_full_path": True, + "store_full_path": False, }, } diff --git a/test/components/converters/test_textfile_to_document.py b/test/components/converters/test_textfile_to_document.py index 419b042134..5698501a80 100644 --- a/test/components/converters/test_textfile_to_document.py +++ b/test/components/converters/test_textfile_to_document.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import logging +import os from unittest.mock import patch from pathlib import Path @@ -27,9 +28,9 @@ def test_run(self, test_files_path): assert "Some text for testing." in docs[0].content assert "This is a test line." in docs[1].content assert "That's yet another file!" in docs[2].content - assert docs[0].meta["file_path"] == str(files[0]) - assert docs[1].meta["file_path"] == str(files[1]) - assert docs[2].meta == bytestream.meta + assert docs[0].meta["file_path"] == os.path.basename(files[0]) + assert docs[1].meta["file_path"] == os.path.basename(files[1]) + assert docs[2].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"} def test_run_with_store_full_path(self, test_files_path): """ @@ -59,8 +60,8 @@ def test_run_error_handling(self, test_files_path, caplog): assert "non_existing_file.txt" in caplog.text docs = output["documents"] assert len(docs) == 2 - assert docs[0].meta["file_path"] == str(paths[0]) - assert docs[1].meta["file_path"] == str(paths[2]) + assert docs[0].meta["file_path"] == os.path.basename(paths[0]) + assert docs[1].meta["file_path"] == os.path.basename(paths[2]) def test_encoding_override(self, test_files_path): """