diff --git a/haystack/components/converters/json.py b/haystack/components/converters/json.py index 966da881b8..e6fc56460a 100644 --- a/haystack/components/converters/json.py +++ b/haystack/components/converters/json.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import json +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union @@ -93,6 +95,7 @@ def __init__( jq_schema: Optional[str] = None, content_key: Optional[str] = None, extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None, + store_full_path: bool = True, ): """ Creates a JSONConverter component. @@ -129,6 +132,9 @@ def __init__( :param extra_meta_fields: An optional set of meta keys to extract from the content. If `jq_schema` is specified, all keys will be extracted from that object. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ self._compiled_filter = None if jq_schema: @@ -138,6 +144,7 @@ def __init__( self._jq_schema = jq_schema self._content_key = content_key self._meta_fields = extra_meta_fields + self._store_full_path = store_full_path if self._compiled_filter is None and self._content_key is None: msg = "No `jq_schema` nor `content_key` specified. Set either or both to extract data." @@ -151,7 +158,11 @@ def to_dict(self) -> Dict[str, Any]: Dictionary with serialized data. """ return default_to_dict( - self, jq_schema=self._jq_schema, content_key=self._content_key, extra_meta_fields=self._meta_fields + self, + jq_schema=self._jq_schema, + content_key=self._content_key, + extra_meta_fields=self._meta_fields, + store_full_path=self._store_full_path, ) @classmethod @@ -269,8 +280,17 @@ def run( data = self._get_content_and_meta(bytestream) + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. 
" + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) for text, extra_meta in data: merged_metadata = {**bytestream.meta, **metadata, **extra_meta} + + if not self._store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/markdown.py b/haystack/components/converters/markdown.py index c983ef212f..0bd722c704 100644 --- a/haystack/components/converters/markdown.py +++ b/haystack/components/converters/markdown.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -38,7 +40,7 @@ class MarkdownToDocument: ``` """ - def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True): + def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True): """ Create a MarkdownToDocument component. @@ -46,11 +48,15 @@ def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True If True converts table contents into a single line. :param progress_bar: If True shows a progress bar when running. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ markdown_conversion_imports.check() self.table_to_single_line = table_to_single_line self.progress_bar = progress_bar + self.store_full_path = store_full_path @component.output_types(documents=List[Document]) def run( @@ -105,6 +111,17 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} + + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. 
" + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) + document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py index acf9db28f2..a66fde927e 100644 --- a/haystack/components/converters/pdfminer.py +++ b/haystack/components/converters/pdfminer.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import io +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -46,6 +48,7 @@ def __init__( # pylint: disable=too-many-positional-arguments boxes_flow: Optional[float] = 0.5, detect_vertical: bool = True, all_texts: bool = False, + store_full_path: bool = True, ) -> None: """ Create a PDFMinerToDocument component. @@ -78,6 +81,9 @@ def __init__( # pylint: disable=too-many-positional-arguments This parameter determines whether vertical text should be considered during layout analysis. :param all_texts: If layout analysis should be performed on text in figures. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ pdfminer_import.check() @@ -91,6 +97,7 @@ def __init__( # pylint: disable=too-many-positional-arguments detect_vertical=detect_vertical, all_texts=all_texts, ) + self.store_full_path = store_full_path def _converter(self, extractor) -> Document: """ @@ -165,6 +172,15 @@ def run( ) merged_metadata = {**bytestream.meta, **metadata} + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. 
" + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) document.meta = merged_metadata documents.append(document) diff --git a/haystack/components/converters/pptx.py b/haystack/components/converters/pptx.py index b665abc4a7..f671acabf4 100644 --- a/haystack/components/converters/pptx.py +++ b/haystack/components/converters/pptx.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import io +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -35,11 +37,16 @@ class PPTXToDocument: ``` """ - def __init__(self): + def __init__(self, store_full_path: bool = True): """ Create an PPTXToDocument component. + + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ pptx_import.check() + self.store_full_path = store_full_path def _convert(self, file_content: io.BytesIO) -> str: """ @@ -97,6 +104,15 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. 
" + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) documents.append(Document(content=text, meta=merged_metadata)) return {"documents": documents} diff --git a/haystack/components/converters/tika.py b/haystack/components/converters/tika.py index 926968a176..7a8fdb36b6 100644 --- a/haystack/components/converters/tika.py +++ b/haystack/components/converters/tika.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 import io +import os +import warnings from html.parser import HTMLParser from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -73,15 +75,19 @@ class TikaDocumentConverter: ``` """ - def __init__(self, tika_url: str = "http://localhost:9998/tika"): + def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True): """ Create a TikaDocumentConverter component. :param tika_url: Tika server URL. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ tika_import.check() self.tika_url = tika_url + self.store_full_path = store_full_path @component.output_types(documents=List[Document]) def run( @@ -133,6 +139,16 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. 
" + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) + document = Document(content=text, meta=merged_metadata) documents.append(document) return {"documents": documents} diff --git a/haystack/components/converters/txt.py b/haystack/components/converters/txt.py index da5ca9a259..8d7a12a4f5 100644 --- a/haystack/components/converters/txt.py +++ b/haystack/components/converters/txt.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +import os +import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -34,7 +36,7 @@ class TextFileToDocument: ``` """ - def __init__(self, encoding: str = "utf-8"): + def __init__(self, encoding: str = "utf-8", store_full_path: bool = True): """ Creates a TextFileToDocument component. @@ -42,8 +44,12 @@ def __init__(self, encoding: str = "utf-8"): The encoding of the text files to convert. If the encoding is specified in the metadata of a source ByteStream, it overrides this value. + :param store_full_path: + If True, the full path of the file is stored in the metadata of the document. + If False, only the file name is stored. """ self.encoding = encoding + self.store_full_path = store_full_path @component.output_types(documents=List[Document]) def run( @@ -87,6 +93,15 @@ def run( continue merged_metadata = {**bytestream.meta, **metadata} + warnings.warn( + "The `store_full_path` parameter defaults to True, storing full file paths in metadata. 
" + "In the 2.9.0 release, the default value for `store_full_path` will change to False, " + "storing only file names to improve privacy.", + DeprecationWarning, + ) + + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) document = Document(content=text, meta=merged_metadata) documents.append(document) diff --git a/releasenotes/notes/add-store-full-path-param-to-converters-43b50812b1600eb2.yaml b/releasenotes/notes/add-store-full-path-param-to-converters-43b50812b1600eb2.yaml new file mode 100644 index 0000000000..6978ab5aac --- /dev/null +++ b/releasenotes/notes/add-store-full-path-param-to-converters-43b50812b1600eb2.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Added a new `store_full_path` parameter to the `__init__` methods of `JSONConverter`, `MarkdownToDocument`, `PDFMinerToDocument`, `PPTXToDocument`, `TikaDocumentConverter` and `TextFileToDocument`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored. + +deprecations: + - | + The default value of the `store_full_path` parameter will change to `False` in Haystack 2.9.0 to enhance privacy. 
diff --git a/test/components/converters/test_json.py b/test/components/converters/test_json.py index fe43044f74..d85a0e187a 100644 --- a/test/components/converters/test_json.py +++ b/test/components/converters/test_json.py @@ -21,7 +21,7 @@ { "id": "674", "firstname": "Dario", - "surname": "Fo", + "surname": "Fokin", "motivation": "who emulates the jesters of the Middle Ages in scourging authority and upholding the dignity of the downtrodden", "share": "1", } @@ -104,6 +104,7 @@ def test_to_dict(): "content_key": "motivation", "jq_schema": ".laureates[]", "extra_meta_fields": {"firstname", "surname"}, + "store_full_path": True, }, } @@ -115,6 +116,7 @@ def test_from_dict(): "content_key": "motivation", "jq_schema": ".laureates[]", "extra_meta_fields": ["firstname", "surname"], + "store_full_path": True, }, } converter = JSONConverter.from_dict(data) @@ -140,7 +142,7 @@ def test_run(tmpdir): assert len(result["documents"]) == 4 assert ( result["documents"][0].content - == "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and " + == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) assert result["documents"][0].meta == {"file_path": str(first_test_file)} @@ -156,6 +158,43 @@ def test_run(tmpdir): assert result["documents"][3].meta == {} +def test_run_with_store_full_path_false(tmpdir): + """ + Test if the component runs correctly with store_full_path=False + """ + first_test_file = Path(tmpdir / "first_test_file.json") + second_test_file = Path(tmpdir / "second_test_file.json") + + first_test_file.write_text(json.dumps(test_data[0]), "utf-8") + second_test_file.write_text(json.dumps(test_data[1]), "utf-8") + byte_stream = ByteStream.from_string(json.dumps(test_data[2])) + + sources = [str(first_test_file), second_test_file, byte_stream] + + converter = JSONConverter( + jq_schema='.laureates[] | .firstname + " " + .surname + " " + .motivation', 
store_full_path=False + ) + result = converter.run(sources=sources) + assert len(result) == 1 + assert len(result["documents"]) == 4 + assert ( + result["documents"][0].content + == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " + "upholding the dignity of the downtrodden" + ) + assert result["documents"][0].meta == {"file_path": "first_test_file.json"} + assert result["documents"][1].content == "Stanley Cohen for their discoveries of growth factors" + assert result["documents"][1].meta == {"file_path": "second_test_file.json"} + assert result["documents"][2].content == "Rita Levi-Montalcini for their discoveries of growth factors" + assert result["documents"][2].meta == {"file_path": "second_test_file.json"} + assert ( + result["documents"][3].content == "Enrico Fermi for his demonstrations of the existence of new " + "radioactive elements produced by neutron irradiation, and for his related discovery of nuclear " + "reactions brought about by slow neutrons" + ) + assert result["documents"][3].meta == {} + + def test_run_with_non_json_file(tmpdir, caplog): test_file = Path(tmpdir / "test_file.md") test_file.write_text("This is not a JSON file.", "utf-8") @@ -212,7 +251,7 @@ def test_run_with_single_meta(tmpdir): assert len(result["documents"]) == 4 assert ( result["documents"][0].content - == "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and " + == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"} @@ -248,7 +287,7 @@ def test_run_with_meta_list(tmpdir): assert len(result["documents"]) == 4 assert ( result["documents"][0].content - == "Dario Fo who emulates the jesters of the Middle Ages in scourging authority and " + == "Dario Fokin who emulates the jesters of the Middle Ages in scourging authority and " 
"upholding the dignity of the downtrodden" ) assert result["documents"][0].meta == {"file_path": str(first_test_file), "creation_date": "1945-05-25T00:00:00"} @@ -322,7 +361,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields(tmpdir): result["documents"][0].content == "who emulates the jesters of the Middle Ages in scourging authority and " "upholding the dignity of the downtrodden" ) - assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fo"} + assert result["documents"][0].meta == {"file_path": str(first_test_file), "firstname": "Dario", "surname": "Fokin"} assert result["documents"][1].content == "for their discoveries of growth factors" assert result["documents"][1].meta == { "file_path": str(second_test_file), @@ -406,7 +445,7 @@ def test_run_with_jq_schema_content_key_and_extra_meta_fields_literal(tmpdir): "file_path": str(first_test_file), "id": "674", "firstname": "Dario", - "surname": "Fo", + "surname": "Fokin", "share": "1", } assert result["documents"][1].content == "for their discoveries of growth factors" diff --git a/test/components/converters/test_markdown_to_document.py b/test/components/converters/test_markdown_to_document.py index 302cc10434..fdd0ee6cd9 100644 --- a/test/components/converters/test_markdown_to_document.py +++ b/test/components/converters/test_markdown_to_document.py @@ -18,9 +18,10 @@ def test_init_params_default(self): assert converter.progress_bar is True def test_init_params_custom(self): - converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False) + converter = MarkdownToDocument(table_to_single_line=True, progress_bar=False, store_full_path=False) assert converter.table_to_single_line is True assert converter.progress_bar is False + assert converter.store_full_path is False @pytest.mark.integration def test_run(self, test_files_path): @@ -34,6 +35,21 @@ def test_run(self, test_files_path): assert "What to build with Haystack" in 
doc.content assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content + def test_run_with_store_full_path_false(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ + converter = MarkdownToDocument(store_full_path=False) + sources = [test_files_path / "markdown" / "sample.md"] + results = converter.run(sources=sources) + docs = results["documents"] + + assert len(docs) == 1 + for doc in docs: + assert "What to build with Haystack" in doc.content + assert "# git clone https://github.com/deepset-ai/haystack.git" in doc.content + assert doc.meta["file_path"] == "sample.md" + def test_run_calls_normalize_metadata(self, test_files_path): bytestream = ByteStream(data=b"test", meta={"author": "test_author", "language": "en"}) diff --git a/test/components/converters/test_pdfminer_to_document.py b/test/components/converters/test_pdfminer_to_document.py index 53dececbee..4b30f2819a 100644 --- a/test/components/converters/test_pdfminer_to_document.py +++ b/test/components/converters/test_pdfminer_to_document.py @@ -28,9 +28,25 @@ def test_init_params_custom(self, test_files_path): """ Test if init arguments are passed successfully to PDFMinerToDocument layout parameters """ - converter = PDFMinerToDocument(char_margin=0.5, all_texts=True) + converter = PDFMinerToDocument(char_margin=0.5, all_texts=True, store_full_path=False) assert converter.layout_params.char_margin == 0.5 assert converter.layout_params.all_texts is True + assert converter.store_full_path is False + + def test_run_with_store_full_path_false(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ + converter = PDFMinerToDocument(store_full_path=False) + sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"] + results = converter.run(sources=sources) + docs = results["documents"] + + assert len(docs) == 1 + for doc in docs: + assert "the page 3 is empty" in doc.content + assert "Page 4 of 
Sample PDF" in doc.content + assert doc.meta["file_path"] == "sample_pdf_1.pdf" def test_run_wrong_file_type(self, test_files_path, caplog): """ diff --git a/test/components/converters/test_pptx_to_document.py b/test/components/converters/test_pptx_to_document.py index 9a0c314e33..135b1b08a8 100644 --- a/test/components/converters/test_pptx_to_document.py +++ b/test/components/converters/test_pptx_to_document.py @@ -62,3 +62,17 @@ def test_run_with_meta(self, test_files_path): "key": "value", "language": "it", } + + def test_run_with_store_full_path_false(self, test_files_path): + """ + Test if the component runs correctly with store_full_path=False + """ + bytestream = ByteStream.from_file_path(test_files_path / "pptx" / "sample_pptx.pptx") + bytestream.meta["file_path"] = str(test_files_path / "pptx" / "sample_pptx.pptx") + bytestream.meta["key"] = "value" + + converter = PPTXToDocument(store_full_path=False) + output = converter.run(sources=[bytestream], meta=[{"language": "it"}]) + document = output["documents"][0] + + assert document.meta == {"file_path": "sample_pptx.pptx", "key": "value", "language": "it"} diff --git a/test/components/converters/test_textfile_to_document.py b/test/components/converters/test_textfile_to_document.py index ba3bdec101..419b042134 100644 --- a/test/components/converters/test_textfile_to_document.py +++ b/test/components/converters/test_textfile_to_document.py @@ -31,6 +31,23 @@ def test_run(self, test_files_path): assert docs[1].meta["file_path"] == str(files[1]) assert docs[2].meta == bytestream.meta + def test_run_with_store_full_path(self, test_files_path): + """ + Test if the component runs correctly with store_full_path= False. 
+ """ + bytestream = ByteStream.from_file_path(test_files_path / "txt" / "doc_3.txt") + bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt") + bytestream.meta["key"] = "value" + files = [str(test_files_path / "txt" / "doc_1.txt"), bytestream] + converter = TextFileToDocument(store_full_path=False) + output = converter.run(sources=files) + docs = output["documents"] + assert len(docs) == 2 + assert "Some text for testing." in docs[0].content + assert "That's yet another file!" in docs[1].content + assert docs[0].meta["file_path"] == "doc_1.txt" + assert docs[1].meta["file_path"] == "doc_3.txt" + def test_run_error_handling(self, test_files_path, caplog): """ Test if the component correctly handles errors. diff --git a/test/components/converters/test_tika_doc_converter.py b/test/components/converters/test_tika_doc_converter.py index 42ba3ebe1d..f7099e077f 100644 --- a/test/components/converters/test_tika_doc_converter.py +++ b/test/components/converters/test_tika_doc_converter.py @@ -35,6 +35,18 @@ def test_run_with_meta(self, test_files_path): assert output["documents"][0].meta["language"] == "it" assert output["documents"][1].meta["language"] == "it" + def test_run_with_store_full_path_false(self, test_files_path): + bytestream = ByteStream(data=b"test") + bytestream.meta["file_path"] = str(test_files_path / "txt" / "doc_3.txt") + + converter = TikaDocumentConverter(store_full_path=False) + with patch("haystack.components.converters.tika.tika_parser.from_buffer"): + output = converter.run(sources=[bytestream, test_files_path / "markdown" / "sample.md"]) + + # check that the metadata from the sources is merged with that from the meta parameter + assert output["documents"][0].meta["file_path"] == "doc_3.txt" + assert output["documents"][1].meta["file_path"] == "sample.md" + def test_run_nonexistent_file(self, caplog): component = TikaDocumentConverter() with caplog.at_level("WARNING"):