feat: Add store_full_path to converters (1/3) (#8566)

* Add store_full_path param to 3 converters
deepset-ai · Nov 22, 2024 · 21906d0 · 21906d0
1 parent 0b2a299
commit 21906d0
Show file tree

Hide file tree

Showing 7 changed files with 149 additions and 12 deletions.
diff --git a/haystack/components/converters/csv.py b/haystack/components/converters/csv.py
@@ -3,6 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import io
+import os
+import warnings
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -34,16 +36,20 @@ class CSVToDocument:
     ```
     """
 
-    def __init__(self, encoding: str = "utf-8"):
+    def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
         """
         Creates a CSVToDocument component.
 
         :param encoding:
             The encoding of the csv files to convert.
             If the encoding is specified in the metadata of a source ByteStream,
             it overrides this value.
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
         """
         self.encoding = encoding
+        self.store_full_path = store_full_path
 
     @component.output_types(documents=List[Document])
     def run(
@@ -87,6 +93,19 @@ def run(
                 continue
 
             merged_metadata = {**bytestream.meta, **metadata}
+
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+
+            if not self.store_full_path and "file_path" in bytestream.meta:
+                file_path = bytestream.meta.get("file_path")
+                if file_path:  # Ensure the value is not None for pylint
+                    merged_metadata["file_path"] = os.path.basename(file_path)
+
             document = Document(content=data, meta=merged_metadata)
             documents.append(document)
 

diff --git a/haystack/components/converters/docx.py b/haystack/components/converters/docx.py
@@ -4,6 +4,8 @@
 
 import csv
 import io
+import os
+import warnings
 from dataclasses import dataclass
 from enum import Enum
 from io import StringIO
@@ -107,15 +109,19 @@ class DOCXToDocument:
     ```
     """
 
-    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
+    def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True):
         """
         Create a DOCXToDocument component.
 
         :param table_format: The format for table output. Can be either DOCXTableFormat.MARKDOWN,
             DOCXTableFormat.CSV, "markdown", or "csv". Defaults to DOCXTableFormat.CSV.
+        :param store_full_path:
+            If True, the full path of the file is stored in the metadata of the document.
+            If False, only the file name is stored.
         """
         docx_import.check()
         self.table_format = DOCXTableFormat.from_str(table_format) if isinstance(table_format, str) else table_format
+        self.store_full_path = store_full_path
 
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -124,7 +130,7 @@ def to_dict(self) -> Dict[str, Any]:
         :returns:
             Dictionary with serialized data.
         """
-        return default_to_dict(self, table_format=str(self.table_format))
+        return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
@@ -158,7 +164,6 @@ def run(
             If it's a list, the length of the list must match the number of sources, because the two lists will
             be zipped.
             If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
-
         :returns:
             A dictionary with the following keys:
             - `documents`: Created Documents
@@ -184,8 +189,21 @@ def run(
                 )
                 continue
 
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+
             docx_metadata = self._get_docx_metadata(document=docx_document)
             merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}
+
+            if not self.store_full_path and "file_path" in bytestream.meta:
+                file_path = bytestream.meta.get("file_path")
+                if file_path:  # Ensure the value is not None for pylint
+                    merged_metadata["file_path"] = os.path.basename(file_path)
+
             document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 

diff --git a/haystack/components/converters/html.py b/haystack/components/converters/html.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import os
+import warnings
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -33,17 +35,21 @@ class HTMLToDocument:
     ```
     """
 
-    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None):
+    def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
         """
         Create an HTMLToDocument component.
 
         :param extraction_kwargs: A dictionary containing keyword arguments to customize the extraction process. These
             are passed to the underlying Trafilatura `extract` function. For the full list of available arguments, see
             the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
+        :param store_full_path:
+        If True, the full path of the file is stored in the metadata of the document.
+        If False, only the file name is stored.
         """
         trafilatura_import.check()
 
         self.extraction_kwargs = extraction_kwargs or {}
+        self.store_full_path = store_full_path
 
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -52,7 +58,7 @@ def to_dict(self) -> Dict[str, Any]:
         :returns:
             Dictionary with serialized data.
         """
-        return default_to_dict(self, extraction_kwargs=self.extraction_kwargs)
+        return default_to_dict(self, extraction_kwargs=self.extraction_kwargs, store_full_path=self.store_full_path)
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
@@ -115,7 +121,20 @@ def run(
                 )
                 continue
 
-            document = Document(content=text, meta={**bytestream.meta, **metadata})
+            merged_metadata = {**bytestream.meta, **metadata}
+
+            warnings.warn(
+                "The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
+                "In the 2.9.0 release, the default value for `store_full_path` will change to False, "
+                "storing only file names to improve privacy.",
+                DeprecationWarning,
+            )
+            if not self.store_full_path and "file_path" in bytestream.meta:
+                file_path = bytestream.meta.get("file_path")
+                if file_path:  # Ensure the value is not None for pylint
+                    merged_metadata["file_path"] = os.path.basename(file_path)
+
+            document = Document(content=text, meta=merged_metadata)
             documents.append(document)
 
         return {"documents": documents}
diff --git a/releasenotes/notes/add-store-full-path-param-to-converters-60007b232e279a33.yaml b/releasenotes/notes/add-store-full-path-param-to-converters-60007b232e279a33.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Added a new `store_full_path` parameter to the `__init__` method of `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
+
+deprecations:
+  - |
+    The default value of the `store_full_path` parameter will be changed to `False` in Haysatck 2.9.0 to enhance privacy.
diff --git a/test/components/converters/test_csv_to_document.py b/test/components/converters/test_csv_to_document.py
@@ -39,6 +39,25 @@ def test_run(self, test_files_path):
         assert docs[1].meta["file_path"] == str(files[1])
         assert docs[2].meta["file_path"] == str(files[2])
 
+    def test_run_with_store_full_path_false(self, test_files_path):
+        """
+        Test if the component runs correctly with store_full_path=False
+        """
+        bytestream = ByteStream.from_file_path(test_files_path / "csv" / "sample_1.csv")
+        bytestream.meta["file_path"] = str(test_files_path / "csv" / "sample_1.csv")
+        bytestream.meta["key"] = "value"
+        files = [bytestream, test_files_path / "csv" / "sample_2.csv", test_files_path / "csv" / "sample_3.csv"]
+        converter = CSVToDocument(store_full_path=False)
+        output = converter.run(sources=files)
+        docs = output["documents"]
+        assert len(docs) == 3
+        assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
+        assert isinstance(docs[0].content, str)
+        assert docs[0].meta["file_path"] == "sample_1.csv"
+        assert docs[0].meta["key"] == "value"
+        assert docs[1].meta["file_path"] == "sample_2.csv"
+        assert docs[2].meta["file_path"] == "sample_3.csv"
+
     def test_run_error_handling(self, test_files_path, caplog):
         """
         Test if the component correctly handles errors.

diff --git a/test/components/converters/test_docx_file_to_document.py b/test/components/converters/test_docx_file_to_document.py
@@ -32,36 +32,36 @@ def test_to_dict(self):
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "csv"},
+            "init_parameters": {"store_full_path": True, "table_format": "csv"},
         }
 
     def test_to_dict_custom_parameters(self):
         converter = DOCXToDocument(table_format="markdown")
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "markdown"},
+            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
         }
 
         converter = DOCXToDocument(table_format="csv")
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "csv"},
+            "init_parameters": {"store_full_path": True, "table_format": "csv"},
         }
 
         converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "markdown"},
+            "init_parameters": {"store_full_path": True, "table_format": "markdown"},
         }
 
         converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
         data = converter.to_dict()
         assert data == {
             "type": "haystack.components.converters.docx.DOCXToDocument",
-            "init_parameters": {"table_format": "csv"},
+            "init_parameters": {"store_full_path": True, "table_format": "csv"},
         }
 
     def test_from_dict(self):
@@ -179,6 +179,38 @@ def test_run_with_table(self, test_files_path):
             "Now we are in Page 2" in part for part in content_parts[table_index + 1 :]
         ), "Text after table not found"
 
+    def test_run_with_store_full_path_false(self, test_files_path):
+        """
+        Test if the component runs correctly with store_full_path=False
+        """
+        docx_converter = DOCXToDocument(store_full_path=False)
+        paths = [test_files_path / "docx" / "sample_docx_1.docx"]
+        output = docx_converter.run(sources=paths)
+        docs = output["documents"]
+        assert len(docs) == 1
+        assert "History" in docs[0].content
+        assert docs[0].meta.keys() == {"file_path", "docx"}
+        assert docs[0].meta == {
+            "file_path": "sample_docx_1.docx",
+            "docx": DOCXMetadata(
+                author="Microsoft Office User",
+                category="",
+                comments="",
+                content_status="",
+                created="2024-06-09T21:17:00+00:00",
+                identifier="",
+                keywords="",
+                language="",
+                last_modified_by="Carlos Fernández Lorán",
+                last_printed=None,
+                modified="2024-06-09T21:27:00+00:00",
+                revision=2,
+                subject="",
+                title="",
+                version="",
+            ),
+        }
+
     @pytest.mark.parametrize("table_format", ["markdown", "csv"])
     def test_table_between_two_paragraphs(self, test_files_path, table_format):
         docx_converter = DOCXToDocument(table_format=table_format)

diff --git a/test/components/converters/test_html_to_document.py b/test/components/converters/test_html_to_document.py
@@ -38,6 +38,28 @@ def test_run_doc_metadata(self, test_files_path):
         assert "Haystack" in docs[0].content
         assert docs[0].meta["file_name"] == "what_is_haystack.html"
 
+    def test_run_with_store_full_path(self, test_files_path):
+        """
+        Test if the component runs correctly when metadata is supplied by the user.
+        """
+        converter = HTMLToDocument()
+        sources = [test_files_path / "html" / "what_is_haystack.html"]
+
+        results = converter.run(sources=sources)  # store_full_path is True by default
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].content
+        assert docs[0].meta["file_path"] == str(sources[0])
+
+        converter_2 = HTMLToDocument(store_full_path=False)
+        results = converter_2.run(sources=sources)
+        docs = results["documents"]
+
+        assert len(docs) == 1
+        assert "Haystack" in docs[0].content
+        assert docs[0].meta["file_path"] == "what_is_haystack.html"
+
     def test_incorrect_meta(self, test_files_path):
         """
         Test if the component raises an error when incorrect metadata is supplied by the user.