Skip to content

Commit

Permalink
feat: Add store_full_path to converters (1/3) (#8566)
Browse files Browse the repository at this point in the history
* Add store_full_path param to 3 converters
  • Loading branch information
Amnah199 authored Nov 22, 2024
1 parent 0b2a299 commit 21906d0
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 12 deletions.
21 changes: 20 additions & 1 deletion haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -34,16 +36,20 @@ class CSVToDocument:
```
"""

def __init__(self, encoding: str = "utf-8"):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
    """
    Initialize the CSV-to-Document converter.

    :param encoding:
        Default encoding of the CSV files to convert.
        An encoding specified in the metadata of a source ByteStream
        takes precedence over this value.
    :param store_full_path:
        When True, the document metadata keeps the full path of the file.
        When False, only the file name is kept.
    """
    self.store_full_path = store_full_path
    self.encoding = encoding

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -87,6 +93,19 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=data, meta=merged_metadata)
documents.append(document)

Expand Down
24 changes: 21 additions & 3 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import csv
import io
import os
import warnings
from dataclasses import dataclass
from enum import Enum
from io import StringIO
Expand Down Expand Up @@ -107,15 +109,19 @@ class DOCXToDocument:
```
"""

def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV):
def __init__(
    self,
    table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV,
    store_full_path: bool = True,
):
    """
    Initialize the DOCX-to-Document converter.

    :param table_format:
        Output format for tables: DOCXTableFormat.MARKDOWN, DOCXTableFormat.CSV,
        or the strings "markdown" / "csv". Defaults to DOCXTableFormat.CSV.
    :param store_full_path:
        When True, the document metadata keeps the full path of the file.
        When False, only the file name is kept.
    """
    # Run the optional-dependency check before any state is set.
    docx_import.check()

    # String values are converted to the enum; enum members are stored as given.
    if isinstance(table_format, str):
        resolved_format = DOCXTableFormat.from_str(table_format)
    else:
        resolved_format = table_format

    self.table_format = resolved_format
    self.store_full_path = store_full_path

def to_dict(self) -> Dict[str, Any]:
"""
Expand All @@ -124,7 +130,7 @@ def to_dict(self) -> Dict[str, Any]:
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, table_format=str(self.table_format))
return default_to_dict(self, table_format=str(self.table_format), store_full_path=self.store_full_path)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "DOCXToDocument":
Expand Down Expand Up @@ -158,7 +164,6 @@ def run(
If it's a list, the length of the list must match the number of sources, because the two lists will
be zipped.
If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
:returns:
A dictionary with the following keys:
- `documents`: Created Documents
Expand All @@ -184,8 +189,21 @@ def run(
)
continue

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
25 changes: 22 additions & 3 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -33,17 +35,21 @@ class HTMLToDocument:
```
"""

def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None):
def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
    """
    Initialize the HTML-to-Document converter.

    :param extraction_kwargs:
        Keyword arguments that customize the extraction process. They are passed to the
        underlying Trafilatura `extract` function. For the full list of available arguments, see
        the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
    :param store_full_path:
        When True, the document metadata keeps the full path of the file.
        When False, only the file name is kept.
    """
    # Run the optional-dependency check before any state is set.
    trafilatura_import.check()

    # A missing or empty value falls back to a fresh empty dict.
    self.extraction_kwargs = extraction_kwargs if extraction_kwargs else {}
    self.store_full_path = store_full_path

def to_dict(self) -> Dict[str, Any]:
"""
Expand All @@ -52,7 +58,7 @@ def to_dict(self) -> Dict[str, Any]:
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, extraction_kwargs=self.extraction_kwargs)
return default_to_dict(self, extraction_kwargs=self.extraction_kwargs, store_full_path=self.store_full_path)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "HTMLToDocument":
Expand Down Expand Up @@ -115,7 +121,20 @@ def run(
)
continue

document = Document(content=text, meta={**bytestream.meta, **metadata})
merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=text, meta=merged_metadata)
documents.append(document)

return {"documents": documents}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
features:
- |
Added a new `store_full_path` parameter to the `__init__` method of `CSVToDocument`, `DOCXToDocument`, and `HTMLToDocument`. The default value is `True`, which stores the full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
deprecations:
- |
The default value of the `store_full_path` parameter will be changed to `False` in Haystack 2.9.0 to enhance privacy.
19 changes: 19 additions & 0 deletions test/components/converters/test_csv_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,25 @@ def test_run(self, test_files_path):
assert docs[1].meta["file_path"] == str(files[1])
assert docs[2].meta["file_path"] == str(files[2])

def test_run_with_store_full_path_false(self, test_files_path):
    """
    Check that `file_path` holds only the file name when store_full_path=False.

    The sources mix one ByteStream (carrying its own metadata) with two plain paths.
    """
    csv_dir = test_files_path / "csv"
    first_csv = csv_dir / "sample_1.csv"

    stream = ByteStream.from_file_path(first_csv)
    stream.meta["file_path"] = str(first_csv)
    stream.meta["key"] = "value"

    sources = [stream, csv_dir / "sample_2.csv", csv_dir / "sample_3.csv"]
    result = CSVToDocument(store_full_path=False).run(sources=sources)
    docs = result["documents"]

    assert len(docs) == 3

    first_doc = docs[0]
    assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == first_doc.content
    assert isinstance(first_doc.content, str)
    assert first_doc.meta["file_path"] == "sample_1.csv"
    # Extra metadata set on the ByteStream must still reach the document.
    assert first_doc.meta["key"] == "value"

    assert docs[1].meta["file_path"] == "sample_2.csv"
    assert docs[2].meta["file_path"] == "sample_3.csv"

def test_run_error_handling(self, test_files_path, caplog):
"""
Test if the component correctly handles errors.
Expand Down
42 changes: 37 additions & 5 deletions test/components/converters/test_docx_file_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,36 @@ def test_to_dict(self):
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "csv"},
"init_parameters": {"store_full_path": True, "table_format": "csv"},
}

def test_to_dict_custom_parameters(self):
converter = DOCXToDocument(table_format="markdown")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "markdown"},
"init_parameters": {"store_full_path": True, "table_format": "markdown"},
}

converter = DOCXToDocument(table_format="csv")
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "csv"},
"init_parameters": {"store_full_path": True, "table_format": "csv"},
}

converter = DOCXToDocument(table_format=DOCXTableFormat.MARKDOWN)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "markdown"},
"init_parameters": {"store_full_path": True, "table_format": "markdown"},
}

converter = DOCXToDocument(table_format=DOCXTableFormat.CSV)
data = converter.to_dict()
assert data == {
"type": "haystack.components.converters.docx.DOCXToDocument",
"init_parameters": {"table_format": "csv"},
"init_parameters": {"store_full_path": True, "table_format": "csv"},
}

def test_from_dict(self):
Expand Down Expand Up @@ -179,6 +179,38 @@ def test_run_with_table(self, test_files_path):
"Now we are in Page 2" in part for part in content_parts[table_index + 1 :]
), "Text after table not found"

def test_run_with_store_full_path_false(self, test_files_path):
    """
    Check that `file_path` holds only the file name when store_full_path=False,
    and that the DOCX metadata is still attached under the `docx` key.
    """
    expected_docx_meta = DOCXMetadata(
        author="Microsoft Office User",
        category="",
        comments="",
        content_status="",
        created="2024-06-09T21:17:00+00:00",
        identifier="",
        keywords="",
        language="",
        last_modified_by="Carlos Fernández Lorán",
        last_printed=None,
        modified="2024-06-09T21:27:00+00:00",
        revision=2,
        subject="",
        title="",
        version="",
    )
    sources = [test_files_path / "docx" / "sample_docx_1.docx"]

    converter = DOCXToDocument(store_full_path=False)
    result = converter.run(sources=sources)
    docs = result["documents"]

    assert len(docs) == 1
    only_doc = docs[0]
    assert "History" in only_doc.content
    assert only_doc.meta.keys() == {"file_path", "docx"}
    assert only_doc.meta == {"file_path": "sample_docx_1.docx", "docx": expected_docx_meta}

@pytest.mark.parametrize("table_format", ["markdown", "csv"])
def test_table_between_two_paragraphs(self, test_files_path, table_format):
docx_converter = DOCXToDocument(table_format=table_format)
Expand Down
22 changes: 22 additions & 0 deletions test/components/converters/test_html_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,28 @@ def test_run_doc_metadata(self, test_files_path):
assert "Haystack" in docs[0].content
assert docs[0].meta["file_name"] == "what_is_haystack.html"

def test_run_with_store_full_path(self, test_files_path):
    """
    Test how `store_full_path` controls the `file_path` stored in the document metadata.

    With the default (True) the full source path is stored; with False only the file name is stored.
    """
    sources = [test_files_path / "html" / "what_is_haystack.html"]

    # Default behaviour: store_full_path is True, so the full path is kept.
    default_converter = HTMLToDocument()
    full_path_docs = default_converter.run(sources=sources)["documents"]

    assert len(full_path_docs) == 1
    assert "Haystack" in full_path_docs[0].content
    assert full_path_docs[0].meta["file_path"] == str(sources[0])

    # Opt-out: store_full_path=False keeps only the file name.
    name_only_converter = HTMLToDocument(store_full_path=False)
    name_only_docs = name_only_converter.run(sources=sources)["documents"]

    assert len(name_only_docs) == 1
    assert "Haystack" in name_only_docs[0].content
    assert name_only_docs[0].meta["file_path"] == "what_is_haystack.html"

def test_incorrect_meta(self, test_files_path):
"""
Test if the component raises an error when incorrect metadata is supplied by the user.
Expand Down

0 comments on commit 21906d0

Please sign in to comment.