Skip to content

Commit

Permalink
feat: Add store_full_path to converters (2/3) (#8573)
Browse files Browse the repository at this point in the history
  • Loading branch information
Amnah199 authored Nov 25, 2024
1 parent eace2a9 commit 9302d3d
Show file tree
Hide file tree
Showing 13 changed files with 235 additions and 13 deletions.
22 changes: 21 additions & 1 deletion haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union

Expand Down Expand Up @@ -93,6 +95,7 @@ def __init__(
jq_schema: Optional[str] = None,
content_key: Optional[str] = None,
extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
store_full_path: bool = True,
):
"""
Creates a JSONConverter component.
Expand Down Expand Up @@ -129,6 +132,9 @@ def __init__(
:param extra_meta_fields:
An optional set of meta keys to extract from the content.
If `jq_schema` is specified, all keys will be extracted from that object.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
self._compiled_filter = None
if jq_schema:
Expand All @@ -138,6 +144,7 @@ def __init__(
self._jq_schema = jq_schema
self._content_key = content_key
self._meta_fields = extra_meta_fields
self._store_full_path = store_full_path

if self._compiled_filter is None and self._content_key is None:
msg = "No `jq_schema` nor `content_key` specified. Set either or both to extract data."
Expand All @@ -151,7 +158,11 @@ def to_dict(self) -> Dict[str, Any]:
Dictionary with serialized data.
"""
return default_to_dict(
self, jq_schema=self._jq_schema, content_key=self._content_key, extra_meta_fields=self._meta_fields
self,
jq_schema=self._jq_schema,
content_key=self._content_key,
extra_meta_fields=self._meta_fields,
store_full_path=self._store_full_path,
)

@classmethod
Expand Down Expand Up @@ -269,8 +280,17 @@ def run(

data = self._get_content_and_meta(bytestream)

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
for text, extra_meta in data:
merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

if not self._store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
19 changes: 18 additions & 1 deletion haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -38,19 +40,23 @@ class MarkdownToDocument:
```
"""

def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True):
def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
"""
Create a MarkdownToDocument component.
:param table_to_single_line:
If True converts table contents into a single line.
:param progress_bar:
If True shows a progress bar when running.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
markdown_conversion_imports.check()

self.table_to_single_line = table_to_single_line
self.progress_bar = progress_bar
self.store_full_path = store_full_path

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -105,6 +111,17 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
16 changes: 16 additions & 0 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -46,6 +48,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = True,
all_texts: bool = False,
store_full_path: bool = True,
) -> None:
"""
Create a PDFMinerToDocument component.
Expand Down Expand Up @@ -78,6 +81,9 @@ def __init__( # pylint: disable=too-many-positional-arguments
This parameter determines whether vertical text should be considered during layout analysis.
:param all_texts:
If layout analysis should be performed on text in figures.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""

pdfminer_import.check()
Expand All @@ -91,6 +97,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
detect_vertical=detect_vertical,
all_texts=all_texts,
)
self.store_full_path = store_full_path

def _converter(self, extractor) -> Document:
"""
Expand Down Expand Up @@ -165,6 +172,15 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
documents.append(document)

Expand Down
18 changes: 17 additions & 1 deletion haystack/components/converters/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -35,11 +37,16 @@ class PPTXToDocument:
```
"""

def __init__(self):
def __init__(self, store_full_path: bool = True):
"""
Create an PPTXToDocument component.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
pptx_import.check()
self.store_full_path = store_full_path

def _convert(self, file_content: io.BytesIO) -> str:
"""
Expand Down Expand Up @@ -97,6 +104,15 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
documents.append(Document(content=text, meta=merged_metadata))

return {"documents": documents}
18 changes: 17 additions & 1 deletion haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

import io
import os
import warnings
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -73,15 +75,19 @@ class TikaDocumentConverter:
```
"""

def __init__(self, tika_url: str = "http://localhost:9998/tika"):
def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True):
"""
Create a TikaDocumentConverter component.
:param tika_url:
Tika server URL.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
tika_import.check()
self.tika_url = tika_url
self.store_full_path = store_full_path

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -133,6 +139,16 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)

document = Document(content=text, meta=merged_metadata)
documents.append(document)
return {"documents": documents}
17 changes: 16 additions & 1 deletion haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -34,16 +36,20 @@ class TextFileToDocument:
```
"""

def __init__(self, encoding: str = "utf-8"):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
"""
Creates a TextFileToDocument component.
:param encoding:
The encoding of the text files to convert.
If the encoding is specified in the metadata of a source ByteStream,
it overrides this value.
:param store_full_path:
If True, the full path of the file is stored in the metadata of the document.
If False, only the file name is stored.
"""
self.encoding = encoding
self.store_full_path = store_full_path

@component.output_types(documents=List[Document])
def run(
Expand Down Expand Up @@ -87,6 +93,15 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document = Document(content=text, meta=merged_metadata)
documents.append(document)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
features:
- |
Added a new `store_full_path` parameter to the `__init__` methods of `JSONConverter`, `MarkdownToDocument`, `PDFMinerToDocument`, `PPTXToDocument`, `TikaDocumentConverter` and `TextFileToDocument`. The default value is `True`, which stores full file path in the metadata of the output documents. When set to `False`, only the file name is stored.
deprecations:
- |
The default value of the `store_full_path` parameter will change to `False` in Haysatck 2.9.0 to enhance privacy.
Loading

0 comments on commit 9302d3d

Please sign in to comment.