Skip to content

Commit

Permalink
Merge branch 'main' into unify-DocumentSplitter-NLTKDocumentSplitter
Browse files Browse the repository at this point in the history
  • Loading branch information
davidsbatista authored Dec 12, 2024
2 parents 06803d9 + 04fc187 commit 73a0e68
Show file tree
Hide file tree
Showing 21 changed files with 90 additions and 138 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/readme_sync.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
- name: Sync docs
if: github.event_name == 'push'
uses: readmeio/rdme@v8
uses: readmeio/rdme@v9
with:
rdme: docs ./docs/pydoc/temp --key=${{ secrets.README_API_KEY }} --version=${{ steps.version-getter.outputs.version }}

Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import copy
import hashlib
import os
import warnings
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Union
Expand Down Expand Up @@ -61,7 +60,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
merge_multiple_column_headers: bool = True,
page_layout: Literal["natural", "single_column"] = "natural",
threshold_y: Optional[float] = 0.05,
store_full_path: bool = True,
store_full_path: bool = False,
):
"""
Creates an AzureOCRDocumentConverter component.
Expand Down Expand Up @@ -143,12 +142,6 @@ def run(self, sources: List[Union[str, Path, ByteStream]], meta: Optional[List[D
azure_output.append(result.to_dict())

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
10 changes: 1 addition & 9 deletions haystack/components/converters/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -36,7 +35,7 @@ class CSVToDocument:
```
"""

def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
"""
Creates a CSVToDocument component.
Expand Down Expand Up @@ -94,13 +93,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Expand Down
10 changes: 1 addition & 9 deletions haystack/components/converters/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import csv
import io
import os
import warnings
from dataclasses import dataclass
from enum import Enum
from io import StringIO
Expand Down Expand Up @@ -109,7 +108,7 @@ class DOCXToDocument:
```
"""

def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = True):
def __init__(self, table_format: Union[str, DOCXTableFormat] = DOCXTableFormat.CSV, store_full_path: bool = False):
"""
Create a DOCXToDocument component.
Expand Down Expand Up @@ -189,13 +188,6 @@ def run(
)
continue

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

docx_metadata = self._get_docx_metadata(document=docx_document)
merged_metadata = {**bytestream.meta, **metadata, "docx": docx_metadata}

Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -35,7 +34,7 @@ class HTMLToDocument:
```
"""

def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = True):
def __init__(self, extraction_kwargs: Optional[Dict[str, Any]] = None, store_full_path: bool = False):
"""
Create an HTMLToDocument component.
Expand Down Expand Up @@ -123,12 +122,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
if not self.store_full_path and "file_path" in bytestream.meta:
file_path = bytestream.meta.get("file_path")
if file_path: # Ensure the value is not None for pylint
Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import json
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Union

Expand Down Expand Up @@ -95,7 +94,7 @@ def __init__(
jq_schema: Optional[str] = None,
content_key: Optional[str] = None,
extra_meta_fields: Optional[Union[Set[str], Literal["*"]]] = None,
store_full_path: bool = True,
store_full_path: bool = False,
):
"""
Creates a JSONConverter component.
Expand Down Expand Up @@ -280,12 +279,6 @@ def run(

data = self._get_content_and_meta(bytestream)

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)
for text, extra_meta in data:
merged_metadata = {**bytestream.meta, **metadata, **extra_meta}

Expand Down
10 changes: 1 addition & 9 deletions haystack/components/converters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -40,7 +39,7 @@ class MarkdownToDocument:
```
"""

def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = True):
def __init__(self, table_to_single_line: bool = False, progress_bar: bool = True, store_full_path: bool = False):
"""
Create a MarkdownToDocument component.
Expand Down Expand Up @@ -112,13 +111,6 @@ def run(

merged_metadata = {**bytestream.meta, **metadata}

warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)

Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -48,7 +47,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
boxes_flow: Optional[float] = 0.5,
detect_vertical: bool = True,
all_texts: bool = False,
store_full_path: bool = True,
store_full_path: bool = False,
) -> None:
"""
Create a PDFMinerToDocument component.
Expand Down Expand Up @@ -172,12 +171,6 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -37,7 +36,7 @@ class PPTXToDocument:
```
"""

def __init__(self, store_full_path: bool = True):
def __init__(self, store_full_path: bool = False):
"""
Create a PPTXToDocument component.
Expand Down Expand Up @@ -104,12 +103,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
10 changes: 2 additions & 8 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -79,7 +78,7 @@ def __init__(
layout_mode_scale_weight: float = 1.25,
layout_mode_strip_rotated: bool = True,
layout_mode_font_height_weight: float = 1.0,
store_full_path: bool = True,
store_full_path: bool = False,
):
"""
Create a PyPDFToDocument component.
Expand Down Expand Up @@ -220,12 +219,7 @@ def run(
)

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
document.meta = merged_metadata
Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import os
import warnings
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
Expand Down Expand Up @@ -75,7 +74,7 @@ class TikaDocumentConverter:
```
"""

def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = True):
def __init__(self, tika_url: str = "http://localhost:9998/tika", store_full_path: bool = False):
"""
Create a TikaDocumentConverter component.
Expand Down Expand Up @@ -139,12 +138,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
9 changes: 1 addition & 8 deletions haystack/components/converters/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# SPDX-License-Identifier: Apache-2.0

import os
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -36,7 +35,7 @@ class TextFileToDocument:
```
"""

def __init__(self, encoding: str = "utf-8", store_full_path: bool = True):
def __init__(self, encoding: str = "utf-8", store_full_path: bool = False):
"""
Creates a TextFileToDocument component.
Expand Down Expand Up @@ -93,12 +92,6 @@ def run(
continue

merged_metadata = {**bytestream.meta, **metadata}
warnings.warn(
"The `store_full_path` parameter defaults to True, storing full file paths in metadata. "
"In the 2.9.0 release, the default value for `store_full_path` will change to False, "
"storing only file names to improve privacy.",
DeprecationWarning,
)

if not self.store_full_path and (file_path := bytestream.meta.get("file_path")):
merged_metadata["file_path"] = os.path.basename(file_path)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
upgrade:
- |
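The default value of `store_full_path` in the converters changed from `True` to `False`. By default, only the file name (not the full file path) is now stored in the `file_path` metadata field, to improve privacy. Pass `store_full_path=True` to keep the previous behavior.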
2 changes: 1 addition & 1 deletion test/components/converters/test_azure_ocr_doc_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_to_dict(self, mock_resolve_value):
"page_layout": "natural",
"preceding_context_len": 3,
"threshold_y": 0.05,
"store_full_path": True,
"store_full_path": False,
},
}

Expand Down
9 changes: 5 additions & 4 deletions test/components/converters/test_csv_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from unittest.mock import patch
import pandas as pd
from pathlib import Path
import os

import pytest

Expand Down Expand Up @@ -35,9 +36,9 @@ def test_run(self, test_files_path):
assert len(docs) == 3
assert "Name,Age\r\nJohn Doe,27\r\nJane Smith,37\r\nMike Johnson,47\r\n" == docs[0].content
assert isinstance(docs[0].content, str)
assert docs[0].meta == bytestream.meta
assert docs[1].meta["file_path"] == str(files[1])
assert docs[2].meta["file_path"] == str(files[2])
assert docs[0].meta == {"file_path": os.path.basename(bytestream.meta["file_path"]), "key": "value"}
assert docs[1].meta["file_path"] == os.path.basename(files[1])
assert docs[2].meta["file_path"] == os.path.basename(files[2])

def test_run_with_store_full_path_false(self, test_files_path):
"""
Expand Down Expand Up @@ -73,7 +74,7 @@ def test_run_error_handling(self, test_files_path, caplog):
assert "non_existing_file.csv" in caplog.text
docs = output["documents"]
assert len(docs) == 2
assert docs[0].meta["file_path"] == str(paths[0])
assert docs[0].meta["file_path"] == os.path.basename(paths[0])

def test_encoding_override(self, test_files_path, caplog):
"""
Expand Down
Loading

0 comments on commit 73a0e68

Please sign in to comment.