Skip to content

Commit

Permalink
feat: Add raise_on_failure to BaseConverter (#6930)
Browse files Browse the repository at this point in the history
* add raise_on_failure flag to BaseConverter

* add unit tests

* release notes

* Empty-Commit

* assert the correct length

* log warn with all failed conversion paths

* Update wording

Co-authored-by: Stefano Fiorucci <[email protected]>

* Copy paste whoops

Co-authored-by: Stefano Fiorucci <[email protected]>

* cast path as str

* small refinement

---------

Co-authored-by: anakin87 <[email protected]>
  • Loading branch information
isaac-chung and anakin87 authored Feb 7, 2024
1 parent af0166f commit 56b8b0d
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 8 deletions.
28 changes: 20 additions & 8 deletions haystack/nodes/file_converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def run( # type: ignore
valid_languages: Optional[List[str]] = None,
encoding: Optional[str] = "UTF-8",
id_hash_keys: Optional[List[str]] = None,
raise_on_failure: bool = True,
):
"""
Extract text from a file.
Expand Down Expand Up @@ -188,6 +189,7 @@ def run( # type: ignore
attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
In this case the id will be generated by using the content and the defined metadata.
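:param raise_on_failure: If True, raises an exception as soon as the conversion of a single file fails. If False, skips the failing file, continues with the remaining files, and logs a warning listing all file paths that could not be converted.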
"""
if known_ligatures is None:
known_ligatures = KNOWN_LIGATURES
Expand All @@ -199,24 +201,34 @@ def run( # type: ignore
meta = [meta] * len(file_paths)

documents: list = []
failed_paths: list = []
for file_path, file_meta in tqdm(
zip(file_paths, meta), total=len(file_paths), disable=not self.progress_bar, desc="Converting files"
):
documents += self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
encoding=encoding,
id_hash_keys=id_hash_keys,
)
try:
documents += self.convert(
file_path=file_path,
meta=file_meta,
remove_numeric_tables=remove_numeric_tables,
valid_languages=valid_languages,
encoding=encoding,
id_hash_keys=id_hash_keys,
)
except Exception as e:
if raise_on_failure:
raise e
failed_paths.append(str(file_path))
continue

# Cleanup ligatures
for document in documents:
for ligature, letters in known_ligatures.items():
if document.content is not None:
document.content = document.content.replace(ligature, letters)

if failed_paths:
logger.warning("Conversion of the following file paths failed: %s", ",".join(failed_paths))

result = {"documents": documents}
return result, "output_1"

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Add a `raise_on_failure` flag to the `BaseConverter` class so that large conversion jobs can optionally skip files that fail to convert (logging their paths) instead of aborting on the first exception.
28 changes: 28 additions & 0 deletions test/nodes/test_file_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,34 @@ def test_csv_to_document_with_wrong_qa_headers(tmp_path):
node.run(file_paths=csv_path)


@pytest.mark.unit
def test_csv_to_document_with_wrong_qa_headers_raise_on_failure_true(tmp_path):
    """A CSV with unexpected headers must raise ``ValueError`` when ``raise_on_failure=True``."""
    converter = CsvTextConverter()
    bad_csv = tmp_path / "csv_qa_with_wrong_headers.csv"
    # Header row deliberately uses the column names "wrong" and "headers".
    write_as_csv(
        [
            ["wrong", "headers"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        bad_csv,
    )

    with pytest.raises(ValueError):
        converter.run(file_paths=bad_csv, raise_on_failure=True)


@pytest.mark.unit
def test_csv_to_document_with_wrong_qa_headers_raise_on_failure_false(tmp_path):
    """A CSV with unexpected headers yields no documents when ``raise_on_failure=False``."""
    converter = CsvTextConverter()
    bad_csv = tmp_path / "csv_qa_with_wrong_headers.csv"
    # Header row deliberately uses the column names "wrong" and "headers".
    write_as_csv(
        [
            ["wrong", "headers"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        bad_csv,
    )

    # The call must return normally; the second tuple element is not checked.
    output, _ = converter.run(file_paths=bad_csv, raise_on_failure=False)
    converted = output["documents"]
    assert len(converted) == 0


@pytest.mark.unit
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
node = CsvTextConverter()
Expand Down

0 comments on commit 56b8b0d

Please sign in to comment.