feat: Add raise_on_failure to BaseConverter (#6930)

* add raise_on_failure flag to BaseConverter
* add unit tests
* release notes
* Empty-Commit
* assert the correct length
* log warn with all failed conversion paths
* Update wording

Co-authored-by: Stefano Fiorucci <[email protected]>

* Copy paste whoops

Co-authored-by: Stefano Fiorucci <[email protected]>

* cast path as str
* small refinement

---------

Co-authored-by: anakin87 <[email protected]>
deepset-ai · Feb 7, 2024 · 56b8b0d · 56b8b0d
1 parent af0166f
commit 56b8b0d
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 8 deletions.
diff --git a/haystack/nodes/file_converter/base.py b/haystack/nodes/file_converter/base.py
@@ -158,6 +158,7 @@ def run(  # type: ignore
         valid_languages: Optional[List[str]] = None,
         encoding: Optional[str] = "UTF-8",
         id_hash_keys: Optional[List[str]] = None,
+        raise_on_failure: bool = True,
     ):
         """
         Extract text from a file.
@@ -188,6 +189,7 @@ def run(  # type: ignore
             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
             In this case the id will be generated by using the content and the defined metadata.
+        :param raise_on_failure: If True, raises an exception if the conversion of a single file fails. If False, skips the file without failing and logs a warning listing the file paths that could not be converted.
         """
         if known_ligatures is None:
             known_ligatures = KNOWN_LIGATURES
@@ -199,24 +201,34 @@ def run(  # type: ignore
             meta = [meta] * len(file_paths)
 
         documents: list = []
+        failed_paths: list = []
         for file_path, file_meta in tqdm(
             zip(file_paths, meta), total=len(file_paths), disable=not self.progress_bar, desc="Converting files"
         ):
-            documents += self.convert(
-                file_path=file_path,
-                meta=file_meta,
-                remove_numeric_tables=remove_numeric_tables,
-                valid_languages=valid_languages,
-                encoding=encoding,
-                id_hash_keys=id_hash_keys,
-            )
+            try:
+                documents += self.convert(
+                    file_path=file_path,
+                    meta=file_meta,
+                    remove_numeric_tables=remove_numeric_tables,
+                    valid_languages=valid_languages,
+                    encoding=encoding,
+                    id_hash_keys=id_hash_keys,
+                )
+            except Exception as e:
+                if raise_on_failure:
+                    raise e
+                failed_paths.append(str(file_path))
+                continue
 
         # Cleanup ligatures
         for document in documents:
             for ligature, letters in known_ligatures.items():
                 if document.content is not None:
                     document.content = document.content.replace(ligature, letters)
 
+        if failed_paths:
+            logger.warning("Conversion of the following file paths failed: %s", ",".join(failed_paths))
+
         result = {"documents": documents}
         return result, "output_1"
 

diff --git a/releasenotes/notes/add-raise-on-failure-to-base-converter-8c5e9b3dd51c0e0c.yaml b/releasenotes/notes/add-raise-on-failure-to-base-converter-8c5e9b3dd51c0e0c.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Add a `raise_on_failure` flag to the `BaseConverter` class so that large conversion jobs can optionally skip files that fail to convert, instead of aborting on the first exception.
diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
@@ -422,6 +422,34 @@ def test_csv_to_document_with_wrong_qa_headers(tmp_path):
         node.run(file_paths=csv_path)
 
 
+@pytest.mark.unit
+def test_csv_to_document_with_wrong_qa_headers_raise_on_failure_true(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [
+        ["wrong", "headers"],
+        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
+    ]
+    write_as_csv(rows, csv_path)
+
+    with pytest.raises(ValueError):
+        node.run(file_paths=csv_path, raise_on_failure=True)
+
+
+@pytest.mark.unit
+def test_csv_to_document_with_wrong_qa_headers_raise_on_failure_false(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [
+        ["wrong", "headers"],
+        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
+    ]
+    write_as_csv(rows, csv_path)
+
+    result, _ = node.run(file_paths=csv_path, raise_on_failure=False)
+    assert len(result["documents"]) == 0
+
+
 @pytest.mark.unit
 def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
     node = CsvTextConverter()