
Commit 080a529

Merge branch 'main' into add-recursive-chunking
davidsbatista committed Dec 16, 2024
2 parents: d9addfa + a5b57f4
Showing 10 changed files with 16 additions and 7 deletions.
e2e/pipelines/test_dense_doc_search.py (2 changes: 1 addition & 1 deletion)
@@ -26,7 +26,7 @@ def test_dense_doc_search_pipeline(tmp_path, samples_path):
     indexing_pipeline.add_component(instance=DocumentJoiner(), name="joiner")
     indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
     indexing_pipeline.add_component(
-        instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter"
+        instance=DocumentSplitter(split_by="period", split_length=250, split_overlap=30), name="splitter"
     )
     indexing_pipeline.add_component(
         instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder"
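
Note: this and the following e2e pipeline switch the splitter from split_by="sentence" to split_by="period". A minimal sketch of the updated splitter in isolation, assuming the haystack DocumentSplitter API shown in the diff; the sample document and the print statement are illustrative only, not part of this commit:

    from haystack import Document
    from haystack.components.preprocessors import DocumentSplitter

    # Same configuration as the indexing pipeline above: split on periods,
    # up to 250 units per chunk, with a 30-unit overlap between chunks.
    splitter = DocumentSplitter(split_by="period", split_length=250, split_overlap=30)
    docs = [Document(content="First sentence. Second sentence. Third sentence.")]
    result = splitter.run(documents=docs)
    print(len(result["documents"]))  # number of chunks produced
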
e2e/pipelines/test_preprocessing_pipeline.py (6 changes: 1 addition & 5 deletions)
@@ -2,8 +2,6 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import json
-
 from haystack import Pipeline
 from haystack.components.classifiers import DocumentLanguageClassifier
 from haystack.components.converters import TextFileToDocument
@@ -25,9 +23,7 @@ def test_preprocessing_pipeline(tmp_path):
         instance=MetadataRouter(rules={"en": {"field": "language", "operator": "==", "value": "en"}}), name="router"
     )
     preprocessing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
-    preprocessing_pipeline.add_component(
-        instance=DocumentSplitter(split_by="sentence", split_length=1), name="splitter"
-    )
+    preprocessing_pipeline.add_component(instance=DocumentSplitter(split_by="period", split_length=1), name="splitter")
     preprocessing_pipeline.add_component(
         instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"), name="embedder"
     )
@@ -45,6 +45,7 @@ def test_to_dict(self):
 
     def test_from_dict(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.classifiers.zero_shot_document_classifier.TransformersZeroShotDocumentClassifier",
             "init_parameters": {
@@ -73,6 +74,7 @@ def test_from_dict(self, monkeypatch):
 
     def test_from_dict_no_default_parameters(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.classifiers.zero_shot_document_classifier.TransformersZeroShotDocumentClassifier",
             "init_parameters": {"model": "cross-encoder/nli-deberta-v3-xsmall", "labels": ["positive", "negative"]},
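
Note: each of the remaining changed files adds the same line, monkeypatch.delenv("HF_TOKEN", raising=False), next to the existing HF_API_TOKEN cleanup, so a token present in the developer's environment cannot leak into these mocked tests. A sketch of how the same pattern could be shared through a pytest fixture; the fixture name and the example test are hypothetical, and the commit itself repeats the two delenv calls inline in each test:

    import pytest

    @pytest.fixture
    def no_hf_tokens(monkeypatch):
        # raising=False makes each call a no-op when the variable is unset.
        monkeypatch.delenv("HF_API_TOKEN", raising=False)
        monkeypatch.delenv("HF_TOKEN", raising=False)

    def test_from_dict_without_token(no_hf_tokens):
        # Any test that requests the fixture runs with a clean environment.
        ...
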
test/components/generators/chat/test_hugging_face_local.py (1 change: 1 addition & 0 deletions)
@@ -166,6 +166,7 @@ def test_from_dict(self, model_info_mock):
     @patch("haystack.components.generators.chat.hugging_face_local.pipeline")
     def test_warm_up(self, pipeline_mock, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         generator = HuggingFaceLocalChatGenerator(
             model="mistralai/Mistral-7B-Instruct-v0.2",
             task="text2text-generation",
@@ -18,6 +18,7 @@ class TestHuggingFaceLocalGenerator:
     @patch("haystack.utils.hf.model_info")
     def test_init_default(self, model_info_mock, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         model_info_mock.return_value.pipeline_tag = "text2text-generation"
         generator = HuggingFaceLocalGenerator()

@@ -273,7 +273,7 @@ def test_warm_up(self, similarity, monkeypatch):
         Test that ranker loads the SentenceTransformer model correctly during warm up.
         """
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
-
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         mock_model_class = MagicMock()
         mock_model_instance = MagicMock()
         mock_model_class.return_value = mock_model_instance
test/components/rankers/test_transformers_similarity.py (1 change: 1 addition & 0 deletions)
@@ -313,6 +313,7 @@ def test_device_map_and_device_raises(self, caplog):
     @patch("haystack.components.rankers.transformers_similarity.AutoModelForSequenceClassification.from_pretrained")
     def test_device_map_dict(self, mocked_automodel, _mocked_autotokenizer, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         ranker = TransformersSimilarityRanker("model", model_kwargs={"device_map": {"layer_1": 1, "classifier": "cpu"}})
 
         class MockedModel:
test/components/readers/test_extractive.py (3 changes: 3 additions & 0 deletions)
@@ -519,6 +519,7 @@ def __init__(self):
 @patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
 def test_device_map_auto(mocked_automodel, _mocked_autotokenizer, monkeypatch):
     monkeypatch.delenv("HF_API_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
     reader = ExtractiveReader("deepset/roberta-base-squad2", model_kwargs={"device_map": "auto"})
     auto_device = ComponentDevice.resolve_device(None)

@@ -537,6 +538,7 @@ def __init__(self):
 @patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
 def test_device_map_str(mocked_automodel, _mocked_autotokenizer, monkeypatch):
     monkeypatch.delenv("HF_API_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
     reader = ExtractiveReader("deepset/roberta-base-squad2", model_kwargs={"device_map": "cpu:0"})
 
     class MockedModel:
@@ -554,6 +556,7 @@ def __init__(self):
 @patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
 def test_device_map_dict(mocked_automodel, _mocked_autotokenizer, monkeypatch):
     monkeypatch.delenv("HF_API_TOKEN", raising=False)
+    monkeypatch.delenv("HF_TOKEN", raising=False)
     reader = ExtractiveReader(
         "deepset/roberta-base-squad2", model_kwargs={"device_map": {"layer_1": 1, "classifier": "cpu"}}
     )
test/components/routers/test_transformers_text_router.py (3 changes: 3 additions & 0 deletions)
@@ -54,6 +54,7 @@ def test_to_dict_with_cpu_device(self, mock_auto_config_from_pretrained):
     def test_from_dict(self, mock_auto_config_from_pretrained, monkeypatch):
         mock_auto_config_from_pretrained.return_value = MagicMock(label2id={"en": 0, "de": 1})
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.transformers_text_router.TransformersTextRouter",
             "init_parameters": {
@@ -84,6 +85,7 @@ def test_from_dict(self, mock_auto_config_from_pretrained, monkeypatch):
     def test_from_dict_no_default_parameters(self, mock_auto_config_from_pretrained, monkeypatch):
         mock_auto_config_from_pretrained.return_value = MagicMock(label2id={"en": 0, "de": 1})
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.transformers_text_router.TransformersTextRouter",
             "init_parameters": {"model": "papluca/xlm-roberta-base-language-detection"},
@@ -105,6 +107,7 @@ def test_from_dict_no_default_parameters(self, mock_auto_config_from_pretrained,
     def test_from_dict_with_cpu_device(self, mock_auto_config_from_pretrained, monkeypatch):
         mock_auto_config_from_pretrained.return_value = MagicMock(label2id={"en": 0, "de": 1})
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.transformers_text_router.TransformersTextRouter",
             "init_parameters": {
test/components/routers/test_zero_shot_text_router.py (2 changes: 2 additions & 0 deletions)
@@ -28,6 +28,7 @@ def test_to_dict(self):
 
     def test_from_dict(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.zero_shot_text_router.TransformersZeroShotTextRouter",
             "init_parameters": {
@@ -56,6 +57,7 @@ def test_from_dict(self, monkeypatch):
 
     def test_from_dict_no_default_parameters(self, monkeypatch):
         monkeypatch.delenv("HF_API_TOKEN", raising=False)
+        monkeypatch.delenv("HF_TOKEN", raising=False)
         data = {
             "type": "haystack.components.routers.zero_shot_text_router.TransformersZeroShotTextRouter",
             "init_parameters": {"labels": ["query", "passage"]},
