Commit b69f747

Merge branch 'main' into pinecone-filters

anakin87 committed Dec 22, 2023
2 parents b9f9571 + fbdb9a0

Showing 50 changed files with 123 additions and 65 deletions.
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/new-integration-proposal.md
@@ -2,7 +2,7 @@
name: New Integration Proposal
about: Track the creation process for a new integration
title: ''
labels: New integration request
labels: new integration
assignees: ''

---
@@ -30,4 +30,4 @@ If the request is accepted, ensure the following checklist is complete before cl
- [ ] An integration tile has been added to https://github.com/deepset-ai/haystack-integrations
- [ ] The integration has been listed in the [Inventory section](https://github.com/deepset-ai/haystack-core-integrations#inventory) of this repo README
- [ ] There is an example available to demonstrate the feature
- [ ] The feature was announced through social media
- [ ] The feature was announced through social media
4 changes: 2 additions & 2 deletions .github/labeler.yml
@@ -16,7 +16,7 @@ integration:elasticsearch:

integration:google-vertex:
- changed-files:
- any-glob-to-any-file: "integrations/google-vertex/**/*"
- any-glob-to-any-file: "integrations/google_vertex/**/*"
- any-glob-to-any-file: ".github/workflows/google_vertex.yml"

integration:gradient:
@@ -26,7 +26,7 @@ integration:gradient:

integration:instructor-embedders:
- changed-files:
- any-glob-to-any-file: "integrations/instructor-embedders/**/*"
- any-glob-to-any-file: "integrations/instructor_embedders/**/*"
- any-glob-to-any-file: ".github/workflows/instructor_embedders.yml"

integration:jina:
16 changes: 16 additions & 0 deletions .github/workflows/CI_check_integration_format.yml
@@ -0,0 +1,16 @@
name: Core / Check Integration Format
on:
- pull_request

jobs:
check:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Ensure no hyphens
run: |
set +e
find -name "*-*" -type d -maxdepth 2 | grep integrations
test "$?" -eq 1 && exit 0 || echo "::error::Names of folders in ./integrations must not contain hyphens" && exit 1
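Note: the check above hinges on grep's exit status. grep exits 1 when it finds no hyphenated directory name under ./integrations, which the final test turns into success; any match fails the job with the error annotation. A rough Python sketch of the same rule, assuming it is run from the repository root:

from pathlib import Path

# Fail when any directory directly under ./integrations has a hyphen in its name.
offending = [p.name for p in Path("integrations").iterdir() if p.is_dir() and "-" in p.name]
if offending:
    raise SystemExit(f"Names of folders in ./integrations must not contain hyphens: {offending}")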
@@ -1,4 +1,4 @@
name: "Labeler"
name: Core / Labeler
on:
- pull_request_target

@@ -1,4 +1,4 @@
name: Track issues with Github project
name: Core / Add issues to Github project

on:
issues:
@@ -1,13 +1,12 @@
name: Project release on PyPi
name: Core / Project release on PyPi

# The pushed tag must be formatted like so:
# * nodes/speech2text-v1.0.0
# * stores/foo-documentstore-v1.2.3
# * integrations/<INTEGRATION_FOLDER_NAME>-v1.0.0
#
# The first part must be the path of the project being released.
# If we want to release version 1.0.0 of project text2speech
# that lives in path nodes/text2speech we'd have to push a
# nodes/text2speech-v1.0.0 tag.
# For example, if we want to release version 1.0.99
# of the google-vertex-haystack integration we'd have to push the tag:
#
# integrations/google_vertex-v1.0.99

on:
push:
6 changes: 3 additions & 3 deletions .github/workflows/google_vertex.yml
@@ -7,12 +7,12 @@ on:
- cron: "0 0 * * *"
pull_request:
paths:
- "integrations/google-vertex/**"
- ".github/workflows/google-vertex.yml"
- "integrations/google_vertex/**"
- ".github/workflows/google_vertex.yml"

defaults:
run:
working-directory: integrations/google-vertex
working-directory: integrations/google_vertex

concurrency:
group: google-vertex-${{ github.head_ref }}
4 changes: 2 additions & 2 deletions .github/workflows/instructor_embedders.yml
@@ -5,12 +5,12 @@ on:
- cron: "0 0 * * *"
pull_request:
paths:
- 'integrations/instructor-embedders/**'
- 'integrations/instructor_embedders/**'
- '.github/workflows/instructor_embedders.yml'

defaults:
run:
working-directory: integrations/instructor-embedders
working-directory: integrations/instructor_embedders

jobs:
test:
4 changes: 2 additions & 2 deletions README.md
@@ -65,9 +65,9 @@ deepset-haystack
| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) |
| [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) |
| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) |
| [google-vertex-haystack](integrations/google-vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) |
| [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) |
| [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) |
| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) |
| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) |
| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) |
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) |
8 changes: 4 additions & 4 deletions integrations/chroma/example/example.py
@@ -3,7 +3,7 @@
from pathlib import Path

from haystack import Pipeline
from haystack.components.file_converters import TextFileToDocument
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter

from chroma_haystack import ChromaDocumentStore
@@ -19,11 +19,11 @@
indexing.add_component("converter", TextFileToDocument())
indexing.add_component("writer", DocumentWriter(document_store))
indexing.connect("converter", "writer")
indexing.run({"converter": {"paths": file_paths}})
indexing.run({"converter": {"sources": file_paths}})

querying = Pipeline()
querying.add_component("retriever", ChromaQueryRetriever(document_store))
results = querying.run({"retriever": {"queries": ["Variable declarations"], "top_k": 3}})

for d in results["retriever"][0]:
print(d.metadata, d.score)
for d in results["retriever"]["documents"][0]:
print(d.meta, d.score)
7 changes: 1 addition & 6 deletions integrations/chroma/src/chroma_haystack/document_store.py
@@ -247,10 +247,6 @@ def _normalize_filters(self, filters: Dict[str, Any]) -> Tuple[List[str], Dict[s
# if the list contains multiple items, we need an $or chain
for v in value:
where["$or"].append({field: v})
elif field == "mime_type":
# Schedule for removal the original key, we're going to change it
keys_to_remove.append(field)
where["_mime_type"] = value

for k in keys_to_remove:
del filters[k]
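For context, the loop shown above turns a filter whose value is a list into a Chroma $or clause. A hedged sketch of that behaviour, using a hypothetical field name:

filters = {"author": ["alice", "bob"]}  # hypothetical input filter with a list value
# After normalization, the Chroma "where" clause carries an $or chain over the same field:
where = {"$or": [{"author": "alice"}, {"author": "bob"}]}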
@@ -310,8 +306,7 @@ def _query_result_to_documents(self, result: QueryResult) -> List[List[Document]

# prepare metadata
if metadatas := result.get("metadatas"):
document_dict["metadata"] = dict(metadatas[i][j])
document_dict["mime_type"] = document_dict["metadata"].pop("_mime_type")
document_dict["meta"] = dict(metadatas[i][j])

if embeddings := result.get("embeddings"):
document_dict["embedding"] = np.array(embeddings[i][j])
@@ -263,7 +263,7 @@ def _bm25_retrieval(
"query": query,
"fuzziness": fuzziness,
"type": "most_fields",
"operator": "AND",
"operator": "OR",
}
}
]
27 changes: 27 additions & 0 deletions integrations/elasticsearch/tests/test_document_store.py
@@ -182,6 +182,33 @@ def test_bm25_retrieval_with_fuzziness(self, document_store: ElasticsearchDocume
assert "functional" in res[1].content
assert "functional" in res[2].content

def test_bm25_not_all_terms_must_match(self, document_store: ElasticsearchDocumentStore):
"""
Test that not all terms must mandatorily match for BM25 retrieval to return a result.
"""
documents = [
Document(id=1, content="There are over 7,000 languages spoken around the world today."),
Document(
id=2,
content=(
"Elephants have been observed to behave in a way that indicates a high level of self-awareness"
" such as recognizing themselves in mirrors."
),
),
Document(
id=3,
content=(
"In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness"
" the phenomenon of bioluminescent waves."
),
),
]
document_store.write_documents(documents)

res = document_store._bm25_retrieval("How much self awareness do elephants have?", top_k=3)
assert len(res) == 1
assert res[0].id == 2

def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore):
docs = [
Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]),
File renamed without changes.
File renamed without changes.
@@ -29,17 +29,17 @@ dependencies = [
]

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google-vertex#readme"
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_vertex#readme"
Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google-vertex"
Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_vertex"

[tool.hatch.version]
source = "vcs"
tag-pattern = 'integrations\/google-vertex-v(?P<version>.*)'
tag-pattern = 'integrations\/google_vertex-v(?P<version>.*)'

[tool.hatch.version.raw-options]
root = "../.."
git_describe_command = 'git describe --tags --match="integrations/google-vertex-v[0-9]*"'
git_describe_command = 'git describe --tags --match="integrations/google_vertex-v[0-9]*"'

[tool.hatch.envs.default]
dependencies = [
@@ -150,8 +150,8 @@ omit = [
]

[tool.coverage.paths]
google_vertex_haystack = ["src/google_vertex_haystack", "*/google-vertex-haystack/src/google_vertex_haystack"]
tests = ["tests", "*/google-vertex-haystack/tests"]
google_vertex_haystack = ["src/google_vertex_haystack", "*/google_vertex/src/google_vertex_haystack"]
tests = ["tests", "*/google_vertex_haystack/tests"]

[tool.coverage.report]
exclude_lines = [
9 changes: 6 additions & 3 deletions integrations/gradient/pyproject.toml
@@ -7,11 +7,13 @@ name = "gradient-haystack"
dynamic = ["version"]
description = ''
readme = "README.md"
requires-python = ">=3.7"
requires-python = ">=3.8"
license = "Apache-2.0"
keywords = []
authors = [
{ name = "Mateusz Haligowski", email = "[email protected]" },
{ name = "Mateusz Haligowski", email = "[email protected]" },
{ name = "Michael Feil", email = "[email protected]" },
{ name = "Hayden Wilson", email = "[email protected]" },
]
classifiers = [
"Development Status :: 4 - Beta",
@@ -26,8 +28,9 @@ classifiers = [
]
dependencies = [
"haystack-ai",
"gradientai",
"gradientai>=1.4.0",
]
optional-dependencies = { tqdm = ["tqdm"] }

[project.urls]
Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/gradient#readme"
@@ -4,9 +4,20 @@
from gradientai import Gradient
from haystack import Document, component, default_to_dict

tqdm_imported: bool = True
try:
from tqdm import tqdm
except ImportError:
tqdm_imported = False


logger = logging.getLogger(__name__)


def _alt_progress_bar(x: Any) -> Any:
return x


@component
class GradientDocumentEmbedder:
"""
@@ -31,24 +42,28 @@ def __init__(
self,
*,
model_name: str = "bge-large",
batch_size: int = 100,
batch_size: int = 32_768,
access_token: Optional[str] = None,
workspace_id: Optional[str] = None,
host: Optional[str] = None,
progress_bar: bool = True,
) -> None:
"""
Create a GradientDocumentEmbedder component.
:param model_name: The name of the model to use.
:param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs.
:param access_token: The Gradient access token. If not provided it's read from the environment
variable GRADIENT_ACCESS_TOKEN.
:param workspace_id: The Gradient workspace ID. If not provided it's read from the environment
variable GRADIENT_WORKSPACE_ID.
:param host: The Gradient host. By default it uses https://api.gradient.ai/.
:param progress_bar: Whether to show a progress bar while embedding the documents.
"""
self._batch_size = batch_size
self._host = host
self._model_name = model_name
self._progress_bar = progress_bar

self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id)

@@ -75,11 +90,17 @@ def _generate_embeddings(self, documents: List[Document], batch_size: int) -> Li
"""
Batches the documents and generates the embeddings.
"""
batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)]
if self._progress_bar and tqdm_imported:
batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)]
progress_bar = tqdm
else:
# no progress bar
progress_bar = _alt_progress_bar # type: ignore
batches = [documents]

embeddings = []
for batch in batches:
response = self._embedding_model.generate_embeddings(inputs=[{"input": doc.content} for doc in batch])
for batch in progress_bar(batches):
response = self._embedding_model.embed(inputs=[{"input": doc.content} for doc in batch])
embeddings.extend([e.embedding for e in response.embeddings])

return embeddings
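For reference, a minimal usage sketch of the document embedder after this change. The import path is hypothetical, the warm_up()/run() calls and the "documents" return key are assumptions based on the surrounding code and standard Haystack 2.x component conventions, and GRADIENT_ACCESS_TOKEN / GRADIENT_WORKSPACE_ID are expected in the environment:

from haystack import Document

# Hypothetical import path; adjust to the actual package layout.
from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder

embedder = GradientDocumentEmbedder(model_name="bge-large", progress_bar=True)
embedder.warm_up()  # assumed to initialize the embedding model, mirroring the text embedder below

result = embedder.run(documents=[Document(content="Gradient hosts embedding models as a service.")])
print(result["documents"][0].embedding[:5])  # assumption: embedded Documents come back under "documents"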
@@ -76,7 +76,7 @@ def run(self, text: str):
msg = "The embedding model has not been loaded. Please call warm_up() before running."
raise RuntimeError(msg)

result = self._embedding_model.generate_embeddings(inputs=[{"input": text}])
result = self._embedding_model.embed(inputs=[{"input": text}])

if (not result) or (result.embeddings is None) or (len(result.embeddings) == 0):
msg = "The embedding model did not return any embeddings."
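And a matching sketch for the text embedder, whose run(text=...) signature and warm_up() requirement are visible above; the import path and the shape of the return value are assumptions:

# Hypothetical import path; adjust to the actual package layout.
from gradient_haystack.embedders.gradient_text_embedder import GradientTextEmbedder

embedder = GradientTextEmbedder()  # credentials read from GRADIENT_ACCESS_TOKEN / GRADIENT_WORKSPACE_ID
embedder.warm_up()                 # required before run(), per the RuntimeError raised above

result = embedder.run(text="How many languages are spoken around the world?")
print(result["embedding"][:5])     # assumption: the query embedding is returned under an "embedding" key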