diff --git a/.github/ISSUE_TEMPLATE/new-integration-proposal.md b/.github/ISSUE_TEMPLATE/new-integration-proposal.md index 21964186c..1ba3a8bd3 100644 --- a/.github/ISSUE_TEMPLATE/new-integration-proposal.md +++ b/.github/ISSUE_TEMPLATE/new-integration-proposal.md @@ -2,7 +2,7 @@ name: New Integration Proposal about: Track the creation process for a new integration title: '' -labels: New integration request +labels: new integration assignees: '' --- @@ -30,4 +30,4 @@ If the request is accepted, ensure the following checklist is complete before cl - [ ] An integration tile has been added to https://github.com/deepset-ai/haystack-integrations - [ ] The integration has been listed in the [Inventory section](https://github.com/deepset-ai/haystack-core-integrations#inventory) of this repo README - [ ] There is an example available to demonstrate the feature -- [ ] The feature was announced through social media \ No newline at end of file +- [ ] The feature was announced through social media diff --git a/.github/labeler.yml b/.github/labeler.yml index 1a41c2caf..151deead6 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -16,7 +16,7 @@ integration:elasticsearch: integration:google-vertex: - changed-files: - - any-glob-to-any-file: "integrations/google-vertex/**/*" + - any-glob-to-any-file: "integrations/google_vertex/**/*" - any-glob-to-any-file: ".github/workflows/google_vertex.yml" integration:gradient: @@ -26,7 +26,7 @@ integration:gradient: integration:instructor-embedders: - changed-files: - - any-glob-to-any-file: "integrations/instructor-embedders/**/*" + - any-glob-to-any-file: "integrations/instructor_embedders/**/*" - any-glob-to-any-file: ".github/workflows/instructor_embedders.yml" integration:jina: diff --git a/.github/workflows/CI_check_integration_format.yml b/.github/workflows/CI_check_integration_format.yml new file mode 100644 index 000000000..92e7310b4 --- /dev/null +++ b/.github/workflows/CI_check_integration_format.yml @@ -0,0 +1,16 @@ 
+name: Core / Check Integration Format +on: +- pull_request + +jobs: + check: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Ensure no hyphens + run: | + set +e + find -name "*-*" -type d -maxdepth 2 | grep integrations + test "$?" -eq 1 && exit 0 || echo "::error::Names of folders in ./integrations must not contain hyphens" && exit 1 diff --git a/.github/workflows/labeler.yml b/.github/workflows/CI_labeler.yml similarity index 91% rename from .github/workflows/labeler.yml rename to .github/workflows/CI_labeler.yml index 2af558297..bed28b1b2 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/CI_labeler.yml @@ -1,4 +1,4 @@ -name: "Labeler" +name: Core / Labeler on: - pull_request_target diff --git a/.github/workflows/project.yml b/.github/workflows/CI_project.yml similarity index 88% rename from .github/workflows/project.yml rename to .github/workflows/CI_project.yml index 716c1ff43..59beadd7f 100644 --- a/.github/workflows/project.yml +++ b/.github/workflows/CI_project.yml @@ -1,4 +1,4 @@ -name: Track issues with Github project +name: Core / Add issues to Github project on: issues: diff --git a/.github/workflows/pypi_release.yml b/.github/workflows/CI_pypi_release.yml similarity index 75% rename from .github/workflows/pypi_release.yml rename to .github/workflows/CI_pypi_release.yml index 39e4a580f..57793c588 100644 --- a/.github/workflows/pypi_release.yml +++ b/.github/workflows/CI_pypi_release.yml @@ -1,13 +1,12 @@ -name: Project release on PyPi +name: Core / Project release on PyPi # The pushed tag must be formatted like so: -# * nodes/speech2text-v1.0.0 -# * stores/foo-documentstore-v1.2.3 +# * integrations/<integration_name>-v1.0.0 # -# The first part must be the path of the project being released. -# If we want to release version 1.0.0 of project text2speech -# that lives in path nodes/text2speech we'd have to push a -# nodes/text2speech-v1.0.0 tag.
+# For example, if we want to release version 1.0.99 +# of the google-vertex-haystack integration we'd have to push the tag: +# +# integrations/google_vertex-v1.0.99 on: push: diff --git a/.github/workflows/google_vertex.yml b/.github/workflows/google_vertex.yml index 7be3973bf..cf60d3229 100644 --- a/.github/workflows/google_vertex.yml +++ b/.github/workflows/google_vertex.yml @@ -7,12 +7,12 @@ on: - cron: "0 0 * * *" pull_request: paths: - - "integrations/google-vertex/**" - - ".github/workflows/google-vertex.yml" + - "integrations/google_vertex/**" + - ".github/workflows/google_vertex.yml" defaults: run: - working-directory: integrations/google-vertex + working-directory: integrations/google_vertex concurrency: group: google-vertex-${{ github.head_ref }} diff --git a/.github/workflows/instructor_embedders.yml b/.github/workflows/instructor_embedders.yml index 05ecfb05f..4145408e2 100644 --- a/.github/workflows/instructor_embedders.yml +++ b/.github/workflows/instructor_embedders.yml @@ -5,12 +5,12 @@ on: - cron: "0 0 * * *" pull_request: paths: - - 'integrations/instructor-embedders/**' + - 'integrations/instructor_embedders/**' - '.github/workflows/instructor_embedders.yml' defaults: run: - working-directory: integrations/instructor-embedders + working-directory: integrations/instructor_embedders jobs: test: diff --git a/README.md b/README.md index 352533579..bc9d1b6d1 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,9 @@ deepset-haystack | [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | | [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - 
Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | -| [google-vertex-haystack](integrations/google-vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | +| [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | | [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | -| 
[instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | +| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - 
Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) | diff --git a/integrations/chroma/example/example.py b/integrations/chroma/example/example.py index a6053db1c..a1155b216 100644 --- a/integrations/chroma/example/example.py +++ b/integrations/chroma/example/example.py @@ -3,7 +3,7 @@ from pathlib import Path from haystack import Pipeline -from haystack.components.file_converters import TextFileToDocument +from haystack.components.converters import TextFileToDocument from haystack.components.writers import DocumentWriter from chroma_haystack import ChromaDocumentStore @@ -19,11 +19,11 @@ indexing.add_component("converter", TextFileToDocument()) indexing.add_component("writer", DocumentWriter(document_store)) indexing.connect("converter", "writer") -indexing.run({"converter": {"paths": file_paths}}) +indexing.run({"converter": {"sources": file_paths}}) querying = Pipeline() querying.add_component("retriever", ChromaQueryRetriever(document_store)) results = querying.run({"retriever": {"queries": ["Variable declarations"], "top_k": 3}}) -for d in results["retriever"][0]: - print(d.metadata, d.score) +for d in results["retriever"]["documents"][0]: + print(d.meta, d.score) diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index 8d6a8437e..8af78f05f 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -247,10 +247,6 @@ def _normalize_filters(self, filters: Dict[str, Any]) -> Tuple[List[str], Dict[s # if the list contains multiple items, we need an $or 
chain for v in value: where["$or"].append({field: v}) - elif field == "mime_type": - # Schedule for removal the original key, we're going to change it - keys_to_remove.append(field) - where["_mime_type"] = value for k in keys_to_remove: del filters[k] @@ -310,8 +306,7 @@ def _query_result_to_documents(self, result: QueryResult) -> List[List[Document] # prepare metadata if metadatas := result.get("metadatas"): - document_dict["metadata"] = dict(metadatas[i][j]) - document_dict["mime_type"] = document_dict["metadata"].pop("_mime_type") + document_dict["meta"] = dict(metadatas[i][j]) if embeddings := result.get("embeddings"): document_dict["embedding"] = np.array(embeddings[i][j]) diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py index b552a7e06..568e78ac5 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -263,7 +263,7 @@ def _bm25_retrieval( "query": query, "fuzziness": fuzziness, "type": "most_fields", - "operator": "AND", + "operator": "OR", } } ] diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index fbc850182..906a023da 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -182,6 +182,33 @@ def test_bm25_retrieval_with_fuzziness(self, document_store: ElasticsearchDocume assert "functional" in res[1].content assert "functional" in res[2].content + def test_bm25_not_all_terms_must_match(self, document_store: ElasticsearchDocumentStore): + """ + Test that not all terms must mandatorily match for BM25 retrieval to return a result. 
+ """ + documents = [ + Document(id=1, content="There are over 7,000 languages spoken around the world today."), + Document( + id=2, + content=( + "Elephants have been observed to behave in a way that indicates a high level of self-awareness" + " such as recognizing themselves in mirrors." + ), + ), + Document( + id=3, + content=( + "In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness" + " the phenomenon of bioluminescent waves." + ), + ), + ] + document_store.write_documents(documents) + + res = document_store._bm25_retrieval("How much self awareness do elephants have?", top_k=3) + assert len(res) == 1 + assert res[0].id == 2 + def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore): docs = [ Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), diff --git a/integrations/google-vertex/LICENSE.txt b/integrations/google_vertex/LICENSE.txt similarity index 100% rename from integrations/google-vertex/LICENSE.txt rename to integrations/google_vertex/LICENSE.txt diff --git a/integrations/google-vertex/README.md b/integrations/google_vertex/README.md similarity index 100% rename from integrations/google-vertex/README.md rename to integrations/google_vertex/README.md diff --git a/integrations/google-vertex/pyproject.toml b/integrations/google_vertex/pyproject.toml similarity index 92% rename from integrations/google-vertex/pyproject.toml rename to integrations/google_vertex/pyproject.toml index b773742e2..1d15a4270 100644 --- a/integrations/google-vertex/pyproject.toml +++ b/integrations/google_vertex/pyproject.toml @@ -29,17 +29,17 @@ dependencies = [ ] [project.urls] -Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google-vertex#readme" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_vertex#readme" Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" -Source = 
"https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google-vertex" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/google_vertex" [tool.hatch.version] source = "vcs" -tag-pattern = 'integrations\/google-vertex-v(?P<version>.*)' +tag-pattern = 'integrations\/google_vertex-v(?P<version>.*)' [tool.hatch.version.raw-options] root = "../.." -git_describe_command = 'git describe --tags --match="integrations/google-vertex-v[0-9]*"' +git_describe_command = 'git describe --tags --match="integrations/google_vertex-v[0-9]*"' [tool.hatch.envs.default] dependencies = [ @@ -150,8 +150,8 @@ omit = [ ] [tool.coverage.paths] -google_vertex_haystack = ["src/google_vertex_haystack", "*/google-vertex-haystack/src/google_vertex_haystack"] -tests = ["tests", "*/google-vertex-haystack/tests"] +google_vertex_haystack = ["src/google_vertex_haystack", "*/google_vertex/src/google_vertex_haystack"] +tests = ["tests", "*/google_vertex_haystack/tests"] [tool.coverage.report] exclude_lines = [ diff --git a/integrations/google-vertex/src/google_vertex_haystack/__init__.py b/integrations/google_vertex/src/google_vertex_haystack/__init__.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/__init__.py rename to integrations/google_vertex/src/google_vertex_haystack/__init__.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/__init__.py b/integrations/google_vertex/src/google_vertex_haystack/generators/__init__.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/__init__.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/__init__.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/captioner.py b/integrations/google_vertex/src/google_vertex_haystack/generators/captioner.py similarity index 100% rename from
integrations/google-vertex/src/google_vertex_haystack/generators/captioner.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/captioner.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/chat/gemini.py b/integrations/google_vertex/src/google_vertex_haystack/generators/chat/gemini.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/chat/gemini.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/chat/gemini.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/code_generator.py b/integrations/google_vertex/src/google_vertex_haystack/generators/code_generator.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/code_generator.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/code_generator.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/gemini.py b/integrations/google_vertex/src/google_vertex_haystack/generators/gemini.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/gemini.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/gemini.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/image_generator.py b/integrations/google_vertex/src/google_vertex_haystack/generators/image_generator.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/image_generator.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/image_generator.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/question_answering.py b/integrations/google_vertex/src/google_vertex_haystack/generators/question_answering.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/question_answering.py 
rename to integrations/google_vertex/src/google_vertex_haystack/generators/question_answering.py diff --git a/integrations/google-vertex/src/google_vertex_haystack/generators/text_generator.py b/integrations/google_vertex/src/google_vertex_haystack/generators/text_generator.py similarity index 100% rename from integrations/google-vertex/src/google_vertex_haystack/generators/text_generator.py rename to integrations/google_vertex/src/google_vertex_haystack/generators/text_generator.py diff --git a/integrations/google-vertex/tests/__init__.py b/integrations/google_vertex/tests/__init__.py similarity index 100% rename from integrations/google-vertex/tests/__init__.py rename to integrations/google_vertex/tests/__init__.py diff --git a/integrations/google-vertex/tests/test_captioner.py b/integrations/google_vertex/tests/test_captioner.py similarity index 100% rename from integrations/google-vertex/tests/test_captioner.py rename to integrations/google_vertex/tests/test_captioner.py diff --git a/integrations/google-vertex/tests/test_code_generator.py b/integrations/google_vertex/tests/test_code_generator.py similarity index 100% rename from integrations/google-vertex/tests/test_code_generator.py rename to integrations/google_vertex/tests/test_code_generator.py diff --git a/integrations/google-vertex/tests/test_image_generator.py b/integrations/google_vertex/tests/test_image_generator.py similarity index 100% rename from integrations/google-vertex/tests/test_image_generator.py rename to integrations/google_vertex/tests/test_image_generator.py diff --git a/integrations/google-vertex/tests/test_question_answering.py b/integrations/google_vertex/tests/test_question_answering.py similarity index 100% rename from integrations/google-vertex/tests/test_question_answering.py rename to integrations/google_vertex/tests/test_question_answering.py diff --git a/integrations/google-vertex/tests/test_text_generator.py b/integrations/google_vertex/tests/test_text_generator.py similarity 
index 100% rename from integrations/google-vertex/tests/test_text_generator.py rename to integrations/google_vertex/tests/test_text_generator.py diff --git a/integrations/gradient/pyproject.toml b/integrations/gradient/pyproject.toml index 5a0930cc4..8b7ca65c4 100644 --- a/integrations/gradient/pyproject.toml +++ b/integrations/gradient/pyproject.toml @@ -7,11 +7,13 @@ name = "gradient-haystack" dynamic = ["version"] description = '' readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.8" license = "Apache-2.0" keywords = [] authors = [ - { name = "Mateusz Haligowski", email = "mhaligowski@gmail.com" }, + { name = "Mateusz Haligowski", email = "contact@gradient.ai" }, + { name = "Michael Feil", email = "contact@gradient.ai" }, + { name = "Hayden Wilson", email = "contact@gradient.ai" }, ] classifiers = [ "Development Status :: 4 - Beta", @@ -26,8 +28,9 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "gradientai", + "gradientai>=1.4.0", ] +optional-dependencies = { tqdm = ["tqdm"] } [project.urls] Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/gradient#readme" diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py b/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py index a716b8cf2..551aa9dd5 100644 --- a/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py +++ b/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py @@ -4,9 +4,20 @@ from gradientai import Gradient from haystack import Document, component, default_to_dict +tqdm_imported: bool = True +try: + from tqdm import tqdm +except ImportError: + tqdm_imported = False + + logger = logging.getLogger(__name__) +def _alt_progress_bar(x: Any) -> Any: + return x + + @component class GradientDocumentEmbedder: """ @@ -31,24 +42,28 @@ def __init__( self, *, model_name: str = "bge-large", - batch_size: int = 100, 
+ batch_size: int = 32_768, access_token: Optional[str] = None, workspace_id: Optional[str] = None, host: Optional[str] = None, + progress_bar: bool = True, ) -> None: """ Create a GradientDocumentEmbedder component. :param model_name: The name of the model to use. + :param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs. :param access_token: The Gradient access token. If not provided it's read from the environment variable GRADIENT_ACCESS_TOKEN. :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment variable GRADIENT_WORKSPACE_ID. :param host: The Gradient host. By default it uses https://api.gradient.ai/. + :param progress_bar: Whether to show a progress bar while embedding the documents. """ self._batch_size = batch_size self._host = host self._model_name = model_name + self._progress_bar = progress_bar self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id) @@ -75,11 +90,17 @@ def _generate_embeddings(self, documents: List[Document], batch_size: int) -> Li """ Batches the documents and generates the embeddings. 
""" - batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)] + if self._progress_bar and tqdm_imported: + batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)] + progress_bar = tqdm + else: + # no progress bar + progress_bar = _alt_progress_bar # type: ignore + batches = [documents] embeddings = [] - for batch in batches: - response = self._embedding_model.generate_embeddings(inputs=[{"input": doc.content} for doc in batch]) + for batch in progress_bar(batches): + response = self._embedding_model.embed(inputs=[{"input": doc.content} for doc in batch]) embeddings.extend([e.embedding for e in response.embeddings]) return embeddings diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py b/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py index 013d375ff..2ddc229ce 100644 --- a/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py +++ b/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py @@ -76,7 +76,7 @@ def run(self, text: str): msg = "The embedding model has not been loaded. Please call warm_up() before running." raise RuntimeError(msg) - result = self._embedding_model.generate_embeddings(inputs=[{"input": text}]) + result = self._embedding_model.embed(inputs=[{"input": text}]) if (not result) or (result.embeddings is None) or (len(result.embeddings) == 0): msg = "The embedding model did not return any embeddings." 
diff --git a/integrations/gradient/tests/test_gradient_document_embedder.py b/integrations/gradient/tests/test_gradient_document_embedder.py index bac02df5e..f78e19f1f 100644 --- a/integrations/gradient/tests/test_gradient_document_embedder.py +++ b/integrations/gradient/tests/test_gradient_document_embedder.py @@ -89,7 +89,7 @@ def test_run_fail_if_not_warmed_up(self): def test_run(self): embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) embedder._embedding_model = NonCallableMagicMock() - embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embedder._embedding_model.embed.return_value = GenerateEmbeddingSuccess( embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": i} for i in range(5)] ) @@ -97,7 +97,7 @@ def test_run(self): result = embedder.run(documents=documents) - assert embedder._embedding_model.generate_embeddings.call_count == 1 + assert embedder._embedding_model.embed.call_count == 1 assert isinstance(result["documents"], list) assert len(result["documents"]) == len(documents) for doc in result["documents"]: @@ -110,7 +110,7 @@ def test_run_batch(self): embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) embedder._embedding_model = NonCallableMagicMock() - embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embedder._embedding_model.embed.return_value = GenerateEmbeddingSuccess( embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": i} for i in range(110)] ) @@ -118,7 +118,7 @@ def test_run_batch(self): result = embedder.run(documents=documents) - assert embedder._embedding_model.generate_embeddings.call_count == 2 + assert embedder._embedding_model.embed.call_count == 1 assert isinstance(result["documents"], list) assert len(result["documents"]) == len(documents) for doc in result["documents"]: @@ -132,7 +132,7 @@ def test_run_custom_batch(self): embedder._embedding_model = 
NonCallableMagicMock() document_count = 101 - embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embedder._embedding_model.embed.return_value = GenerateEmbeddingSuccess( embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": i} for i in range(document_count)] ) @@ -140,7 +140,7 @@ def test_run_custom_batch(self): result = embedder.run(documents=documents) - assert embedder._embedding_model.generate_embeddings.call_count == 6 + assert embedder._embedding_model.embed.call_count == 6 assert isinstance(result["documents"], list) assert len(result["documents"]) == len(documents) for doc in result["documents"]: diff --git a/integrations/gradient/tests/test_gradient_text_embedder.py b/integrations/gradient/tests/test_gradient_text_embedder.py index 9623db5d4..98cb1d0fb 100644 --- a/integrations/gradient/tests/test_gradient_text_embedder.py +++ b/integrations/gradient/tests/test_gradient_text_embedder.py @@ -88,24 +88,22 @@ def test_run_fail_if_not_warmed_up(self): def test_run_fail_when_no_embeddings_returned(self): embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) embedder._embedding_model = NonCallableMagicMock() - embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess(embeddings=[]) + embedder._embedding_model.embed.return_value = GenerateEmbeddingSuccess(embeddings=[]) with pytest.raises(RuntimeError): _result = embedder.run(text="The food was delicious") - embedder._embedding_model.generate_embeddings.assert_called_once_with( - inputs=[{"input": "The food was delicious"}] - ) + embedder._embedding_model.embed.assert_called_once_with(inputs=[{"input": "The food was delicious"}]) @pytest.mark.unit def test_run_empty_string(self): embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) embedder._embedding_model = NonCallableMagicMock() - embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + 
embedder._embedding_model.embed.return_value = GenerateEmbeddingSuccess( embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": 0}] ) result = embedder.run(text="") - embedder._embedding_model.generate_embeddings.assert_called_once_with(inputs=[{"input": ""}]) + embedder._embedding_model.embed.assert_called_once_with(inputs=[{"input": ""}]) assert len(result["embedding"]) == 1024 # 1024 is the bge-large embedding size assert all(isinstance(x, float) for x in result["embedding"]) @@ -114,14 +112,12 @@ def test_run_empty_string(self): def test_run(self): embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) embedder._embedding_model = NonCallableMagicMock() - embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embedder._embedding_model.embed.return_value = GenerateEmbeddingSuccess( embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": 0}] ) result = embedder.run(text="The food was delicious") - embedder._embedding_model.generate_embeddings.assert_called_once_with( - inputs=[{"input": "The food was delicious"}] - ) + embedder._embedding_model.embed.assert_called_once_with(inputs=[{"input": "The food was delicious"}]) assert len(result["embedding"]) == 1024 # 1024 is the bge-large embedding size assert all(isinstance(x, float) for x in result["embedding"]) diff --git a/integrations/instructor-embedders/LICENSE.txt b/integrations/instructor_embedders/LICENSE.txt similarity index 100% rename from integrations/instructor-embedders/LICENSE.txt rename to integrations/instructor_embedders/LICENSE.txt diff --git a/integrations/instructor-embedders/README.md b/integrations/instructor_embedders/README.md similarity index 100% rename from integrations/instructor-embedders/README.md rename to integrations/instructor_embedders/README.md diff --git a/integrations/instructor-embedders/instructor_embedders_haystack/__init__.py 
b/integrations/instructor_embedders/instructor_embedders_haystack/__init__.py similarity index 100% rename from integrations/instructor-embedders/instructor_embedders_haystack/__init__.py rename to integrations/instructor_embedders/instructor_embedders_haystack/__init__.py diff --git a/integrations/instructor-embedders/instructor_embedders_haystack/embedding_backend/__init__.py b/integrations/instructor_embedders/instructor_embedders_haystack/embedding_backend/__init__.py similarity index 100% rename from integrations/instructor-embedders/instructor_embedders_haystack/embedding_backend/__init__.py rename to integrations/instructor_embedders/instructor_embedders_haystack/embedding_backend/__init__.py diff --git a/integrations/instructor-embedders/instructor_embedders_haystack/embedding_backend/instructor_backend.py b/integrations/instructor_embedders/instructor_embedders_haystack/embedding_backend/instructor_backend.py similarity index 100% rename from integrations/instructor-embedders/instructor_embedders_haystack/embedding_backend/instructor_backend.py rename to integrations/instructor_embedders/instructor_embedders_haystack/embedding_backend/instructor_backend.py diff --git a/integrations/instructor-embedders/instructor_embedders_haystack/instructor_document_embedder.py b/integrations/instructor_embedders/instructor_embedders_haystack/instructor_document_embedder.py similarity index 100% rename from integrations/instructor-embedders/instructor_embedders_haystack/instructor_document_embedder.py rename to integrations/instructor_embedders/instructor_embedders_haystack/instructor_document_embedder.py diff --git a/integrations/instructor-embedders/instructor_embedders_haystack/instructor_text_embedder.py b/integrations/instructor_embedders/instructor_embedders_haystack/instructor_text_embedder.py similarity index 100% rename from integrations/instructor-embedders/instructor_embedders_haystack/instructor_text_embedder.py rename to 
integrations/instructor_embedders/instructor_embedders_haystack/instructor_text_embedder.py diff --git a/integrations/instructor-embedders/pyproject.toml b/integrations/instructor_embedders/pyproject.toml similarity index 95% rename from integrations/instructor-embedders/pyproject.toml rename to integrations/instructor_embedders/pyproject.toml index 31834c259..63fb9703b 100644 --- a/integrations/instructor-embedders/pyproject.toml +++ b/integrations/instructor_embedders/pyproject.toml @@ -50,17 +50,17 @@ dependencies = [ dev = ["pytest"] [project.urls] -Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/components/instructor-embedders#readme" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/instructor_embedders#readme" Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" -Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/components/instructor-embedders" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/instructor_embedders" [tool.hatch.version] source = "vcs" -tag-pattern = 'integrations\/instructor-embedders-v(?P.*)' +tag-pattern = 'integrations\/instructor_embedders-v(?P.*)' [tool.hatch.version.raw-options] root = "../.." 
-git_describe_command = 'git describe --tags --match="integrations/instructor-embedders-v[0-9]*"' +git_describe_command = 'git describe --tags --match="integrations/instructor_embedders-v[0-9]*"' [tool.hatch.envs.default] dependencies = ["pytest", "pytest-cov"] diff --git a/integrations/instructor-embedders/tests/__init__.py b/integrations/instructor_embedders/tests/__init__.py similarity index 100% rename from integrations/instructor-embedders/tests/__init__.py rename to integrations/instructor_embedders/tests/__init__.py diff --git a/integrations/instructor-embedders/tests/test_instructor_backend.py b/integrations/instructor_embedders/tests/test_instructor_backend.py similarity index 100% rename from integrations/instructor-embedders/tests/test_instructor_backend.py rename to integrations/instructor_embedders/tests/test_instructor_backend.py diff --git a/integrations/instructor-embedders/tests/test_instructor_document_embedder.py b/integrations/instructor_embedders/tests/test_instructor_document_embedder.py similarity index 100% rename from integrations/instructor-embedders/tests/test_instructor_document_embedder.py rename to integrations/instructor_embedders/tests/test_instructor_document_embedder.py diff --git a/integrations/instructor-embedders/tests/test_instructor_text_embedder.py b/integrations/instructor_embedders/tests/test_instructor_text_embedder.py similarity index 100% rename from integrations/instructor-embedders/tests/test_instructor_text_embedder.py rename to integrations/instructor_embedders/tests/test_instructor_text_embedder.py diff --git a/integrations/pinecone/src/pinecone_haystack/dense_retriever.py b/integrations/pinecone/src/pinecone_haystack/dense_retriever.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index e76cdf1f8..5c9b32698 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ 
b/integrations/pinecone/tests/test_document_store.py @@ -106,6 +106,7 @@ def test_embedding_retrieval(self, document_store: PineconeDocumentStore): ] document_store.write_documents(docs) + results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) assert len(results) == 2 assert results[0].content == "Most similar document"