From 8b9416ef572d99406ad5b59f23de463c92ce3822 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 12 Feb 2024 18:16:47 +0100 Subject: [PATCH 01/25] initial implementation --- integrations/mongodb_atlas/LICENSE.txt | 73 +++++++ integrations/mongodb_atlas/README.md | 31 +++ .../mongodb_atlas/examples/example.py | 1 + integrations/mongodb_atlas/pydoc/config.yml | 29 +++ integrations/mongodb_atlas/pyproject.toml | 182 +++++++++++++++++ .../document_stores/mongodb_atlas/__init__.py | 6 + .../mongodb_atlas/document_store.py | 189 ++++++++++++++++++ .../document_stores/mongodb_atlas/errors.py | 8 + .../document_stores/mongodb_atlas/filters.py | 7 + integrations/mongodb_atlas/tests/__init__.py | 3 + .../tests/test_document_store.py | 47 +++++ 11 files changed, 576 insertions(+) create mode 100644 integrations/mongodb_atlas/LICENSE.txt create mode 100644 integrations/mongodb_atlas/README.md create mode 100644 integrations/mongodb_atlas/examples/example.py create mode 100644 integrations/mongodb_atlas/pydoc/config.yml create mode 100644 integrations/mongodb_atlas/pyproject.toml create mode 100644 integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/__init__.py create mode 100644 integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py create mode 100644 integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py create mode 100644 integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py create mode 100644 integrations/mongodb_atlas/tests/__init__.py create mode 100644 integrations/mongodb_atlas/tests/test_document_store.py diff --git a/integrations/mongodb_atlas/LICENSE.txt b/integrations/mongodb_atlas/LICENSE.txt new file mode 100644 index 000000000..137069b82 --- /dev/null +++ b/integrations/mongodb_atlas/LICENSE.txt @@ -0,0 +1,73 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS 
FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed 
as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/integrations/mongodb_atlas/README.md b/integrations/mongodb_atlas/README.md new file mode 100644 index 000000000..fa338618c --- /dev/null +++ b/integrations/mongodb_atlas/README.md @@ -0,0 +1,31 @@ +# mongodb-atlas-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) + +--- + +**Table of Contents** + +- [mongodb-atlas-haystack](#mongodb-atlas-haystack) + - [Installation](#installation) + - [Testing](#testing) + - [License](#license) + +## Installation + +```console +pip install mongodb-atlas-haystack +``` + +## Testing + +TODO + +```console +hatch run test +``` + +## License + +`mongodb-atlas-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/mongodb_atlas/examples/example.py b/integrations/mongodb_atlas/examples/example.py new file mode 100644 index 000000000..f87f5c14c --- /dev/null +++ b/integrations/mongodb_atlas/examples/example.py @@ -0,0 +1 @@ +# TODO \ No newline at end of file diff --git a/integrations/mongodb_atlas/pydoc/config.yml b/integrations/mongodb_atlas/pydoc/config.yml new file mode 100644 index 000000000..d7635df9b --- /dev/null +++ b/integrations/mongodb_atlas/pydoc/config.yml @@ -0,0 +1,29 @@ +loaders: + - type: haystack_pydoc_tools.loaders.CustomPythonLoader + search_path: [../src] + modules: [ + "haystack_integrations.document_stores.mongodb_atlas.document_store", + "haystack_integrations.document_stores.mongodb_atlas.filters", + ] + ignore_when_discovered: ["__init__"] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer + excerpt: MongoDB 
Atlas integration for Haystack + category_slug: haystack-integrations + title: MongoDB Atlas + slug: integrations-mongodb-atlas + order: 140 + markdown: + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: _readme_mongodb_atlas.md diff --git a/integrations/mongodb_atlas/pyproject.toml b/integrations/mongodb_atlas/pyproject.toml new file mode 100644 index 000000000..3b5a32cad --- /dev/null +++ b/integrations/mongodb_atlas/pyproject.toml @@ -0,0 +1,182 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "mongodb-atlas-haystack" +dynamic = ["version"] +description = "An integration of MongoDB Atlas with Haystack" +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "deepset GmbH", email = "info@deepset.ai" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai", + "pymongo", +] + +[project.urls] +Source = "https://github.com/deepset-ai/haystack-core-integrations" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/mongodb_atlas/README.md" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/mongodb-atlas-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/mongodb-atlas-v[0-9]*"' + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "ipython", + "haystack-pydoc-tools", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] +docs = [ + "pydoc-markdown pydoc/config.yml" +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["src"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] +# examples can contain "print" commands +"examples/**/*" = ["T201"] + +[tool.coverage.run] +source_pkgs = ["src", "tests"] +branch = true +parallel = true + + +[tool.coverage.paths] +tests = ["tests", "*/mongodb-atlas-haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[[tool.mypy.overrides]] +module = [ + "haystack.*", + "haystack_integrations.*", + "mongodb_atlas.*", + "psycopg.*", + "pytest.*" +] +ignore_missing_imports = true diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/__init__.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/__init__.py new file mode 100644 index 000000000..693a229a6 --- /dev/null +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .document_store import MongoDBAtlasDocumentStore + +__all__ = ["MongoDBAtlasDocumentStore"] diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py new file mode 100644 index 000000000..d9eefafdc --- /dev/null +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -0,0 +1,189 @@ +# 
SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict, List, Optional, Union + +import re +import logging + +import pymongo +from pymongo import InsertOne, ReplaceOne, UpdateOne +from pymongo.driver_info import DriverInfo +from haystack import default_to_dict +from haystack.dataclasses.document import Document +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils.filters import convert + +from haystack_integrations.document_stores.mongodb_atlas.filters import haystack_filters_to_mongo + + +logger = logging.getLogger(__name__) + + +METRIC_TYPES = ["euclidean", "cosine", "dotProduct"] + + +class MongoDBAtlasDocumentStore: + def __init__( + self, + mongo_connection_string: str, + database_name: str, + collection_name: str, + vector_search_index: Optional[str] = None, + embedding_dim: int = 768, + similarity: str = "cosine", + embedding_field: str = "embedding", + recreate_index: bool = False, + ): + """ + Creates a new MongoDBAtlasDocumentStore instance. + + This Document Store uses MongoDB Atlas as a backend (https://www.mongodb.com/docs/atlas/getting-started/). + + :param mongo_connection_string: MongoDB Atlas connection string in the format: + "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}". + This can be obtained on the MongoDB Atlas Dashboard by clicking on the `CONNECT` button. + :param database_name: Name of the database to use. + :param collection_name: Name of the collection to use. + :param vector_search_index: The name of the index to use for vector search. To use the search index it must have been created in the Atlas web UI before. None by default. + :param embedding_dim: Dimensionality of embeddings, 768 by default. + :param similarity: The similarity function to use for the embeddings. One of "euclidean", "cosine" or "dotProduct". "cosine" is the default. 
+ :param embedding_field: The name of the field in the document that contains the embedding. + :param recreate_index: Whether to recreate the index when initializing the document store. + """ + if similarity not in METRIC_TYPES: + raise ValueError( + "MongoDB Atlas currently supports dotProduct, cosine and euclidean metrics. Please set similarity to one of the above." + ) + if collection_name and not bool(re.match(r"^[a-zA-Z0-9\-_]+$", collection_name)): + raise ValueError( + f'Invalid collection name: "{collection_name}". Index name can only contain letters, numbers, hyphens, or underscores.' + ) + + self.mongo_connection_string = mongo_connection_string + self.database_name = database_name + self.collection_name = collection_name + self.connection: pymongo.MongoClient = pymongo.MongoClient( + self.mongo_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") + ) + self.database = self.connection[self.database_name] + + self.similarity = similarity + self.embedding_field = embedding_field + self.embedding_dim = embedding_dim + self.index = collection_name + self.recreate_index = recreate_index + self.vector_search_index = vector_search_index + + if self.recreate_index: + self.delete_index() + + # Implicitly create the collection if it doesn't exist + if collection_name not in self.database.list_collection_names(): + self.database.create_collection(self.collection_name) + self._get_collection().create_index("id", unique=True) + + def _create_document_field_map(self) -> Dict: + return {self.embedding_field: "embedding"} + + def _get_collection(self, index=None) -> pymongo.collection.Collection: + """ + Returns the collection named by index or returns the collection specified when the + driver was initialized. 
+ """ + if index is None: + return self.database[self.collection_name] + if not re.match(r"^[a-zA-Z0-9\-_]+$", index): + raise ValueError(f'Invalid index name: "{index}". Allowed: letters, numbers, hyphens, underscores.') + return self.database[index] + + def to_dict(self) -> Dict[str, Any]: + """ + Utility function that serializes this Document Store's configuration into a dictionary. + """ + return default_to_dict( + self, + mongo_connection_string=self.mongo_connection_string, + database_name=self.database_name, + collection_name=self.collection_name, + vector_search_index=self.vector_search_index, + embedding_dim=self.embedding_dim, + similarity=self.similarity, + embedding_field=self.embedding_field, + recreate_index=self.recreate_index, + ) + + def count_documents(self) -> int: + """ + Returns how many documents are present in the document store. + """ + collection = self._get_collection(self.index) + return collection.count_documents({}) + + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + """ + Returns the documents that match the filters provided. + + For a detailed specification of the filters, + refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) + + :param filters: The filters to apply to the document list. + :return: A list of Documents that match the given filters. + """ + mongo_filters = haystack_filters_to_mongo(filters) + collection = self._get_collection(self.index) + documents = collection.find(mongo_filters, {"_id": 0}) + return [Document.from_dict(doc) for doc in documents] + + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: + """ + Writes documents into the MongoDBAtlasDocumentStore. + + :param documents: A list of Documents to write to the document store. + :param policy: The duplicate policy to use when writing documents. + :raises DuplicateDocumentError: If a document with the same id already exists in the document store + and the policy is set to DuplicatePolicy.FAIL (or not specified). 
+ :return: The number of documents written to the document store. + """ + + if len(documents) > 0: + if not isinstance(documents[0], Document): + msg = "param 'documents' must contain a list of objects of type Document" + raise ValueError(msg) + + if policy == DuplicatePolicy.NONE: + policy = DuplicatePolicy.FAIL + + collection = self._get_collection(self.index) + field_map = self._create_document_field_map() + mongo_documents = [doc.to_dict() for doc in documents] + operations: List[Union[UpdateOne, InsertOne, ReplaceOne]] + written_docs = len(documents) + + if policy == DuplicatePolicy.SKIP: + operations = [UpdateOne({"id": doc["id"]}, {"$setOnInsert": doc}, upsert=True) for doc in mongo_documents] + existing_documents = collection.count_documents({"id": {"$in": [doc.id for doc in documents]}}) + written_docs -= existing_documents + elif policy == DuplicatePolicy.FAIL: + operations = [InsertOne(doc) for doc in mongo_documents] + else: + operations = [ReplaceOne({"id": doc["id"]}, upsert=True, replacement=doc) for doc in mongo_documents] + + collection.bulk_write(operations) + + return written_docs + + def delete_documents(self, document_ids: List[str]) -> None: + """ + Deletes all documents with a matching document_ids from the document store. 
+ + :param document_ids: the document ids to delete + """ + if not document_ids: + return + collection = self._get_collection(self.index) + collection.delete_many(filter={"id": {"$in": document_ids}}) + + + + diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py new file mode 100644 index 000000000..a15e69cd1 --- /dev/null +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py @@ -0,0 +1,8 @@ +from typing import Optional + + +class MongoDBAtlasDocumentStoreError(Exception): + """Exception for issues that occur in a MongoDBAtlas document store""" + + def __init__(self, message: Optional[str] = None): + super().__init__(message) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py new file mode 100644 index 000000000..9ad1ff42b --- /dev/null +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py @@ -0,0 +1,7 @@ +import warnings + + +def haystack_filters_to_mongo(filters): + # TODO + warnings.warn("Filtering not yet implemented for MongoDBAtlasDocumentStore!", stacklevel=2) + return {} \ No newline at end of file diff --git a/integrations/mongodb_atlas/tests/__init__.py b/integrations/mongodb_atlas/tests/__init__.py new file mode 100644 index 000000000..e873bc332 --- /dev/null +++ b/integrations/mongodb_atlas/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py new file mode 100644 index 000000000..c3f8240c5 --- /dev/null +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -0,0 +1,47 
@@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import patch + +import pytest +from haystack.dataclasses.document import ByteStream, Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy +from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest +from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore +from pandas import DataFrame + + +class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): + + def test_write_documents(self, document_store: MongoDBAtlasDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, DuplicatePolicy.FAIL) + + def test_write_blob(self, document_store: MongoDBAtlasDocumentStore): + bytestream = ByteStream(b"test", meta={"meta_key": "meta_value"}, mime_type="mime_type") + docs = [Document(id="1", blob=bytestream)] + document_store.write_documents(docs) + retrieved_docs = document_store.filter_documents() + assert retrieved_docs == docs + + def test_write_dataframe(self, document_store: MongoDBAtlasDocumentStore): + dataframe = DataFrame({"col1": [1, 2], "col2": [3, 4]}) + docs = [Document(id="1", dataframe=dataframe)] + document_store.write_documents(docs) + retrieved_docs = document_store.filter_documents() + assert retrieved_docs == docs + + def test_init(self): + document_store = MongoDBAtlasDocumentStore() + + def test_to_dict(self): + document_store = MongoDBAtlasDocumentStore() + + assert document_store.to_dict() == { + "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", + "init_parameters": {}, + } From 93f5ca30135346fce2af4f15aaa75bf9f08570c0 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 
14:07:52 +0100 Subject: [PATCH 02/25] tests are green --- .../mongodb_atlas/examples/example.py | 12 ++++- integrations/mongodb_atlas/pyproject.toml | 2 +- .../mongodb_atlas/document_store.py | 51 ++++++++++--------- .../tests/test_document_store.py | 44 ++++++++++++---- 4 files changed, 72 insertions(+), 37 deletions(-) diff --git a/integrations/mongodb_atlas/examples/example.py b/integrations/mongodb_atlas/examples/example.py index f87f5c14c..5f5077103 100644 --- a/integrations/mongodb_atlas/examples/example.py +++ b/integrations/mongodb_atlas/examples/example.py @@ -1 +1,11 @@ -# TODO \ No newline at end of file +from pymongo.mongo_client import MongoClient +from pymongo.server_api import ServerApi +uri = "mongodb+srv://sarazanzottera:dN3hmY9RNxRDni13@clustertest.gwkckbk.mongodb.net/?retryWrites=true&w=majority" +# Create a new client and connect to the server +client = MongoClient(uri, server_api=ServerApi('1')) +# Send a ping to confirm a successful connection +try: + client.admin.command('ping') + print("Pinged your deployment. 
You successfully connected to MongoDB!") +except Exception as e: + print(e) \ No newline at end of file diff --git a/integrations/mongodb_atlas/pyproject.toml b/integrations/mongodb_atlas/pyproject.toml index 3b5a32cad..9d544cd93 100644 --- a/integrations/mongodb_atlas/pyproject.toml +++ b/integrations/mongodb_atlas/pyproject.toml @@ -26,7 +26,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", - "pymongo", + "pymongo[srv]", ] [project.urls] diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index d9eefafdc..c2d81b8ec 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -6,13 +6,14 @@ import re import logging -import pymongo -from pymongo import InsertOne, ReplaceOne, UpdateOne +from pymongo import InsertOne, ReplaceOne, UpdateOne, MongoClient +from pymongo.collection import Collection from pymongo.driver_info import DriverInfo +from pymongo.errors import BulkWriteError from haystack import default_to_dict from haystack.dataclasses.document import Document from haystack.document_stores.types import DuplicatePolicy -from haystack.utils.filters import convert +from haystack.document_stores.errors import DuplicateDocumentError from haystack_integrations.document_stores.mongodb_atlas.filters import haystack_filters_to_mongo @@ -63,11 +64,6 @@ def __init__( self.mongo_connection_string = mongo_connection_string self.database_name = database_name self.collection_name = collection_name - self.connection: pymongo.MongoClient = pymongo.MongoClient( - self.mongo_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") - ) - self.database = self.connection[self.database_name] - self.similarity = similarity 
self.embedding_field = embedding_field self.embedding_dim = embedding_dim @@ -75,8 +71,13 @@ def __init__( self.recreate_index = recreate_index self.vector_search_index = vector_search_index + self.connection: MongoClient = MongoClient( + self.mongo_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") + ) + self.database = self.connection[self.database_name] + if self.recreate_index: - self.delete_index() + self._get_collection().drop() # Implicitly create the collection if it doesn't exist if collection_name not in self.database.list_collection_names(): @@ -86,16 +87,12 @@ def __init__( def _create_document_field_map(self) -> Dict: return {self.embedding_field: "embedding"} - def _get_collection(self, index=None) -> pymongo.collection.Collection: + def _get_collection(self) -> Collection: """ Returns the collection named by index or returns the collection specified when the driver was initialized. """ - _validate_index_name(index) - if index is not None: - return self.database[index] - else: - return self.database[self.collection_name] + return self.database[self.collection_name] def to_dict(self) -> Dict[str, Any]: """ @@ -113,12 +110,11 @@ def to_dict(self) -> Dict[str, Any]: recreate_index=self.recreate_index, ) - def count_documents(self) -> int: + def count_documents(self, filters: Optional[Dict[str, Any]] = None) -> int: """ Returns how many documents are present in the document store. """ - collection = self._get_collection(self.index) - return collection.count_documents() + return self._get_collection().count_documents({} if filters is None else filters) def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ @@ -131,8 +127,10 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc :return: A list of Documents that match the given filters. 
""" mongo_filters = haystack_filters_to_mongo(filters) - collection = self._get_collection(self.index) - documents = collection.find(mongo_filters) + collection = self._get_collection() + documents = list(collection.find(mongo_filters)) + for doc in documents: + doc.pop("_id", None) return [Document.from_dict(doc) for doc in documents] def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: @@ -154,8 +152,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D if policy == DuplicatePolicy.NONE: policy = DuplicatePolicy.FAIL - collection = self._get_collection(self.index) - field_map = self._create_document_field_map() + collection = self._get_collection() mongo_documents = [doc.to_dict() for doc in documents] operations: List[Union[UpdateOne, InsertOne, ReplaceOne]] written_docs = len(documents) @@ -169,7 +166,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D else: operations = [ReplaceOne({"id": doc["id"]}, upsert=True, replacement=doc) for doc in mongo_documents] - collection.bulk_write(operations) + print(operations) + + try: + collection.bulk_write(operations) + except BulkWriteError as e: + raise DuplicateDocumentError(f"Duplicate documents found: {e.details['writeErrors']}") return written_docs @@ -181,8 +183,7 @@ def delete_documents(self, document_ids: List[str]) -> None: """ if not document_ids: return - collection = self._get_collection(self.index) - collection.delete_many(filter={"id": {"$in": document_ids}}) + self._get_collection().delete_many(filter={"id": {"$in": document_ids}}) diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index c3f8240c5..5eed532fe 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -12,36 +12,60 @@ from 
haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore from pandas import DataFrame +import pytest +from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore + + +@pytest.fixture +def document_store(): + store = MongoDBAtlasDocumentStore( + mongo_connection_string="mongodb+srv://sarazanzottera:dN3hmY9RNxRDni13@clustertest.gwkckbk.mongodb.net/?retryWrites=true&w=majority", + database_name="CusterTest", + collection_name="test" + ) + yield store + store._get_collection().drop() + class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): def test_write_documents(self, document_store: MongoDBAtlasDocumentStore): - docs = [Document(id="1")] + docs = [Document(content="some text")] assert document_store.write_documents(docs) == 1 with pytest.raises(DuplicateDocumentError): document_store.write_documents(docs, DuplicatePolicy.FAIL) def test_write_blob(self, document_store: MongoDBAtlasDocumentStore): bytestream = ByteStream(b"test", meta={"meta_key": "meta_value"}, mime_type="mime_type") - docs = [Document(id="1", blob=bytestream)] + docs = [Document(blob=bytestream)] document_store.write_documents(docs) retrieved_docs = document_store.filter_documents() assert retrieved_docs == docs def test_write_dataframe(self, document_store: MongoDBAtlasDocumentStore): dataframe = DataFrame({"col1": [1, 2], "col2": [3, 4]}) - docs = [Document(id="1", dataframe=dataframe)] + docs = [Document(dataframe=dataframe)] document_store.write_documents(docs) retrieved_docs = document_store.filter_documents() assert retrieved_docs == docs - def test_init(self): - document_store = MongoDBAtlasDocumentStore() - - def test_to_dict(self): - document_store = MongoDBAtlasDocumentStore() - + @patch("haystack_integrations.document_stores.mongodb_atlas.document_store.MongoClient") + def test_to_dict(self, client_mock): + document_store = MongoDBAtlasDocumentStore( + 
mongo_connection_string="mongo_connection_string", + database_name="database_name", + collection_name="collection_name" + ) assert document_store.to_dict() == { "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", - "init_parameters": {}, + "init_parameters": { + "mongo_connection_string": "mongo_connection_string", + "database_name": "database_name", + "collection_name": "collection_name", + "vector_search_index": None, + "embedding_dim": 768, + "similarity": "cosine", + "embedding_field": "embedding", + "recreate_index": False, + }, } From 44312e9fab4d92ce8b16371894ffcc18377279d8 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 14:24:58 +0100 Subject: [PATCH 03/25] env var check --- .../mongodb_atlas/tests/test_document_store.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index 5eed532fe..e95928bc1 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 - +import os from unittest.mock import patch import pytest @@ -19,14 +19,18 @@ @pytest.fixture def document_store(): store = MongoDBAtlasDocumentStore( - mongo_connection_string="mongodb+srv://sarazanzottera:dN3hmY9RNxRDni13@clustertest.gwkckbk.mongodb.net/?retryWrites=true&w=majority", - database_name="CusterTest", + mongo_connection_string=os.environ["MONGO_CONNECTION_STRING"], + database_name="ClusterTest", collection_name="test" ) yield store store._get_collection().drop() +@pytest.mark.skipif( + "MONGO_CONNECTION_STRING" not in os.environ, + reason="No MongoDB Atlas connection string provided", +) class TestDocumentStore(CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest): def test_write_documents(self, 
document_store: MongoDBAtlasDocumentStore): From ebced28ab85ff85d1e61b8ebbc5a3165932a6e3d Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 14:27:48 +0100 Subject: [PATCH 04/25] add workflow --- .github/workflows/mongodb_atlas.yml | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 .github/workflows/mongodb_atlas.yml diff --git a/.github/workflows/mongodb_atlas.yml b/.github/workflows/mongodb_atlas.yml new file mode 100644 index 000000000..ba7847618 --- /dev/null +++ b/.github/workflows/mongodb_atlas.yml @@ -0,0 +1,58 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / mongodb_atlas + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/mongodb_atlas/**" + - ".github/workflows/mongodb_atlas.yml" + +defaults: + run: + working-directory: integrations/mongodb_atlas + +concurrency: + group: mongodb-atlas-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + MONGO_CONNECTION_STRING: ${{ secrets.MONGO_CONNECTION_STRING }} + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10', '3.11'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + working-directory: integrations/mongodb_atlas + if: matrix.python-version == '3.9' + run: hatch run lint:all + + - name: Generate docs + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: 
hatch run docs + + - name: Run tests + working-directory: integrations/mongodb_atlas + run: hatch run cov From 90dce1ae942dc960d50129ca1a32939c0ad6a96a Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 14:37:33 +0100 Subject: [PATCH 05/25] lint --- .../mongodb_atlas/examples/example.py | 7 +- .../mongodb_atlas/document_store.py | 65 +++++-------------- .../document_stores/mongodb_atlas/errors.py | 6 +- .../document_stores/mongodb_atlas/filters.py | 10 +-- .../tests/test_document_store.py | 9 +-- 5 files changed, 29 insertions(+), 68 deletions(-) diff --git a/integrations/mongodb_atlas/examples/example.py b/integrations/mongodb_atlas/examples/example.py index 5f5077103..8370b25cb 100644 --- a/integrations/mongodb_atlas/examples/example.py +++ b/integrations/mongodb_atlas/examples/example.py @@ -1,11 +1,12 @@ from pymongo.mongo_client import MongoClient from pymongo.server_api import ServerApi + uri = "mongodb+srv://sarazanzottera:dN3hmY9RNxRDni13@clustertest.gwkckbk.mongodb.net/?retryWrites=true&w=majority" # Create a new client and connect to the server -client = MongoClient(uri, server_api=ServerApi('1')) +client = MongoClient(uri, server_api=ServerApi("1")) # Send a ping to confirm a successful connection try: - client.admin.command('ping') + client.admin.command("ping") print("Pinged your deployment. 
You successfully connected to MongoDB!") except Exception as e: - print(e) \ No newline at end of file + print(e) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index c2d81b8ec..64d324fe0 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -1,39 +1,30 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Union - -import re import logging +import re +from typing import Any, Dict, List, Optional, Union -from pymongo import InsertOne, ReplaceOne, UpdateOne, MongoClient -from pymongo.collection import Collection -from pymongo.driver_info import DriverInfo -from pymongo.errors import BulkWriteError from haystack import default_to_dict from haystack.dataclasses.document import Document -from haystack.document_stores.types import DuplicatePolicy from haystack.document_stores.errors import DuplicateDocumentError - +from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.document_stores.mongodb_atlas.filters import haystack_filters_to_mongo - +from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne +from pymongo.collection import Collection +from pymongo.driver_info import DriverInfo +from pymongo.errors import BulkWriteError logger = logging.getLogger(__name__) -METRIC_TYPES = ["euclidean", "cosine", "dotProduct"] - - class MongoDBAtlasDocumentStore: def __init__( self, + *, mongo_connection_string: str, database_name: str, collection_name: str, - vector_search_index: Optional[str] = None, - embedding_dim: int = 768, - similarity: str = "cosine", - embedding_field: str = "embedding", recreate_index: bool = 
False, ): """ @@ -41,35 +32,21 @@ def __init__( This Document Store uses MongoDB Atlas as a backend (https://www.mongodb.com/docs/atlas/getting-started/). - :param mongo_connection_string: MongoDB Atlas connection string in the format: + :param mongo_connection_string: MongoDB Atlas connection string in the format: "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}". This can be obtained on the MongoDB Atlas Dashboard by clicking on the `CONNECT` button. :param database_name: Name of the database to use. :param collection_name: Name of the collection to use. - :param vector_search_index: The name of the index to use for vector search. To use the search index it must have been created in the Atlas web UI before. None by default. - :param embedding_dim: Dimensionality of embeddings, 768 by default. - :param similarity: The similarity function to use for the embeddings. One of "euclidean", "cosine" or "dotProduct". "cosine" is the default. - :param embedding_field: The name of the field in the document that contains the embedding. :param recreate_index: Whether to recreate the index when initializing the document store. """ - if similarity not in METRIC_TYPES: - raise ValueError( - "MongoDB Atlas currently supports dotProduct, cosine and euclidean metrics. Please set similarity to one of the above." - ) if collection_name and not bool(re.match(r"^[a-zA-Z0-9\-_]+$", collection_name)): - raise ValueError( - f'Invalid collection name: "{collection_name}". Index name can only contain letters, numbers, hyphens, or underscores.' - ) - + msg = f'Invalid collection name: "{collection_name}". It can only contain letters, numbers, -, or _.' 
+ raise ValueError(msg) + self.mongo_connection_string = mongo_connection_string self.database_name = database_name self.collection_name = collection_name - self.similarity = similarity - self.embedding_field = embedding_field - self.embedding_dim = embedding_dim - self.index = collection_name self.recreate_index = recreate_index - self.vector_search_index = vector_search_index self.connection: MongoClient = MongoClient( self.mongo_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") @@ -84,9 +61,6 @@ def __init__( self.database.create_collection(self.collection_name) self._get_collection().create_index("id", unique=True) - def _create_document_field_map(self) -> Dict: - return {self.embedding_field: "embedding"} - def _get_collection(self) -> Collection: """ Returns the collection named by index or returns the collection specified when the @@ -103,10 +77,6 @@ def to_dict(self) -> Dict[str, Any]: mongo_connection_string=self.mongo_connection_string, database_name=self.database_name, collection_name=self.collection_name, - vector_search_index=self.vector_search_index, - embedding_dim=self.embedding_dim, - similarity=self.similarity, - embedding_field=self.embedding_field, recreate_index=self.recreate_index, ) @@ -115,7 +85,7 @@ def count_documents(self, filters: Optional[Dict[str, Any]] = None) -> int: Returns how many documents are present in the document store. """ return self._get_collection().count_documents({} if filters is None else filters) - + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ Returns the documents that match the filters provided. 
@@ -166,12 +136,11 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D else: operations = [ReplaceOne({"id": doc["id"]}, upsert=True, replacement=doc) for doc in mongo_documents] - print(operations) - try: collection.bulk_write(operations) except BulkWriteError as e: - raise DuplicateDocumentError(f"Duplicate documents found: {e.details['writeErrors']}") + msg = f"Duplicate documents found: {e.details['writeErrors']}" + raise DuplicateDocumentError(msg) from e return written_docs @@ -184,7 +153,3 @@ def delete_documents(self, document_ids: List[str]) -> None: if not document_ids: return self._get_collection().delete_many(filter={"id": {"$in": document_ids}}) - - - - diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py index a15e69cd1..132156bd0 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/errors.py @@ -1,8 +1,4 @@ -from typing import Optional - - class MongoDBAtlasDocumentStoreError(Exception): """Exception for issues that occur in a MongoDBAtlas document store""" - def __init__(self, message: Optional[str] = None): - super().__init__(message=message) + pass diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py index 9ad1ff42b..ce2eae518 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py @@ -1,7 +1,9 @@ -import warnings +import logging +logger = logging.getLogger(__name__) -def haystack_filters_to_mongo(filters): + +def 
haystack_filters_to_mongo(_): # TODO - warnings.warn("Filtering not yet implemented for MongoDBAtlasDocumentStore!") - return {} \ No newline at end of file + logger.warning("Filtering not yet implemented for MongoDBAtlasDocumentStore") + return {} diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index e95928bc1..daf60a855 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -12,16 +12,13 @@ from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore from pandas import DataFrame -import pytest -from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore - @pytest.fixture def document_store(): store = MongoDBAtlasDocumentStore( mongo_connection_string=os.environ["MONGO_CONNECTION_STRING"], database_name="ClusterTest", - collection_name="test" + collection_name="test", ) yield store store._get_collection().drop() @@ -54,11 +51,11 @@ def test_write_dataframe(self, document_store: MongoDBAtlasDocumentStore): assert retrieved_docs == docs @patch("haystack_integrations.document_stores.mongodb_atlas.document_store.MongoClient") - def test_to_dict(self, client_mock): + def test_to_dict(self, _): document_store = MongoDBAtlasDocumentStore( mongo_connection_string="mongo_connection_string", database_name="database_name", - collection_name="collection_name" + collection_name="collection_name", ) assert document_store.to_dict() == { "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", From 78c2b880902665ed171f5b761cec0dda03c905c3 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 15:30:14 +0100 Subject: [PATCH 06/25] add type ignores --- .../document_stores/mongodb_atlas/document_store.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index 64d324fe0..3d433f88f 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -10,10 +10,10 @@ from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.document_stores.mongodb_atlas.filters import haystack_filters_to_mongo -from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne -from pymongo.collection import Collection -from pymongo.driver_info import DriverInfo -from pymongo.errors import BulkWriteError +from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne # type: ignore +from pymongo.collection import Collection # type: ignore +from pymongo.driver_info import DriverInfo # type: ignore +from pymongo.errors import BulkWriteError # type: ignore logger = logging.getLogger(__name__) From 7b10a4e1d870ff912937352d57ef7f545f0f1fab Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 15:37:40 +0100 Subject: [PATCH 07/25] labeler --- .github/labeler.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/labeler.yml b/.github/labeler.yml index dac3bf015..063079a29 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -64,6 +64,11 @@ integration:llama_cpp: - any-glob-to-any-file: "integrations/llama_cpp/**/*" - any-glob-to-any-file: ".github/workflows/llama_cpp.yml" +integration:mongodb-atlas: + - changed-files: + - any-glob-to-any-file: "integrations/mongodb_atlas/**/*" + - any-glob-to-any-file: ".github/workflows/mongodb_atlas.yml" + integration:ollama: - changed-files: - any-glob-to-any-file: "integrations/ollama/**/*" From 
177c7b0feabc149557f7e976320b8cfb479f213f Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 15:47:03 +0100 Subject: [PATCH 08/25] one failing test --- integrations/mongodb_atlas/tests/test_document_store.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index daf60a855..bbdb66e89 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -63,10 +63,6 @@ def test_to_dict(self, _): "mongo_connection_string": "mongo_connection_string", "database_name": "database_name", "collection_name": "collection_name", - "vector_search_index": None, - "embedding_dim": 768, - "similarity": "cosine", - "embedding_field": "embedding", "recreate_index": False, }, } From d55d18b35dfbe6e07d2c489e6be0866fe289dfee Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 15:48:34 +0100 Subject: [PATCH 09/25] reduce matrix --- .github/workflows/mongodb_atlas.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mongodb_atlas.yml b/.github/workflows/mongodb_atlas.yml index ba7847618..22b3c612e 100644 --- a/.github/workflows/mongodb_atlas.yml +++ b/.github/workflows/mongodb_atlas.yml @@ -30,8 +30,9 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ['3.9', '3.10', '3.11'] + # MongoDB Atlas run against a live instance so we should avoid overwhelming it + os: [ubuntu-latest] + python-version: ['3.9', '3.10'] steps: - uses: actions/checkout@v4 From e34f9a09bc81a850af4171a4fba347fbc37e192f Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 17:09:42 +0100 Subject: [PATCH 10/25] reducing the matrix even more --- .github/workflows/mongodb_atlas.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mongodb_atlas.yml 
b/.github/workflows/mongodb_atlas.yml index 22b3c612e..eb2a1bb77 100644 --- a/.github/workflows/mongodb_atlas.yml +++ b/.github/workflows/mongodb_atlas.yml @@ -32,7 +32,7 @@ jobs: matrix: # MongoDB Atlas run against a live instance so we should avoid overwhelming it os: [ubuntu-latest] - python-version: ['3.9', '3.10'] + python-version: ['3.10'] steps: - uses: actions/checkout@v4 From 7f5ed82ee39115604ce9e72c92fa9737539d5024 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 17:17:36 +0100 Subject: [PATCH 11/25] use different collections --- integrations/mongodb_atlas/tests/test_document_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index bbdb66e89..fcaec1192 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import os +from uuid import uuid4 from unittest.mock import patch import pytest @@ -14,11 +15,11 @@ @pytest.fixture -def document_store(): +def document_store(request): store = MongoDBAtlasDocumentStore( mongo_connection_string=os.environ["MONGO_CONNECTION_STRING"], database_name="ClusterTest", - collection_name="test", + collection_name="test_"+request.node.name+str(uuid4()), ) yield store store._get_collection().drop() From bbbf08c1c8e94c042a7dca038e679feaad74f724 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 17:54:51 +0100 Subject: [PATCH 12/25] expand matric --- .github/workflows/mongodb_atlas.yml | 5 +- .../mongodb_atlas/examples/example.py | 54 ++++++++++++++----- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/.github/workflows/mongodb_atlas.yml b/.github/workflows/mongodb_atlas.yml index eb2a1bb77..ba7847618 100644 --- a/.github/workflows/mongodb_atlas.yml +++ b/.github/workflows/mongodb_atlas.yml @@ -30,9 +30,8 @@ jobs: strategy: 
fail-fast: false matrix: - # MongoDB Atlas run against a live instance so we should avoid overwhelming it - os: [ubuntu-latest] - python-version: ['3.10'] + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v4 diff --git a/integrations/mongodb_atlas/examples/example.py b/integrations/mongodb_atlas/examples/example.py index 8370b25cb..48d92ef60 100644 --- a/integrations/mongodb_atlas/examples/example.py +++ b/integrations/mongodb_atlas/examples/example.py @@ -1,12 +1,42 @@ -from pymongo.mongo_client import MongoClient -from pymongo.server_api import ServerApi - -uri = "mongodb+srv://sarazanzottera:dN3hmY9RNxRDni13@clustertest.gwkckbk.mongodb.net/?retryWrites=true&w=majority" -# Create a new client and connect to the server -client = MongoClient(uri, server_api=ServerApi("1")) -# Send a ping to confirm a successful connection -try: - client.admin.command("ping") - print("Pinged your deployment. You successfully connected to MongoDB!") -except Exception as e: - print(e) +# Install required packages for this example, including mongodb-atlas-haystack and other libraries needed +# for Markdown conversion and embeddings generation. Use the following command: +# +# pip install mongodb-atlas-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0" +# +# Download some Markdown files to index. 
+# git clone https://github.com/anakin87/neural-search-pills + +import glob +from haystack import Pipeline +from haystack.components.converters import MarkdownToDocument +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.preprocessors import DocumentSplitter +from haystack.components.writers import DocumentWriter +from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore + +# Provide your connection string +connection_string = input("Enter your MongoDB Atlas connection string: ") + +# Initialize the document store +document_store = MongoDBAtlasDocumentStore( + mongo_connection_string=connection_string, + database_name="haystack_test", + collection_name="test_collection", +) + +# Create the indexing Pipeline and index some documents +file_paths = glob.glob("neural-search-pills/pills/*.md") + + +indexing = Pipeline() +indexing.add_component("converter", MarkdownToDocument()) +indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2)) +indexing.add_component("embedder", SentenceTransformersDocumentEmbedder()) +indexing.add_component("writer", DocumentWriter(document_store)) +indexing.connect("converter", "splitter") +indexing.connect("splitter", "embedder") +indexing.connect("embedder", "writer") + +indexing.run({"converter": {"sources": file_paths}}) + +print("Indexed documents:" + document_store.count_documents() + "\n - ".join(document_store.filter_documents())) \ No newline at end of file From 89809565e94a5ac914d566bb4ece8cdf47c67f50 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 Feb 2024 17:57:53 +0100 Subject: [PATCH 13/25] lint --- integrations/mongodb_atlas/examples/example.py | 5 +++-- integrations/mongodb_atlas/tests/test_document_store.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/integrations/mongodb_atlas/examples/example.py b/integrations/mongodb_atlas/examples/example.py 
index 48d92ef60..d04c578c4 100644 --- a/integrations/mongodb_atlas/examples/example.py +++ b/integrations/mongodb_atlas/examples/example.py @@ -7,9 +7,10 @@ # git clone https://github.com/anakin87/neural-search-pills import glob + from haystack import Pipeline from haystack.components.converters import MarkdownToDocument -from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.embedders import SentenceTransformersDocumentEmbedder from haystack.components.preprocessors import DocumentSplitter from haystack.components.writers import DocumentWriter from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore @@ -39,4 +40,4 @@ indexing.run({"converter": {"sources": file_paths}}) -print("Indexed documents:" + document_store.count_documents() + "\n - ".join(document_store.filter_documents())) \ No newline at end of file +print("Indexed documents:" + document_store.count_documents() + "\n - ".join(document_store.filter_documents())) diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index fcaec1192..722248ca0 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 import os -from uuid import uuid4 from unittest.mock import patch +from uuid import uuid4 import pytest from haystack.dataclasses.document import ByteStream, Document @@ -19,7 +19,7 @@ def document_store(request): store = MongoDBAtlasDocumentStore( mongo_connection_string=os.environ["MONGO_CONNECTION_STRING"], database_name="ClusterTest", - collection_name="test_"+request.node.name+str(uuid4()), + collection_name="test_" + request.node.name + str(uuid4()), ) yield store store._get_collection().drop() From e0376cb94c77b4b2423e1ee96a08f738ace10cdb Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 14 
Feb 2024 17:58:36 +0100 Subject: [PATCH 14/25] run linting only once --- .github/workflows/mongodb_atlas.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mongodb_atlas.yml b/.github/workflows/mongodb_atlas.yml index ba7847618..af19776cd 100644 --- a/.github/workflows/mongodb_atlas.yml +++ b/.github/workflows/mongodb_atlas.yml @@ -46,7 +46,7 @@ jobs: - name: Lint working-directory: integrations/mongodb_atlas - if: matrix.python-version == '3.9' + if: matrix.python-version == '3.9' && runner.os == 'Linux' run: hatch run lint:all - name: Generate docs From 252367fba4179da4760e62470b334b692f3cb81a Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 11:14:47 +0100 Subject: [PATCH 15/25] better readme --- integrations/mongodb_atlas/README.md | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/integrations/mongodb_atlas/README.md b/integrations/mongodb_atlas/README.md index fa338618c..f305b9958 100644 --- a/integrations/mongodb_atlas/README.md +++ b/integrations/mongodb_atlas/README.md @@ -3,14 +3,13 @@ [![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mongodb-atlas-haystack.svg)](https://pypi.org/project/mongodb-atlas-haystack) ---- +----- **Table of Contents** -- [mongodb-atlas-haystack](#mongodb-atlas-haystack) - - [Installation](#installation) - - [Testing](#testing) - - [License](#license) +- [Installation](#installation) +- [Contributing](#contributing) +- [License](#license) ## Installation @@ -18,14 +17,27 @@ pip install mongodb-atlas-haystack ``` -## Testing +## Contributing -TODO +`hatch` is the best way to interact with this project. To install it: +```sh +pip install hatch +``` -```console +To run the linters `ruff` and `mypy`: +``` +hatch run lint:all +``` + +To run all the tests: +``` hatch run test ``` +Note: you need your own
MongoDB Atlas account to run the tests; you can create one at +https://www.mongodb.com/cloud/atlas/register. Once you have it, export the connection string +to the `MONGO_CONNECTION_STRING` environment variable. If you forget to do so, all the tests will be skipped. + ## License `mongodb-atlas-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. From 99b547e1f9845ed82daebd9c2a9c885064586122 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 15:49:18 +0100 Subject: [PATCH 16/25] feedback part 1 --- .../mongodb_atlas/document_store.py | 57 ++++++++++--------- .../document_stores/mongodb_atlas/filters.py | 9 +-- .../tests/test_document_store.py | 23 ++++++-- 3 files changed, 51 insertions(+), 38 deletions(-) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index 3d433f88f..fccba7b7a 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -5,13 +5,12 @@ import re from typing import Any, Dict, List, Optional, Union -from haystack import default_to_dict +from haystack import default_to_dict, default_from_dict from haystack.dataclasses.document import Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack_integrations.document_stores.mongodb_atlas.filters import haystack_filters_to_mongo from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne # type: ignore -from pymongo.collection import Collection # type: ignore from pymongo.driver_info import DriverInfo # type: ignore from pymongo.errors import BulkWriteError # type: ignore @@ -25,7 +24,7 @@ def __init__( mongo_connection_string:
str, database_name: str, collection_name: str, - recreate_index: bool = False, + recreate_collection: bool = False, ): """ Creates a new MongoDBAtlasDocumentStore instance. @@ -37,7 +36,7 @@ def __init__( This can be obtained on the MongoDB Atlas Dashboard by clicking on the `CONNECT` button. :param database_name: Name of the database to use. :param collection_name: Name of the collection to use. - :param recreate_index: Whether to recreate the index when initializing the document store. + :param recreate_collection: Whether to recreate the collection when initializing the document store. """ if collection_name and not bool(re.match(r"^[a-zA-Z0-9\-_]+$", collection_name)): msg = f'Invalid collection name: "{collection_name}". It can only contain letters, numbers, -, or _.' @@ -46,27 +45,22 @@ def __init__( self.mongo_connection_string = mongo_connection_string self.database_name = database_name self.collection_name = collection_name - self.recreate_index = recreate_index + self.recreate_collection = recreate_collection self.connection: MongoClient = MongoClient( self.mongo_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") ) - self.database = self.connection[self.database_name] + database = self.connection[self.database_name] - if self.recreate_index: - self._get_collection().drop() + if self.recreate_collection and self.collection_name in database.list_collection_names(): + database[self.collection_name].drop() # Implicitly create the collection if it doesn't exist - if collection_name not in self.database.list_collection_names(): - self.database.create_collection(self.collection_name) - self._get_collection().create_index("id", unique=True) + if collection_name not in database.list_collection_names(): + database.create_collection(self.collection_name) + database[self.collection_name].create_index("id", unique=True) - def _get_collection(self) -> Collection: - """ - Returns the collection named by index or returns the collection specified 
when the - driver was initialized. - """ - return self.database[self.collection_name] + self.collection = database[self.collection_name] def to_dict(self) -> Dict[str, Any]: """ @@ -77,30 +71,38 @@ def to_dict(self) -> Dict[str, Any]: mongo_connection_string=self.mongo_connection_string, database_name=self.database_name, collection_name=self.collection_name, - recreate_index=self.recreate_index, + recreate_collection=self.recreate_collection, ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasDocumentStore": + """ + Utility function that deserializes this Document Store's configuration from a dictionary. + """ + return default_from_dict(cls, data) def count_documents(self, filters: Optional[Dict[str, Any]] = None) -> int: """ Returns how many documents are present in the document store. + + :param filters: The filters to apply. It counts only the documents that match the filters. """ - return self._get_collection().count_documents({} if filters is None else filters) + return self.collection.count_documents(filters or {}) def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ Returns the documents that match the filters provided. For a detailed specification of the filters, - refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) + refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering). - :param filters: The filters to apply to the document list. + :param filters: The filters to apply. It returns only the documents that match the filters. :return: A list of Documents that match the given filters. 
""" mongo_filters = haystack_filters_to_mongo(filters) - collection = self._get_collection() - documents = list(collection.find(mongo_filters)) + documents = list(self.collection.find(mongo_filters)) for doc in documents: - doc.pop("_id", None) + doc.pop("_id", None) # MongoDB's internal id doesn't belong into a Haystack document, so we remove it. return [Document.from_dict(doc) for doc in documents] def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: @@ -122,14 +124,13 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D if policy == DuplicatePolicy.NONE: policy = DuplicatePolicy.FAIL - collection = self._get_collection() mongo_documents = [doc.to_dict() for doc in documents] operations: List[Union[UpdateOne, InsertOne, ReplaceOne]] written_docs = len(documents) if policy == DuplicatePolicy.SKIP: operations = [UpdateOne({"id": doc["id"]}, {"$setOnInsert": doc}, upsert=True) for doc in mongo_documents] - existing_documents = collection.count_documents({"id": {"$in": [doc.id for doc in documents]}}) + existing_documents = self.collection.count_documents({"id": {"$in": [doc.id for doc in documents]}}) written_docs -= existing_documents elif policy == DuplicatePolicy.FAIL: operations = [InsertOne(doc) for doc in mongo_documents] @@ -137,7 +138,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D operations = [ReplaceOne({"id": doc["id"]}, upsert=True, replacement=doc) for doc in mongo_documents] try: - collection.bulk_write(operations) + self.collection.bulk_write(operations) except BulkWriteError as e: msg = f"Duplicate documents found: {e.details['writeErrors']}" raise DuplicateDocumentError(msg) from e @@ -152,4 +153,4 @@ def delete_documents(self, document_ids: List[str]) -> None: """ if not document_ids: return - self._get_collection().delete_many(filter={"id": {"$in": document_ids}}) + self.collection.delete_many(filter={"id": {"$in": 
document_ids}}) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py index ce2eae518..90e20de67 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py @@ -1,9 +1,6 @@ -import logging -logger = logging.getLogger(__name__) - - -def haystack_filters_to_mongo(_): +def haystack_filters_to_mongo(filters): # TODO - logger.warning("Filtering not yet implemented for MongoDBAtlasDocumentStore") + if filters: + raise "Filtering not yet implemented for MongoDBAtlasDocumentStore" return {} diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index 722248ca0..770880be6 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -18,11 +18,11 @@ def document_store(request): store = MongoDBAtlasDocumentStore( mongo_connection_string=os.environ["MONGO_CONNECTION_STRING"], - database_name="ClusterTest", - collection_name="test_" + request.node.name + str(uuid4()), + database_name="haystack_integration_test", + collection_name=request.node.name + str(uuid4()), ) yield store - store._get_collection().drop() + store.collection.drop() @pytest.mark.skipif( @@ -64,6 +64,21 @@ def test_to_dict(self, _): "mongo_connection_string": "mongo_connection_string", "database_name": "database_name", "collection_name": "collection_name", - "recreate_index": False, + "recreate_collection": False, }, } + + @patch("haystack_integrations.document_stores.mongodb_atlas.document_store.MongoClient") + def test_from_dict(self, _): + docstore = MongoDBAtlasDocumentStore.from_dict({ + "type": 
"haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", + "init_parameters": { + "mongo_connection_string": "mongo_connection_string", + "database_name": "database_name", + "collection_name": "collection_name", + "recreate_collection": True, + }}) + assert docstore.mongo_connection_string == "mongo_connection_string" + assert docstore.database_name == "database_name" + assert docstore.collection_name == "collection_name" + assert docstore.recreate_collection == True From 71f1605b714cb4bc813613fd19b6683ceba7cf05 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 15:58:08 +0100 Subject: [PATCH 17/25] secrets management --- .../mongodb_atlas/document_store.py | 19 +++++++++++++---- .../tests/test_document_store.py | 21 ++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index fccba7b7a..81937aac8 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -6,6 +6,7 @@ from typing import Any, Dict, List, Optional, Union from haystack import default_to_dict, default_from_dict +from haystack.utils import Secret, deserialize_secrets_inplace from haystack.dataclasses.document import Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy @@ -21,7 +22,7 @@ class MongoDBAtlasDocumentStore: def __init__( self, *, - mongo_connection_string: str, + mongo_connection_string: Secret = Secret.from_env_var("MONGO_CONNECTION_STRING"), # noqa: B008 database_name: str, collection_name: str, recreate_collection: bool = False, @@ -34,6 +35,7 @@ def __init__( :param 
mongo_connection_string: MongoDB Atlas connection string in the format: "mongodb+srv://{mongo_atlas_username}:{mongo_atlas_password}@{mongo_atlas_host}/?{mongo_atlas_params_string}". This can be obtained on the MongoDB Atlas Dashboard by clicking on the `CONNECT` button. + This value will be read automatically from the env var "MONGO_CONNECTION_STRING". :param database_name: Name of the database to use. :param collection_name: Name of the collection to use. :param recreate_collection: Whether to recreate the collection when initializing the document store. @@ -41,14 +43,22 @@ def __init__( if collection_name and not bool(re.match(r"^[a-zA-Z0-9\-_]+$", collection_name)): msg = f'Invalid collection name: "{collection_name}". It can only contain letters, numbers, -, or _.' raise ValueError(msg) - + + resolved_connection_string = mongo_connection_string.resolve_value() + if resolved_connection_string is None: + msg = ( + "MongoDBAtlasDocumentStore expects an API key. " + "Set the MONGO_CONNECTION_STRING environment variable (recommended) or pass it explicitly." 
+ ) + raise ValueError(msg) self.mongo_connection_string = mongo_connection_string + self.database_name = database_name self.collection_name = collection_name self.recreate_collection = recreate_collection self.connection: MongoClient = MongoClient( - self.mongo_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") + resolved_connection_string, driver=DriverInfo(name="MongoDBAtlasHaystackIntegration") ) database = self.connection[self.database_name] @@ -68,7 +78,7 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - mongo_connection_string=self.mongo_connection_string, + mongo_connection_string=self.mongo_connection_string.to_dict(), database_name=self.database_name, collection_name=self.collection_name, recreate_collection=self.recreate_collection, @@ -79,6 +89,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasDocumentStore": """ Utility function that deserializes this Document Store's configuration from a dictionary. """ + deserialize_secrets_inplace(data["init_parameters"], keys=["mongo_connection_string"]) return default_from_dict(cls, data) def count_documents(self, filters: Optional[Dict[str, Any]] = None) -> int: diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index 770880be6..0c01b20a7 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -6,6 +6,7 @@ from uuid import uuid4 import pytest +from haystack.utils import Secret from haystack.dataclasses.document import ByteStream, Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy @@ -17,7 +18,6 @@ @pytest.fixture def document_store(request): store = MongoDBAtlasDocumentStore( - mongo_connection_string=os.environ["MONGO_CONNECTION_STRING"], database_name="haystack_integration_test", collection_name=request.node.name + 
str(uuid4()), ) @@ -54,14 +54,19 @@ def test_write_dataframe(self, document_store: MongoDBAtlasDocumentStore): @patch("haystack_integrations.document_stores.mongodb_atlas.document_store.MongoClient") def test_to_dict(self, _): document_store = MongoDBAtlasDocumentStore( - mongo_connection_string="mongo_connection_string", database_name="database_name", collection_name="collection_name", ) assert document_store.to_dict() == { "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", "init_parameters": { - "mongo_connection_string": "mongo_connection_string", + "mongo_connection_string": { + "env_vars": [ + "MONGO_CONNECTION_STRING", + ], + "strict": True, + "type": "env_var", + }, "database_name": "database_name", "collection_name": "collection_name", "recreate_collection": False, @@ -73,12 +78,18 @@ def test_from_dict(self, _): docstore = MongoDBAtlasDocumentStore.from_dict({ "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", "init_parameters": { - "mongo_connection_string": "mongo_connection_string", + "mongo_connection_string": { + "env_vars": [ + "MONGO_CONNECTION_STRING", + ], + "strict": True, + "type": "env_var", + }, "database_name": "database_name", "collection_name": "collection_name", "recreate_collection": True, }}) - assert docstore.mongo_connection_string == "mongo_connection_string" + assert docstore.mongo_connection_string == Secret.from_env_var("MONGO_CONNECTION_STRING") assert docstore.database_name == "database_name" assert docstore.collection_name == "collection_name" assert docstore.recreate_collection == True From 6eed0dd75fbe363d9cd01985da73180a5121e5ab Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 15:59:03 +0100 Subject: [PATCH 18/25] haystack pin --- integrations/mongodb_atlas/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/mongodb_atlas/pyproject.toml 
b/integrations/mongodb_atlas/pyproject.toml index 9d544cd93..c8b40ea71 100644 --- a/integrations/mongodb_atlas/pyproject.toml +++ b/integrations/mongodb_atlas/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai", + "haystack-ai>=2.0.0.b6", "pymongo[srv]", ] From c3a1eff4eee0be3f470a80e135877dbee289b30c Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 16:28:32 +0100 Subject: [PATCH 19/25] Update integrations/mongodb_atlas/pyproject.toml Co-authored-by: Madeesh Kannan --- integrations/mongodb_atlas/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/mongodb_atlas/pyproject.toml b/integrations/mongodb_atlas/pyproject.toml index c8b40ea71..efa94f1c1 100644 --- a/integrations/mongodb_atlas/pyproject.toml +++ b/integrations/mongodb_atlas/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai>=2.0.0.b6", + "haystack-ai>=2.0.0b6", "pymongo[srv]", ] From dc4fbdf709d5b59a284a9d7f53f8373b1671146a Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 16:28:46 +0100 Subject: [PATCH 20/25] Update integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py Co-authored-by: Madeesh Kannan --- .../document_stores/mongodb_atlas/document_store.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index 81937aac8..c01b53999 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -45,12 +45,6 @@ def __init__( raise ValueError(msg) 
resolved_connection_string = mongo_connection_string.resolve_value() - if resolved_connection_string is None: - msg = ( - "MongoDBAtlasDocumentStore expects an API key. " - "Set the MONGO_CONNECTION_STRING environment variable (recommended) or pass it explicitly." - ) - raise ValueError(msg) self.mongo_connection_string = mongo_connection_string self.database_name = database_name From 9859b2eeffb2376e30d4299319e65316e9d7d223 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 16:29:55 +0100 Subject: [PATCH 21/25] Update integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py --- .../document_stores/mongodb_atlas/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py index 90e20de67..a55e70b96 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py @@ -1,5 +1,5 @@ -def haystack_filters_to_mongo(filters): +def haystack_filters_to_mongo(filters: Optional[Dict[str, Any]]): # TODO if filters: raise "Filtering not yet implemented for MongoDBAtlasDocumentStore" From bb6e4ba66747e59a4c88c6036b4e4249c59af911 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 17:17:48 +0100 Subject: [PATCH 22/25] lint --- .../document_stores/mongodb_atlas/document_store.py | 8 ++++---- .../document_stores/mongodb_atlas/filters.py | 5 ++++- integrations/mongodb_atlas/tests/test_document_store.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py index 
c01b53999..68c02522e 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/document_store.py @@ -5,11 +5,11 @@ import re from typing import Any, Dict, List, Optional, Union -from haystack import default_to_dict, default_from_dict -from haystack.utils import Secret, deserialize_secrets_inplace +from haystack import default_from_dict, default_to_dict from haystack.dataclasses.document import Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret, deserialize_secrets_inplace from haystack_integrations.document_stores.mongodb_atlas.filters import haystack_filters_to_mongo from pymongo import InsertOne, MongoClient, ReplaceOne, UpdateOne # type: ignore from pymongo.driver_info import DriverInfo # type: ignore @@ -43,7 +43,7 @@ def __init__( if collection_name and not bool(re.match(r"^[a-zA-Z0-9\-_]+$", collection_name)): msg = f'Invalid collection name: "{collection_name}". It can only contain letters, numbers, -, or _.' 
raise ValueError(msg) - + resolved_connection_string = mongo_connection_string.resolve_value() self.mongo_connection_string = mongo_connection_string @@ -77,7 +77,7 @@ def to_dict(self) -> Dict[str, Any]: collection_name=self.collection_name, recreate_collection=self.recreate_collection, ) - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> "MongoDBAtlasDocumentStore": """ diff --git a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py index a55e70b96..f03ca88c0 100644 --- a/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py +++ b/integrations/mongodb_atlas/src/haystack_integrations/document_stores/mongodb_atlas/filters.py @@ -1,6 +1,9 @@ +from typing import Any, Dict, Optional + def haystack_filters_to_mongo(filters: Optional[Dict[str, Any]]): # TODO if filters: - raise "Filtering not yet implemented for MongoDBAtlasDocumentStore" + msg = "Filtering not yet implemented for MongoDBAtlasDocumentStore" + raise ValueError(msg) return {} diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index 0c01b20a7..964796d0b 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -6,11 +6,11 @@ from uuid import uuid4 import pytest -from haystack.utils import Secret from haystack.dataclasses.document import ByteStream, Document from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest +from haystack.utils import Secret from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore from pandas import DataFrame @@ -92,4 +92,4 @@ def 
test_from_dict(self, _): assert docstore.mongo_connection_string == Secret.from_env_var("MONGO_CONNECTION_STRING") assert docstore.database_name == "database_name" assert docstore.collection_name == "collection_name" - assert docstore.recreate_collection == True + assert docstore.recreate_collection From 88746889aee81b03ada30f855bd2c7757e5e6741 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 17:18:51 +0100 Subject: [PATCH 23/25] black --- .../tests/test_document_store.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/integrations/mongodb_atlas/tests/test_document_store.py b/integrations/mongodb_atlas/tests/test_document_store.py index 964796d0b..07aeabc0b 100644 --- a/integrations/mongodb_atlas/tests/test_document_store.py +++ b/integrations/mongodb_atlas/tests/test_document_store.py @@ -75,20 +75,23 @@ def test_to_dict(self, _): @patch("haystack_integrations.document_stores.mongodb_atlas.document_store.MongoClient") def test_from_dict(self, _): - docstore = MongoDBAtlasDocumentStore.from_dict({ - "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", - "init_parameters": { - "mongo_connection_string": { - "env_vars": [ - "MONGO_CONNECTION_STRING", - ], - "strict": True, - "type": "env_var", + docstore = MongoDBAtlasDocumentStore.from_dict( + { + "type": "haystack_integrations.document_stores.mongodb_atlas.document_store.MongoDBAtlasDocumentStore", + "init_parameters": { + "mongo_connection_string": { + "env_vars": [ + "MONGO_CONNECTION_STRING", + ], + "strict": True, + "type": "env_var", + }, + "database_name": "database_name", + "collection_name": "collection_name", + "recreate_collection": True, }, - "database_name": "database_name", - "collection_name": "collection_name", - "recreate_collection": True, - }}) + } + ) assert docstore.mongo_connection_string == Secret.from_env_var("MONGO_CONNECTION_STRING") assert docstore.database_name == "database_name" assert 
docstore.collection_name == "collection_name" From f2c90a5acc37354e1608c8bc40649a2e5f7910f0 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 17:25:26 +0100 Subject: [PATCH 24/25] fix example --- integrations/mongodb_atlas/examples/example.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/mongodb_atlas/examples/example.py b/integrations/mongodb_atlas/examples/example.py index d04c578c4..4b02bfd59 100644 --- a/integrations/mongodb_atlas/examples/example.py +++ b/integrations/mongodb_atlas/examples/example.py @@ -20,7 +20,6 @@ # Initialize the document store document_store = MongoDBAtlasDocumentStore( - mongo_connection_string=connection_string, database_name="haystack_test", collection_name="test_collection", ) From 6f4b7c6ce54acd0857fd144fe598442f6c61e61b Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 15 Feb 2024 17:36:02 +0100 Subject: [PATCH 25/25] readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 21c1e7d9d..d27b41e82 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ deepset-haystack | [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | | [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | | [llama-cpp-haystack](integrations/llama_cpp/) | Generator | [![PyPI - 
Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/llama-cpp-haystack) | [![Test / llama-cpp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml) | +| [mongodb-atlas-haystack](integrations/mongodb_atlas/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/mongodb-atlas-haystack.svg?color=orange)](https://pypi.org/project/mongodb-atlas-haystack) | [![Test / mongodb-atlas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mongodb_atlas.yml) | | [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | | [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) |