From 515e338ad226ce7c7f8cb51d22255c2c9c5e166e Mon Sep 17 00:00:00 2001
From: vrunm <97465624+vrunm@users.noreply.github.com>
Date: Sun, 29 Oct 2023 12:05:02 +0530
Subject: [PATCH 01/38] Add PineconeDocumentStore

---
 document_stores/pinecone/README.md            |  18 +
 document_stores/pinecone/pyproject.toml       | 181 ++++
 .../src/pinecone_haystack/__about__.py        |   4 +
 .../src/pinecone_haystack/__init__.py         |   6 +
 .../src/pinecone_haystack/document_store.py   | 990 ++++++++++++++++++
 .../pinecone/src/pinecone_haystack/errors.py  |  10 +
 .../src/pinecone_haystack/filter_utils.py     | 435 ++++++++
 .../src/pinecone_haystack/retriever.py        | 127 +++
 document_stores/pinecone/tests/__init__.py    |   3 +
 .../pinecone/tests/pinecone_mock.py           | 331 ++++++
 .../tests/test_pinecone_document_store.py     | 334 ++++++
 .../pinecone/tests/test_retriever.py          | 136 +++
 12 files changed, 2575 insertions(+)
 create mode 100644 document_stores/pinecone/README.md
 create mode 100644 document_stores/pinecone/pyproject.toml
 create mode 100644 document_stores/pinecone/src/pinecone_haystack/__about__.py
 create mode 100644 document_stores/pinecone/src/pinecone_haystack/__init__.py
 create mode 100644 document_stores/pinecone/src/pinecone_haystack/document_store.py
 create mode 100644 document_stores/pinecone/src/pinecone_haystack/errors.py
 create mode 100644 document_stores/pinecone/src/pinecone_haystack/filter_utils.py
 create mode 100644 document_stores/pinecone/src/pinecone_haystack/retriever.py
 create mode 100644 document_stores/pinecone/tests/__init__.py
 create mode 100644 document_stores/pinecone/tests/pinecone_mock.py
 create mode 100644 document_stores/pinecone/tests/test_pinecone_document_store.py
 create mode 100644 document_stores/pinecone/tests/test_retriever.py

diff --git a/document_stores/pinecone/README.md b/document_stores/pinecone/README.md
new file mode 100644
index 000000000..3fa4de623
--- /dev/null
+++ b/document_stores/pinecone/README.md
@@ -0,0 +1,18 @@
+[![test](https://github.com/deepset-ai/document-store/actions/workflows/test.yml/badge.svg)](https://github.com/deepset-ai/document-store/actions/workflows/test.yml)
+
+# Pinecone Document Store
+
+A document store that connects the new [Haystack](https://github.com/deepset-ai/haystack/) API, available under the
+`preview` package starting from version 1.15, to the [Pinecone](https://www.pinecone.io) vector database.
+
+While the new API is still under active development, the "Store" architecture is quite stable, and early adopters are encouraged to contribute custom document stores such as this one.
+
+## Installation
+
+Install from source with `pip install .` from this directory; the package is named `pinecone_haystack`.
+
+## Examples
+
+See the usage sketch at the end of this README.
+
+## License
+
+`pinecone_haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
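+
+## Example usage
+
+A minimal sketch of creating the store and retriever, assuming a valid Pinecone API key. The argument
+names mirror `PineconeDocumentStore.__init__` and `PineconeRetriever` as defined later in this patch;
+the key, environment, and index values below are placeholders.
+
+```python
+from pinecone_haystack.document_store import PineconeDocumentStore
+from pinecone_haystack.retriever import PineconeRetriever
+
+# Placeholder credentials; replace with your own API key and index name.
+document_store = PineconeDocumentStore(
+    api_key="YOUR_API_KEY",
+    environment="us-west1-gcp",
+    index="document",
+    embedding_dim=768,
+)
+
+retriever = PineconeRetriever(document_store=document_store, top_k=10)
+result = retriever.run(query_embedding=[0.1] * 768)  # embedding must match embedding_dim
+print(result["documents"])
+```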
diff --git a/document_stores/pinecone/pyproject.toml b/document_stores/pinecone/pyproject.toml new file mode 100644 index 000000000..b8f3f40c0 --- /dev/null +++ b/document_stores/pinecone/pyproject.toml @@ -0,0 +1,181 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "pinecone_haystack" +dynamic = ["version"] +description = '' +readme = "README.md" +requires-python = ">=3.7" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "John Doe", email = "jd@example.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" + "haystack-ai", + "pinecone-client", +] + +[project.urls] +Documentation = "https://github.com/unknown/example-store#readme" +Issues = "https://github.com/unknown/example-store/issues" +Source = "https://github.com/unknown/example-store" + +[tool.hatch.version] +path = "src/pinecone_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.7", "3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", + "numpy", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive {args:src/pinecone_haystack tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["pinecone_haystack"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.coverage.run] +source_pkgs = ["pinecone_haystack", "tests"] +branch = true +parallel = true +omit = [ + "src/pinecone_haystack/__about__.py", + "example" +] + +[tool.coverage.paths] +pinecone_haystack = ["src/pinecone_haystack", "*/pinecone_haystack/src/pinecone_haystack"] +tests = ["tests", "*/pinecone_haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pytest.ini_options] +minversion = "6.0" +markers = [ + "unit: unit tests", + "integration: integration tests" +] + +[[tool.mypy.overrides]] +module = [ + "pinecone.*", + "haystack.*", + "pytest.*" +] +ignore_missing_imports = true diff --git a/document_stores/pinecone/src/pinecone_haystack/__about__.py b/document_stores/pinecone/src/pinecone_haystack/__about__.py new file mode 100644 index 000000000..6294ccfe2 --- /dev/null +++ b/document_stores/pinecone/src/pinecone_haystack/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present John Doe +# +# SPDX-License-Identifier: Apache-2.0 +__version__ = "0.0.1" diff --git a/document_stores/pinecone/src/pinecone_haystack/__init__.py b/document_stores/pinecone/src/pinecone_haystack/__init__.py new file mode 100644 index 000000000..dbd6664ea --- /dev/null +++ b/document_stores/pinecone/src/pinecone_haystack/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present John Doe +# +# SPDX-License-Identifier: Apache-2.0 +from pinecone_haystack.document_store import PineconeDocumentStore + +__all__ = ["PineconeDocumentStore"] diff --git a/document_stores/pinecone/src/pinecone_haystack/document_store.py b/document_stores/pinecone/src/pinecone_haystack/document_store.py new file mode 100644 index 000000000..16266e506 --- /dev/null +++ b/document_stores/pinecone/src/pinecone_haystack/document_store.py @@ -0,0 +1,990 @@ +# SPDX-FileCopyrightText: 2023-present John Doe +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +import json +import logging +import operator +from functools import reduce +from itertools import islice +from typing import Any, Dict, Generator, List, Literal, Optional, Set, Union + +import numpy as np +from tqdm import tqdm +import pinecone + +from haystack.preview.dataclasses import Document +from haystack.preview.document_stores.decorator import document_store +from haystack.preview.document_stores.errors import ( + DuplicateDocumentError, + MissingDocumentError, +) +from haystack.preview.document_stores.protocols import DuplicatePolicy + +from pinecone_haystack.errors import ( + PineconeDocumentStoreError, + PineconeDocumentStoreFilterError, +) +from pinecone_haystack.filter_utils import LogicalFilterClause + + +logger = logging.getLogger(__name__) + + +TYPE_METADATA_FIELD = "doc_type" +DOCUMENT_WITH_EMBEDDING = "vector" +DOCUMENT_WITHOUT_EMBEDDING = "no-vector" +LABEL = "label" + +AND_OPERATOR = "$and" +IN_OPERATOR = "$in" +EQ_OPERATOR = "$eq" + +DocTypeMetadata = Literal["vector", "no-vector", "label"] + + +def _sanitize_index(index: Optional[str]) -> Optional[str]: + if index: 
+ return index.replace("_", "-").lower() + return None + + +def _get_by_path(root, items): + """Access a nested object in root by item sequence.""" + return reduce(operator.getitem, items, root) + + +def _set_by_path(root, items, value): + """Set a value in a nested object in root by item sequence.""" + _get_by_path(root, items[:-1])[items[-1]] = value + + +@document_store +class PineconeDocumentStore: + """ + It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) + to perform similarity search on vectors. In order to use this document store, you need an API key that you can + obtain by creating an account on the [Pinecone website](https://www.pinecone.io). + + This is a hosted document store, + this means that your vectors will not be stored locally but in the cloud. This means that the similarity + search will be run on the cloud as well. + """ + + top_k_limit = 10_000 + top_k_limit_vectors = 1_000 + + def __init__( + self, + api_key: str, + environment: str = "us-west1-gcp", + pinecone_index: Optional["pinecone.Index"] = None, + embedding_dim: int = 768, + batch_size: int = 100, + return_embedding: bool = False, + index: str = "document", + similarity: str = "cosine", + replicas: int = 1, + shards: int = 1, + namespace: Optional[str] = None, + embedding_field: str = "embedding", + progress_bar: bool = True, + duplicate_documents: str = "overwrite", + recreate_index: bool = False, + metadata_config: Optional[Dict] = None, + validate_index_sync: bool = True, + ): + """ + :param api_key: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). + :param environment: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS + regions are supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required. + :param pinecone_index: pinecone-client Index object, an index will be initialized or loaded if not specified. + :param embedding_dim: The embedding vector size. + :param batch_size: The batch size to be used when writing documents to the document store. + :param return_embedding: Whether to return document embeddings. + :param index: Name of index in document store to use. + :param similarity: The similarity function used to compare document vectors. `"cosine"` is the default + and is recommended if you are using a Sentence-Transformer model. `"dot_product"` is more performant + with DPR embeddings. + In both cases, the returned values in Document.score are normalized to be in range [0,1]: + - For `"dot_product"`: `expit(np.asarray(raw_score / 100))` + - For `"cosine"`: `(raw_score + 1) / 2` + :param replicas: The number of replicas. Replicas duplicate the index. They provide higher availability and + throughput. + :param shards: The number of shards to be used in the index. We recommend to use 1 shard per 1GB of data. + :param namespace: Optional namespace. If not specified, None is default. + :param embedding_field: Name of field containing an embedding vector. + :param progress_bar: Whether to show a tqdm progress bar or not. + Can be helpful to disable in production deployments to keep the logs clean. + :param duplicate_documents: Handle duplicate documents based on parameter options.\ + Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. 
+ :param recreate_index: If set to True, an existing Pinecone index will be deleted and a new one will be + created using the config you are using for initialization. Be aware that all data in the old index will be + lost if you choose to recreate the index. Be aware that both the document_index and the label_index will + be recreated. + :param metadata_config: Which metadata fields should be indexed, part of the + [selective metadata filtering](https://www.pinecone.io/docs/manage-indexes/#selective-metadata-indexing) feature. + Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default, + no fields are indexed. + """ + + if metadata_config is None: + metadata_config = {"indexed": []} + # Connect to Pinecone server using python client binding + if not api_key: + raise PineconeDocumentStoreError( + "Pinecone requires an API key, please provide one. https://app.pinecone.io" + ) + + pinecone.init(api_key=api_key, environment=environment) + self._api_key = api_key + + # Format similarity string + self._set_similarity_metric(similarity) + + self.similarity = similarity + self.index: str = self._index(index) + self.embedding_dim = embedding_dim + self.batch_size = batch_size + self.return_embedding = return_embedding + self.embedding_field = embedding_field + self.progress_bar = progress_bar + self.duplicate_documents = duplicate_documents + + # Pinecone index params + self.replicas = replicas + self.shards = shards + self.namespace = namespace + + # Add necessary metadata fields to metadata_config + fields = ["label-id", "query", TYPE_METADATA_FIELD] + metadata_config["indexed"] += fields + self.metadata_config = metadata_config + + # Initialize dictionary of index connections + self.pinecone_indexes: Dict[str, pinecone.Index] = {} + self.return_embedding = return_embedding + self.embedding_field = embedding_field + + # Initialize dictionary to store temporary set of document IDs + self.all_ids: dict = {} + + # Dummy query to be used during searches + self.dummy_query = [0.0] * self.embedding_dim + + if pinecone_index: + if not isinstance(pinecone_index, pinecone.Index): + raise PineconeDocumentStoreError( + f"The parameter `pinecone_index` needs to be a " + f"`pinecone.Index` object. You provided an object of " + f"type `{type(pinecone_index)}`." + ) + self.pinecone_indexes[self.index] = pinecone_index + else: + self.pinecone_indexes[self.index] = self._create_index( + embedding_dim=self.embedding_dim, + index=self.index, + metric_type=self.metric_type, + replicas=self.replicas, + shards=self.shards, + recreate_index=recreate_index, + metadata_config=self.metadata_config, + ) + + super().__init__() + + def _index(self, index) -> str: + index = _sanitize_index(index) or self.index + return index + + def _create_index( + self, + embedding_dim: int, + index: Optional[str] = None, + metric_type: Optional[str] = "cosine", + replicas: Optional[int] = 1, + shards: Optional[int] = 1, + recreate_index: bool = False, + metadata_config: Optional[Dict] = None, + ) -> "pinecone.Index": + """ + Create a new index for storing documents in case an index with the name + doesn't exist already. 
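+        If `recreate_index` is set to `True`, any existing index with the same name is deleted before
+        the new one is created; otherwise an already-open index connection is reused.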
+ """ + if metadata_config is None: + metadata_config = {"indexed": []} + + if recreate_index: + self.delete_index(index) + + # Skip if already exists + if index in self.pinecone_indexes: + index_connection = self.pinecone_indexes[index] + else: + # Search pinecone hosted indexes and create an index if it does not exist + if index not in pinecone.list_indexes(): + pinecone.create_index( + name=index, + dimension=embedding_dim, + metric=metric_type, + replicas=replicas, + shards=shards, + metadata_config=metadata_config, + ) + index_connection = pinecone.Index(index) + + # return index connection + return index_connection + + def get_index_stats(self): + stats = self.pinecone_indexes[self.index] + self.index_stats = stats + # Get index statistics + dims = stats["dimension"] + count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 + logger.info( + "Index statistics: name: %s embedding dimensions: %s, record count: %s", + self.index, + dims, + count, + ) + + return stats, dims, count + + def _index_connection_exists(self, index: str, create: bool = False) -> Optional["pinecone.Index"]: + """ + Check if the index connection exists. If specified, create an index if it does not exist yet. + + :param index: Index name. + :param create: Indicates if an index needs to be created or not. If set to `True`, create an index + and return connection to it, otherwise raise `PineconeDocumentStoreError` error. + :raises PineconeDocumentStoreError: Exception trigger when index connection not found. + """ + if index not in self.pinecone_indexes: + if create: + return self._create_index( + embedding_dim=self.embedding_dim, + index=index, + metric_type=self.metric_type, + replicas=self.replicas, + shards=self.shards, + recreate_index=False, + metadata_config=self.metadata_config, + ) + raise PineconeDocumentStoreError( + f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running " + f"'update_embeddings()' to create and populate an index." + ) + return None + + def _set_similarity_metric(self, similarity: str): + """ + Set vector similarity metric. + """ + if similarity == "cosine": + self.metric_type = similarity + elif similarity == "dot_product": + self.metric_type = "dotproduct" + elif similarity in ["l2", "euclidean"]: + self.metric_type = "euclidean" + else: + raise ValueError( + "The Pinecone document store can currently only support dot_product, cosine and euclidean metrics. " + "Please set similarity to one of the above." + ) + + def _add_local_ids(self, index: str, ids: List[str]): + """ + Add all document IDs to the set of all IDs. + """ + if index not in self.all_ids: + self.all_ids[index] = set() + self.all_ids[index] = self.all_ids[index].union(set(ids)) + + def _add_type_metadata_filter( + self, filters: Dict[str, Any], type_value: Optional[DocTypeMetadata] + ) -> Dict[str, Any]: + """ + Add new filter for `doc_type` metadata field. 
+ """ + if type_value: + new_type_filter = {TYPE_METADATA_FIELD: {EQ_OPERATOR: type_value}} + if AND_OPERATOR not in filters and TYPE_METADATA_FIELD not in filters: + # extend filters with new `doc_type` filter and add $and operator + filters.update(new_type_filter) + all_filters = filters + return {AND_OPERATOR: all_filters} + + filters_content = filters[AND_OPERATOR] if AND_OPERATOR in filters else filters + if TYPE_METADATA_FIELD in filters_content: # type: ignore + current_type_filter = filters_content[TYPE_METADATA_FIELD] # type: ignore + type_values = {type_value} + if isinstance(current_type_filter, str): + type_values.add(current_type_filter) # type: ignore + elif isinstance(current_type_filter, dict): + if EQ_OPERATOR in current_type_filter: + # current `doc_type` filter has single value + type_values.add(current_type_filter[EQ_OPERATOR]) + else: + # current `doc_type` filter has multiple values + type_values.update(set(current_type_filter[IN_OPERATOR])) + new_type_filter = {TYPE_METADATA_FIELD: {IN_OPERATOR: list(type_values)}} # type: ignore + filters_content.update(new_type_filter) # type: ignore + + return filters + + def _get_default_type_metadata(self, index: Optional[str], namespace: Optional[str] = None) -> str: + """ + Get default value for `doc_type` metadata filed. If there is at least one embedding, default value + will be `vector`, otherwise it will be `no-vector`. + """ + if self.get_embedding_count(index=index, namespace=namespace) > 0: + return DOCUMENT_WITH_EMBEDDING + return DOCUMENT_WITHOUT_EMBEDDING + + def _get_vector_count( + self, + index: str, + filters: Optional[Dict[str, Any]], + namespace: Optional[str], + ) -> int: + res = self.pinecone_indexes[index].query( + self.dummy_query, + top_k=self.top_k_limit, + include_values=False, + include_metadata=False, + filter=filters, + namespace=namespace, + ) + return len(res["matches"]) + + def get_document_count( + self, + filters: Dict[str, Any] = None, + index: Optional[str] = None, + only_documents_without_embedding: bool = False, + headers: Optional[Dict[str, str]] = None, + namespace: Optional[str] = None, + type_metadata: Optional[DocTypeMetadata] = None, + ) -> int: + """ + Return the count of documents in the document store. + + :param filters: Optional filters to narrow down the documents which will be counted. + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, + `"$gte"`, `"$lt"`, `"$lte"`), or a metadata field name. + Logical operator keys take a dictionary of metadata field names or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + __Example__: + + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + :param index: Optional index name to use for the query. If not provided, the default index name is used. + :param only_documents_without_embedding: If set to `True`, only documents without embeddings are counted. 
+ :param headers: PineconeDocumentStore does not support headers. + :param namespace: Optional namespace to count documents from. If not specified, None is default. + :param type_metadata: Optional value for `doc_type` metadata to reference documents that need to be counted. + Parameter options: + - `"vector"`: Documents with embedding. + - `"no-vector"`: Documents without embedding (dummy embedding only). + - `"label"`: Labels. + """ + if headers: + raise NotImplementedError("PineconeDocumentStore does not support headers.") + + index = self._index(index) + self._index_connection_exists(index) + + filters = filters or {} + if not type_metadata: + # add filter for `doc_type` metadata related to documents without embeddings + filters = self._add_type_metadata_filter(filters, type_value=DOCUMENT_WITHOUT_EMBEDDING) # type: ignore + if not only_documents_without_embedding: + # add filter for `doc_type` metadata related to documents with embeddings + filters = self._add_type_metadata_filter(filters, type_value=DOCUMENT_WITH_EMBEDDING) # type: ignore + else: + # if value for `doc_type` metadata is specified, add filter with given value + filters = self._add_type_metadata_filter(filters, type_value=type_metadata) + + pinecone_syntax_filter = LogicalFilterClause.parse(filters).convert_to_pinecone() if filters else None + return self._get_vector_count(index, filters=pinecone_syntax_filter, namespace=namespace) + + def get_embedding_count( + self, + filters: Optional[Dict[str, Any]] = None, + index: Optional[str] = None, + namespace: Optional[str] = None, + ) -> int: + """ + Return the count of embeddings in the document store. + + :param index: Optional index name to retrieve all documents from. + :param filters: Filters are not supported for `get_embedding_count` in Pinecone. + :param namespace: Optional namespace to count embeddings from. If not specified, None is default. + """ + if filters: + raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") + + index = self._index(index) + self._index_connection_exists(index) + + pinecone_filters = self._meta_for_pinecone({TYPE_METADATA_FIELD: DOCUMENT_WITH_EMBEDDING}) + return self._get_vector_count(index, filters=pinecone_filters, namespace=namespace) + + def _meta_for_pinecone(self, meta: Dict[str, Any], parent_key: str = "", labels: bool = False) -> Dict[str, Any]: + """ + Converts the meta dictionary to a format that can be stored in Pinecone. + :param meta: Metadata dictionary to be converted. + :param parent_key: Optional, used for recursive calls to keep track of parent keys, for example: + ``` + {"parent1": {"parent2": {"child": "value"}}} + ``` + On the second recursive call, parent_key would be "parent1", and the final key would be "parent1.parent2.child". + :param labels: Optional, used to indicate whether the metadata is being stored as a label or not. If True the + the flattening of dictionaries is not required. 
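+        Example (illustrative): with `labels=False`, `{"a": {"b": 1}, "c": None}` is flattened
+        to `{"a.b": 1, "c": ""}`.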
+ """ + items: list = [] + if labels: + # Replace any None values with empty strings + for key, value in meta.items(): + if value is None: + meta[key] = "" + else: + # Explode dict of dicts into single flattened dict + for key, value in meta.items(): + # Replace any None values with empty strings + if value is None: + value = "" + if key == "_split_overlap": + value = json.dumps(value) + # format key + new_key = f"{parent_key}.{key}" if parent_key else key + # if value is dict, expand + if isinstance(value, dict): + items.extend(self._meta_for_pinecone(value, parent_key=new_key).items()) + else: + items.append((new_key, value)) + # Create new flattened dictionary + meta = dict(items) + return meta + + def _pinecone_meta_format(self, meta: Dict[str, Any], labels: bool = False) -> Dict[str, Any]: + """ + Converts the meta extracted from Pinecone into a better format for Python. + :param meta: Metadata dictionary to be converted. + :param labels: Optional, used to indicate whether the metadata is being stored as a label or not. If True the + the flattening of dictionaries is not required. + """ + new_meta: Dict[str, Any] = {} + + if labels: + # Replace any empty strings with None values + for key, value in meta.items(): + if value == "": + meta[key] = None + return meta + else: + for key, value in meta.items(): + # Replace any empty strings with None values + if value == "": + value = None + if "." in key: + # We must split into nested dictionary + keys = key.split(".") + # Iterate through each dictionary level + for i in range(len(keys)): + path = keys[: i + 1] + # Check if path exists + try: + _get_by_path(new_meta, path) + except KeyError: + # Create path + if i == len(keys) - 1: + _set_by_path(new_meta, path, value) + else: + _set_by_path(new_meta, path, {}) + else: + new_meta[key] = value + return new_meta + + def _validate_index_sync(self, index: Optional[str] = None): + """ + This check ensures the correct number of documents with embeddings and embeddings are found in the + Pinecone database. + """ + if self.get_document_count( + index=index, type_metadata=DOCUMENT_WITH_EMBEDDING # type: ignore + ) != self.get_embedding_count(index=index): + raise PineconeDocumentStoreError( + f"The number of documents present in Pinecone ({self.get_document_count(index=index)}) " + "does not match the number of embeddings in Pinecone " + f" ({self.get_embedding_count(index=index)}). This can happen if a document store " + "instance is deleted during write operations. Call " + "the `update_documents` method to fix it." + ) + + def count_documents(self) -> int: + """ + Returns how many documents are present in the document store. + """ + count = self.index_stats["namespaces"][""]["vector_count"] if self.index_stats["namespaces"].get("") else 0 + return count + + def write_documents( + self, + documents: List[Document], + policy: DuplicatePolicy = "fail", + ) -> None: + """ + Writes (or overwrites) documents into the store. + + :param documents: a list of documents. + :param policy: documents with the same ID count as duplicates. When duplicates are met, + the store can: + - skip: keep the existing document and ignore the new one. + - overwrite: remove the old document and write the new one. 
+ - fail: an error is raised + :raises DuplicateDocumentError: Exception trigger on duplicate document if `policy=DuplicatePolicy.FAIL` + :return: None + """ + if not isinstance(documents, list): + msg = "Documents must be a list" + raise ValueError(msg) + + index = self._index(self.index) + index_connection = self._index_connection_exists(index, create=True) + if index_connection: + self.pinecone_indexes[index] = index_connection + + duplicate_documents = policy or self.duplicate_documents + policy_options = ["skip", "overwrite", "fail"] + assert ( + duplicate_documents in policy_options + ), f"duplicate_documents parameter must be {', '.join(policy_options)}" + + add_vectors = documents[0].embedding is not None + type_metadata = DOCUMENT_WITH_EMBEDDING if add_vectors else DOCUMENT_WITHOUT_EMBEDDING + + if not add_vectors: + # To store documents in Pinecone, we use dummy embeddings (to be replaced with real embeddings later) + embeddings_to_index = np.zeros((self.batch_size, self.embedding_dim), dtype="float32") + # Convert embeddings to list objects + embeddings = [embed.tolist() if embed is not None else None for embed in embeddings_to_index] + + with tqdm( + total=len(documents), + disable=not self.progress_bar, + position=0, + desc="Writing Documents", + ) as progress_bar: + for i in range(0, len(documents), self.batch_size): + document_batch = documents[i : i + self.batch_size] + ids = [doc.id for doc in document_batch] + # If duplicate_documents set to `skip` or `fail`, we need to check for existing documents + if duplicate_documents in ["skip", "fail"]: + existing_documents = self.get_documents_by_id( + ids=ids, + index=index, + namespace=self.namespace, + include_type_metadata=True, + ) + # First check for documents in current batch that exist in the index + if existing_documents: + if duplicate_documents == "skip": + # If we should skip existing documents, we drop the ids that already exist + skip_ids = [doc.id for doc in existing_documents] + # We need to drop the affected document objects from the batch + document_batch = [doc for doc in document_batch if doc.id not in skip_ids] + # Now rebuild the ID list + ids = [doc.id for doc in document_batch] + progress_bar.update(len(skip_ids)) + elif duplicate_documents == "fail": + # Otherwise, we raise an error + raise DuplicateDocumentError( + f"Document ID {existing_documents[0].id} already exists in index {index}" + ) + # Now check for duplicate documents within the batch itself + if len(ids) != len(set(ids)): + if duplicate_documents == "skip": + # We just keep the first instance of each duplicate document + ids = [] + temp_document_batch = [] + for doc in document_batch: + if doc.id not in ids: + ids.append(doc.id) + temp_document_batch.append(doc) + document_batch = temp_document_batch + elif duplicate_documents == "fail": + # Otherwise, we raise an error + raise DuplicateDocumentError(f"Duplicate document IDs found in batch: {ids}") + metadata = [ + self._meta_for_pinecone( + { + TYPE_METADATA_FIELD: type_metadata, # add `doc_type` in metadata + "text": doc.text, + "content_type": doc.metadata, + } + ) + for doc in documents[i : i + self.batch_size] + ] + if add_vectors: + embeddings = [doc.embedding for doc in documents[i : i + self.batch_size]] + embeddings_to_index = np.array(embeddings, dtype="float32") + + # Convert embeddings to list objects + embeddings = [embed.tolist() if embed is not None else None for embed in embeddings_to_index] + data_to_write_to_pinecone = zip(ids, embeddings, metadata) + # Metadata fields and 
embeddings are stored in Pinecone + self.pinecone_indexes[index].upsert(vectors=data_to_write_to_pinecone, namespace=self.namespace) + # Add IDs to ID list + self._add_local_ids(index, ids) + progress_bar.update(self.batch_size) + progress_bar.close() + + def _limit_check(self, top_k: int, include_values: Optional[bool] = None): + """ + Confirms the top_k value does not exceed Pinecone vector database limits. + """ + if include_values: + if top_k > self.top_k_limit_vectors: + raise PineconeDocumentStoreError( + f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records " + f"when returning embedding values. This request is attempting to return {top_k} records." + ) + else: + if top_k > self.top_k_limit: + raise PineconeDocumentStoreError( + f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. " + f"This request is attempting to return {top_k} records." + ) + + def query_by_embedding( + self, + query_embedding: List[float], + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + scale_score: bool = True, + return_embedding: Optional[bool] = None, + ) -> List[Document]: + """ + Find the document that is most similar to the provided `query_embedding` by using a vector similarity metric. + + :param query_embedding: Embedding of the query. + :param filters: A dictionary with filters to narrow down the search space. + :param top_k: The maximum number of documents to return. + :param scale_score: Whether to scale the scores of the retrieved documents or not. + :param return_embedding: Whether to return the embedding of the retrieved Documents. + :return: The retrieved documents. + """ + if return_embedding is None: + return_embedding = self.return_embedding + + self._limit_check(top_k, include_values=return_embedding) + + index = self._index(self.index) + self._index_connection_exists(index) + + type_metadata = DOCUMENT_WITH_EMBEDDING # type: ignore + + filters = filters or {} + filters = self._add_type_metadata_filter(filters, type_metadata) + + pinecone_syntax_filter = LogicalFilterClause.parse(filters).convert_to_pinecone() if filters else None + + res = self.pinecone_indexes[index].query( + query_embedding, + namespace=self.namespace, + top_k=top_k, + include_values=return_embedding, + include_metadata=True, + filter=pinecone_syntax_filter, + ) + + score_matrix = [] + vector_id_matrix = [] + meta_matrix = [] + embedding_matrix = [] + for match in res["matches"]: + score_matrix.append(match["score"]) + vector_id_matrix.append(match["id"]) + meta_matrix.append(match["metadata"]) + if return_embedding: + embedding_matrix.append(match["values"]) + if return_embedding: + values = embedding_matrix + else: + values = None + documents = self._get_documents_by_meta( + vector_id_matrix, + meta_matrix, + values=values, + index=index, + return_embedding=return_embedding, + ) + + # assign query score to each document + scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)} + return_documents = [] + for doc in documents: + score = scores_for_vector_ids[doc.id] + if scale_score: + if self.similarity == "cosine": + score = (score + 1) / 2 + else: + score = float(1 / (1 + np.exp(-score / 100))) + doc.score = score + return_document = copy.copy(doc) + return_documents.append(return_document) + + return return_documents + + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + """ + Returns the documents that match the filters provided. 
+ + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, + `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, + `"$lte"`) or a metadata field name. + + Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata + field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or + (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default + operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used + as default operation. + + Example: + + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators can take a list of + dictionaries as value. + + Example: + + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` + + :param filters: the filters to apply to the document list. + :return: a list of Documents that match the given filters. + """ + docs = self.query_by_embedding( + query_embedding=self.dummy_query, + filters=filters, + top_k=10, + scale_score=True, + return_embedding=True, + ) + + return docs + + def _attach_embedding_to_document(self, document: Document, index: str): + """ + Fetches the Document's embedding from the specified Pinecone index and attaches it to the Document's + embedding field. 
+ """ + result = self.pinecone_indexes[index].fetch(ids=[document.id]) + if result["vectors"].get(document.id, False): + embedding = result["vectors"][document.id].get("values", None) + document.embedding = np.asarray(embedding, dtype=np.float32) + + def _get_documents_by_meta( + self, + ids: List[str], + metadata: List[dict], + values: Optional[List[List[float]]] = None, + index: Optional[str] = None, + return_embedding: Optional[bool] = None, + ) -> List[Document]: + if return_embedding is None: + return_embedding = self.return_embedding + + index = self._index(index) + + # extract ID, content, and metadata to create Documents + documents = [] + for _id, meta in zip(ids, metadata): + content = meta.pop("content") + content_type = meta.pop("content_type") + if "_split_overlap" in meta: + meta["_split_overlap"] = json.loads(meta["_split_overlap"]) + doc = Document(id=_id, content=content, content_type=content_type, meta=meta) + documents.append(doc) + if return_embedding: + if values is None: + # If no embedding values are provided, we must request the embeddings from Pinecone + for doc in documents: + self._attach_embedding_to_document(document=doc, index=index) + else: + # If embedding values are given, we just add + for doc, embedding in zip(documents, values): + doc.embedding = np.asarray(embedding, dtype=np.float32) + + return documents + + def get_documents_by_id( + self, + ids: List[str], + index: Optional[str] = None, + batch_size: int = 100, + return_embedding: Optional[bool] = None, + namespace: Optional[str] = None, + include_type_metadata: Optional[bool] = False, + ) -> List[Document]: + """ + Retrieves all documents in the index using their IDs. + + :param ids: List of IDs to retrieve. + :param index: Optional index name to retrieve all documents from. + :param batch_size: Number of documents to retrieve at a time. When working with large number of documents, + batching can help reduce memory footprint. + :param headers: Pinecone does not support headers. + :param return_embedding: Optional flag to return the embedding of the document. + :param namespace: Optional namespace to retrieve document from. If not specified, None is default. + :param include_type_metadata: Indicates if `doc_type` value will be included in document metadata or not. + If not specified, `doc_type` field will be dropped from document metadata. 
+ """ + + if return_embedding is None: + return_embedding = self.return_embedding + + index = self._index(index) + self._index_connection_exists(index) + + documents = [] + for i in range(0, len(ids), batch_size): + i_end = min(len(ids), i + batch_size) + id_batch = ids[i:i_end] + result = self.pinecone_indexes[index].fetch(ids=id_batch, namespace=namespace) + + vector_id_matrix = [] + meta_matrix = [] + embedding_matrix = [] + for _id in result["vectors"]: + vector_id_matrix.append(_id) + metadata = result["vectors"][_id]["metadata"] + if not include_type_metadata and TYPE_METADATA_FIELD in metadata: + metadata.pop(TYPE_METADATA_FIELD) + meta_matrix.append(self._pinecone_meta_format(metadata)) + if return_embedding: + embedding_matrix.append(result["vectors"][_id]["values"]) + if return_embedding: + values = embedding_matrix + else: + values = None + document_batch = self._get_documents_by_meta( + vector_id_matrix, + meta_matrix, + values=values, + index=index, + return_embedding=return_embedding, + ) + documents.extend(document_batch) + + return documents + + def delete_documents(self, document_ids: List[str]) -> None: + """ + Deletes all documents with a matching document_ids from the document store. + Fails with `MissingDocumentError` if no document with this id is present in the store. + + :param document_ids: the document_ids to delete + """ + for doc_id in document_ids: + msg = f"ID '{doc_id}' not found, cannot delete it." + document_ids.remove(doc_id) + raise MissingDocumentError(msg) + + index = self._index(self.index) + self._index_connection_exists(index) + + if index not in self.all_ids: + self.all_ids[index] = set() + if document_ids is None: + # If no IDs we delete everything + self.pinecone_indexes[index].delete(delete_all=True, namespace=self.namespace) + id_values = list(self.all_ids[index]) + else: + id_values = document_ids + self.pinecone_indexes[index].delete(ids=document_ids, namespace=self.namespace) + + # Remove deleted ids from all_ids + self.all_ids[index] = self.all_ids[index].difference(set(id_values)) + + def delete_index(self, index: Optional[str]): + """ + Delete an existing index. The index including all data will be removed. + + :param index: The name of the index to delete. 
+ :return: None + """ + index = self._index(index) + + if index in pinecone.list_indexes(): + pinecone.delete_index(index) + logger.info("Index '%s' deleted.", index) + if index in self.pinecone_indexes: + del self.pinecone_indexes[index] + if index in self.all_ids: + self.all_ids[index] = set() diff --git a/document_stores/pinecone/src/pinecone_haystack/errors.py b/document_stores/pinecone/src/pinecone_haystack/errors.py new file mode 100644 index 000000000..08d71b35a --- /dev/null +++ b/document_stores/pinecone/src/pinecone_haystack/errors.py @@ -0,0 +1,10 @@ +from haystack.preview.document_stores.errors import DocumentStoreError +from haystack.preview.errors import FilterError + + +class PineconeDocumentStoreError(DocumentStoreError): + pass + + +class PineconeDocumentStoreFilterError(FilterError): + pass diff --git a/document_stores/pinecone/src/pinecone_haystack/filter_utils.py b/document_stores/pinecone/src/pinecone_haystack/filter_utils.py new file mode 100644 index 000000000..bcba26784 --- /dev/null +++ b/document_stores/pinecone/src/pinecone_haystack/filter_utils.py @@ -0,0 +1,435 @@ +import logging +from typing import Union, List, Dict +from abc import ABC, abstractmethod +from collections import defaultdict +from haystack.errors import FilterError +from pinecone_haystack.errors import PineconeDocumentStoreFilterError + +logger = logging.getLogger(__file__) + + + +def nested_defaultdict() -> defaultdict: + """ + Data structure that recursively adds a dictionary as value if a key does not exist. Advantage: In nested dictionary + structures, we don't need to check if a key already exists (which can become hard to maintain in nested dictionaries + with many levels) but access the existing value if a key exists and create an empty dictionary if a key does not + exist. + """ + return defaultdict(nested_defaultdict) + + +class LogicalFilterClause(ABC): + """ + Class that is able to parse a filter and convert it to the format that the underlying databases of our + DocumentStores require. + + Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical + operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, `"$gte"`, `"$lt"`, + `"$lte"`) or a metadata field name. + Logical operator keys take a dictionary of metadata field names and/or logical operators as + value. Metadata field names take a dictionary of comparison operators as value. Comparison + operator keys take a single value or (in case of `"$in"`) a list of values as value. + If no logical operator is provided, `"$and"` is used as default operation. If no comparison + operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default + operation. + Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take optionally a list of + dictionaries as value. 
+ + Example: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` + + """ + + def __init__(self, conditions: List[Union["LogicalFilterClause", "ComparisonOperation"]]): + self.conditions = conditions + + @abstractmethod + def evaluate(self, fields) -> bool: + pass + + @classmethod + def parse(cls, filter_term: Union[dict, List[dict]]) -> Union["LogicalFilterClause", "ComparisonOperation"]: + """ + Parses a filter dictionary/list and returns a LogicalFilterClause instance. + + :param filter_term: Dictionary or list that contains the filter definition. + """ + conditions: List[Union[LogicalFilterClause, ComparisonOperation]] = [] + + if isinstance(filter_term, dict): + filter_term = [filter_term] + for item in filter_term: + for key, value in item.items(): + if key == "$not": + conditions.append(NotOperation.parse(value)) + elif key == "$and": + conditions.append(AndOperation.parse(value)) + elif key == "$or": + conditions.append(OrOperation.parse(value)) + # Key needs to be a metadata field + else: + conditions.extend(ComparisonOperation.parse(key, value)) + + if cls == LogicalFilterClause: + if len(conditions) == 1: + return conditions[0] + else: + return AndOperation(conditions) + else: + return cls(conditions) + + def convert_to_pinecone(self): + """ + Converts the LogicalFilterClause instance to a Pinecone filter. + """ + pass + + + +class ComparisonOperation(ABC): + def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]): + self.field_name = field_name + self.comparison_value = comparison_value + + @abstractmethod + def evaluate(self, fields) -> bool: + pass + + @classmethod + def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> List["ComparisonOperation"]: + comparison_operations: List[ComparisonOperation] = [] + + if isinstance(comparison_clause, dict): + for comparison_operation, comparison_value in comparison_clause.items(): + if comparison_operation == "$eq": + comparison_operations.append(EqOperation(field_name, comparison_value)) + elif comparison_operation == "$in": + comparison_operations.append(InOperation(field_name, comparison_value)) + elif comparison_operation == "$ne": + comparison_operations.append(NeOperation(field_name, comparison_value)) + elif comparison_operation == "$nin": + comparison_operations.append(NinOperation(field_name, comparison_value)) + elif comparison_operation == "$gt": + comparison_operations.append(GtOperation(field_name, comparison_value)) + elif comparison_operation == "$gte": + comparison_operations.append(GteOperation(field_name, comparison_value)) + elif comparison_operation == "$lt": + comparison_operations.append(LtOperation(field_name, comparison_value)) + elif comparison_operation == "$lte": + comparison_operations.append(LteOperation(field_name, comparison_value)) + + # No comparison operator is given, so we use the default operators "$in" if the comparison value is a list and + # "$eq" in every other case + elif isinstance(comparison_clause, list): + comparison_operations.append(InOperation(field_name, comparison_clause)) + else: + comparison_operations.append((EqOperation(field_name, comparison_clause))) + + return comparison_operations + + def convert_to_pinecone(self): + """ + Converts the ComparisonOperation instance to a Pinecone comparison operator. 
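+        Example (illustrative): `EqOperation("type", "article").convert_to_pinecone()` returns
+        `{"type": {"$eq": "article"}}`.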
+ """ + pass + + def invert(self) -> "ComparisonOperation": + """ + Inverts the ComparisonOperation. + Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore. + (https://github.com/semi-technologies/weaviate/issues/1717) + """ + pass + + + + +class NotOperation(LogicalFilterClause): + """ + Handles conversion of logical 'NOT' operations. + """ + + def evaluate(self, fields) -> bool: + return not any(condition.evaluate(fields) for condition in self.conditions) + + def convert_to_pinecone(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]: + conditions = [condition.invert().convert_to_pinecone() for condition in self.conditions] + if len(conditions) > 1: + # Conditions in self.conditions are by default combined with AND which becomes OR according to DeMorgan + return {"$or": conditions} + else: + return conditions[0] + + def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]: + # This method is called when a "$not" operation is embedded in another "$not" operation. Therefore, we don't + # invert the operations here, as two "$not" operation annihilate each other. + # (If we have more than one condition, we return an AndOperation, the default logical operation for combining + # multiple conditions.) + if len(self.conditions) > 1: + return AndOperation(self.conditions) + else: + return self.conditions[0] + + +class AndOperation(LogicalFilterClause): + """ + Handles conversion of logical 'AND' operations. + """ + + def evaluate(self, fields) -> bool: + return all(condition.evaluate(fields) for condition in self.conditions) + + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: + conditions = [condition.convert_to_pinecone() for condition in self.conditions] + return {"$and": conditions} + + def invert(self) -> "OrOperation": + return OrOperation([condition.invert() for condition in self.conditions]) + + +class OrOperation(LogicalFilterClause): + """ + Handles conversion of logical 'OR' operations. + """ + + def evaluate(self, fields) -> bool: + return any(condition.evaluate(fields) for condition in self.conditions) + + def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: + conditions = [condition.convert_to_pinecone() for condition in self.conditions] + return {"$or": conditions} + + def invert(self) -> AndOperation: + return AndOperation([condition.invert() for condition in self.conditions]) + + +class EqOperation(ComparisonOperation): + """ + Handles conversion of the '$eq' comparison operation. + """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + return fields[self.field_name] == self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[List[str], str, int, float, bool]]]: + return {self.field_name: {"$eq": self.comparison_value}} + + def invert(self) -> "NeOperation": + return NeOperation(self.field_name, self.comparison_value) + + +class InOperation(ComparisonOperation): + """ + Handles conversion of the '$in' comparison operation. 
+ """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + + if not isinstance(self.comparison_value, list): + raise PineconeDocumentStoreFilterError("'$in' operation requires comparison value to be a list.") + + # If the document field is a list, check if any of its values are in the comparison value + if isinstance(fields[self.field_name], list): + return any(field in self.comparison_value for field in fields[self.field_name]) + + return fields[self.field_name] in self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: + if not isinstance(self.comparison_value, list): + raise PineconeDocumentStoreFilterError("'$in' operation requires comparison value to be a list.") + return {self.field_name: {"$in": self.comparison_value}} + + def invert(self) -> "NinOperation": + return NinOperation(self.field_name, self.comparison_value) + + +class NeOperation(ComparisonOperation): + """ + Handles conversion of the '$ne' comparison operation. + """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + return fields[self.field_name] != self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[List[str], str, int, float, bool]]]: + return {self.field_name: {"$ne": self.comparison_value}} + + def invert(self) -> "EqOperation": + return EqOperation(self.field_name, self.comparison_value) + + +class NinOperation(ComparisonOperation): + """ + Handles conversion of the '$nin' comparison operation. + """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return True + + if not isinstance(self.comparison_value, list): + raise PineconeDocumentStoreFilterError("'$nin' operation requires comparison value to be a list.") + + # If the document field is a list, check if any of its values are in the comparison value + if isinstance(fields[self.field_name], list): + return not any(field in self.comparison_value for field in fields[self.field_name]) + + return fields[self.field_name] not in self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: + if not isinstance(self.comparison_value, list): + raise PineconeDocumentStoreFilterError("'$in' operation requires comparison value to be a list.") + return {self.field_name: {"$nin": self.comparison_value}} + + def invert(self) -> "InOperation": + return InOperation(self.field_name, self.comparison_value) + + +class GtOperation(ComparisonOperation): + """ + Handles conversion of the '$gt' comparison operation. + """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + + # If the document field is a list, check if any of its values are greater than the comparison value + if isinstance(fields[self.field_name], list): + return any(field > self.comparison_value for field in fields[self.field_name]) + + return fields[self.field_name] > self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: + if not isinstance(self.comparison_value, (float, int)): + raise PineconeDocumentStoreFilterError("Comparison value for '$gt' operation must be a float or int.") + return {self.field_name: {"$gt": self.comparison_value}} + + def invert(self) -> "LteOperation": + return LteOperation(self.field_name, self.comparison_value) + + +class GteOperation(ComparisonOperation): + """ + Handles conversion of the '$gte' comparison operation. 
+ """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + + # If the document field is a list, check if any of its values are greater than or equal to the comparison value + if isinstance(fields[self.field_name], list): + return any(field >= self.comparison_value for field in fields[self.field_name]) + + return fields[self.field_name] >= self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: + if not isinstance(self.comparison_value, (float, int)): + raise PineconeDocumentStoreFilterError("Comparison value for '$gte' operation must be a float or int.") + return {self.field_name: {"$gte": self.comparison_value}} + + def invert(self) -> "LtOperation": + return LtOperation(self.field_name, self.comparison_value) + + +class LtOperation(ComparisonOperation): + """ + Handles conversion of the '$lt' comparison operation. + """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + + # If the document field is a list, check if any of its values are less than the comparison value + if isinstance(fields[self.field_name], list): + return any(field < self.comparison_value for field in fields[self.field_name]) + + return fields[self.field_name] < self.comparison_value + + + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: + if not isinstance(self.comparison_value, (float, int)): + raise PineconeDocumentStoreFilterError("Comparison value for '$lt' operation must be a float or int.") + return {self.field_name: {"$lt": self.comparison_value}} + + def invert(self) -> "GteOperation": + return GteOperation(self.field_name, self.comparison_value) + + +class LteOperation(ComparisonOperation): + """ + Handles conversion of the '$lte' comparison operation. + """ + + def evaluate(self, fields) -> bool: + if self.field_name not in fields: + return False + + # If the document field is a list, check if any of its values are less than or equal to the comparison value + if isinstance(fields[self.field_name], list): + return any(field <= self.comparison_value for field in fields[self.field_name]) + + return fields[self.field_name] <= self.comparison_value + + def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: + if not isinstance(self.comparison_value, (float, int)): + raise PineconeDocumentStoreFilterError("Comparison value for '$lte' operation must be a float or int.") + return {self.field_name: {"$lte": self.comparison_value}} + + def invert(self) -> "GtOperation": + return GtOperation(self.field_name, self.comparison_value) diff --git a/document_stores/pinecone/src/pinecone_haystack/retriever.py b/document_stores/pinecone/src/pinecone_haystack/retriever.py new file mode 100644 index 000000000..5f811980f --- /dev/null +++ b/document_stores/pinecone/src/pinecone_haystack/retriever.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: 2023-present John Doe +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Dict, List, Any, Optional + +from haystack.preview import ( + component, + Document, + default_to_dict, + default_from_dict, + DeserializationError, +) +from haystack.preview.dataclasses import Document +from pinecone_haystack.document_store import PineconeDocumentStore + + +@component +class PineconeRetriever: + """ + A component for retrieving documents from an PineconeDocumentStore using a vector similarity metric. + + Needs to be connected to a PineconeDocumentStore to run. 
+    """
+
+    def __init__(
+        self,
+        document_store: PineconeDocumentStore,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10,
+        scale_score: bool = True,
+        return_embedding: bool = False,
+    ):
+        """
+        Create a PineconeRetriever component.
+
+        :param document_store: An instance of PineconeDocumentStore.
+        :param filters: A dictionary with filters to narrow down the search space. Default is None.
+        :param top_k: The maximum number of documents to retrieve. Default is 10.
+        :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True.
+        :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is False.
+
+        :raises ValueError: If the specified top_k is not > 0.
+        """
+        if not isinstance(document_store, PineconeDocumentStore):
+            raise ValueError("document_store must be an instance of PineconeDocumentStore")
+
+        self.document_store = document_store
+
+        if top_k <= 0:
+            raise ValueError(f"top_k must be > 0, but got {top_k}")
+
+        self.filters = filters
+        self.top_k = top_k
+        self.scale_score = scale_score
+        self.return_embedding = return_embedding
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+        """
+        docstore = self.document_store.to_dict()
+        return default_to_dict(
+            self,
+            document_store=docstore,
+            filters=self.filters,
+            top_k=self.top_k,
+            scale_score=self.scale_score,
+            return_embedding=self.return_embedding,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "PineconeRetriever":
+        """
+        Deserialize this component from a dictionary.
+        """
+        # The registry of DocumentStore classes lives on the @document_store decorator;
+        # import it here so the type lookup below resolves.
+        from haystack.preview.document_stores.decorator import document_store
+
+        init_params = data.get("init_parameters", {})
+        if "document_store" not in init_params:
+            raise DeserializationError("Missing 'document_store' in serialization data")
+        if "type" not in init_params["document_store"]:
+            raise DeserializationError("Missing 'type' in document store's serialization data")
+        if init_params["document_store"]["type"] not in document_store.registry:
+            raise DeserializationError(f"DocumentStore type '{init_params['document_store']['type']}' not found")
+
+        docstore_class = document_store.registry[init_params["document_store"]["type"]]
+        docstore = docstore_class.from_dict(init_params["document_store"])
+        data["init_parameters"]["document_store"] = docstore
+        return default_from_dict(cls, data)
+
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        query_embedding: List[float],
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: Optional[int] = None,
+        scale_score: Optional[bool] = None,
+        return_embedding: Optional[bool] = None,
+    ):
+        """
+        Run the PineconeRetriever on the given input data.
+
+        :param query_embedding: Embedding of the query.
+        :param filters: A dictionary with filters to narrow down the search space.
+        :param top_k: The maximum number of documents to return.
+        :param scale_score: Whether to scale the scores of the retrieved documents or not.
+        :param return_embedding: Whether to return the embedding of the retrieved Documents.
+        :return: The retrieved documents.
+
+        :raises ValueError: If the specified DocumentStore is not found or is not a PineconeDocumentStore instance.
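+
+        Example (illustrative): ``run(query_embedding=[0.1] * 768, filters={"format": {"$eq": "headline"}}, top_k=5)``
+        returns at most five matching documents under the ``"documents"`` key.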
+ """ + if filters is None: + filters = self.filters + if top_k is None: + top_k = self.top_k + if scale_score is None: + scale_score = self.scale_score + if return_embedding is None: + return_embedding = self.return_embedding + + docs = self.document_store.query_by_embedding( + query_embedding=query_embedding, + filters=filters, + top_k=top_k, + scale_score=scale_score, + return_embedding=return_embedding, + ) + + return {"documents": docs} diff --git a/document_stores/pinecone/tests/__init__.py b/document_stores/pinecone/tests/__init__.py new file mode 100644 index 000000000..7eda7517e --- /dev/null +++ b/document_stores/pinecone/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present John Doe +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/document_stores/pinecone/tests/pinecone_mock.py b/document_stores/pinecone/tests/pinecone_mock.py new file mode 100644 index 000000000..882531acf --- /dev/null +++ b/document_stores/pinecone/tests/pinecone_mock.py @@ -0,0 +1,331 @@ +from typing import Optional, List, Union, Dict, Any + +import logging + + +logger = logging.getLogger(__name__) + + +# Mock Pinecone instance +CONFIG: dict = {"api_key": None, "environment": None, "indexes": {}} + + +# Mock Pinecone Index instance +class IndexObject: + def __init__( + self, + index: str, + api_key: Optional[str] = None, + environment: Optional[str] = None, + dimension: Optional[int] = None, + metric: Optional[str] = None, + replicas: Optional[int] = None, + shards: Optional[int] = None, + metadata_config: Optional[dict] = None, + ): + self.index = index + self.api_key = api_key + self.environment = environment + self.dimension = dimension + self.metric = metric + self.replicas = replicas + self.shards = shards + self.metadata_config = metadata_config + self.namespaces: dict = {} + + +# Mock the Pinecone Index class +class Index: + def __init__(self, index: str): + self.index = index + self.index_config = CONFIG["indexes"][index] + + def upsert(self, vectors: List[tuple], namespace: str = ""): + if namespace not in self.index_config.namespaces: + self.index_config.namespaces[namespace] = {} + upsert_count = 0 + for record in vectors: + # Extract info from tuple + _id = record[0] + vector = record[1] + metadata = record[2] + # Checks + assert type(_id) is str + assert type(vector) is list + assert len(vector) == self.index_config.dimension + assert type(metadata) is dict + # Create record (eg document) + new_record: dict = {"id": _id, "values": vector, "metadata": metadata} + self.index_config.namespaces[namespace][_id] = new_record + upsert_count += 1 + return {"upserted_count": upsert_count} + + def update(self, namespace: str, id: str, set_metadata: dict): + # Get existing item metadata + meta = self.index_config.namespaces[namespace][id]["metadata"] + # Add new metadata to existing item metadata + self.index_config.namespaces[namespace][id]["metadata"] = {**meta, **set_metadata} + + def describe_index_stats(self, filter=None): + namespaces = {} + for namespace in self.index_config.namespaces.items(): + records = self.index_config.namespaces[namespace[0]] + if filter: + filtered_records = [] + for record in records.values(): + if self._filter(metadata=record["metadata"], filters=filter, top_level=True): + filtered_records.append(record) + records = filtered_records + namespaces[namespace[0]] = {"vector_count": len(records)} + return {"dimension": self.index_config.dimension, "index_fullness": 0.0, "namespaces": namespaces} + + def query( + self, + vector: List[float], + top_k: 
int, + namespace: str = "", + include_values: bool = False, + include_metadata: bool = False, + filter: Optional[dict] = None, + ): + return self.query_filter( + vector=vector, + top_k=top_k, + namespace=namespace, + include_values=include_values, + include_metadata=include_metadata, + filter=filter, + ) + + def query_filter( + self, + vector: List[float], + top_k: int, + namespace: str = "", + include_values: bool = False, + include_metadata: bool = False, + filter: Optional[dict] = None, + ): + assert len(vector) == self.index_config.dimension + response: dict = {"matches": []} + if namespace not in self.index_config.namespaces: + return response + else: + records = self.index_config.namespaces[namespace] + namespace_ids = list(records.keys())[:top_k] + + for _id in namespace_ids: + match = {"id": _id} + if include_values: + match["values"] = records[_id]["values"].copy() + if include_metadata: + match["metadata"] = records[_id]["metadata"].copy() + match["score"] = 0.0 + + if filter is None or ( + filter is not None and self._filter(records[_id]["metadata"], filter, top_level=True) + ): + # filter if needed + response["matches"].append(match) + return response + + def fetch(self, ids: List[str], namespace: str = ""): + response: dict = {"namespace": namespace, "vectors": {}} + if namespace not in self.index_config.namespaces: + # If we query an empty/non-existent namespace, Pinecone will just return an empty response + logger.warning("No namespace called '%s'", namespace) + return response + records = self.index_config.namespaces[namespace] + namespace_ids = records.keys() + for _id in namespace_ids: + if _id in ids.copy(): + response["vectors"][_id] = { + "id": _id, + "metadata": records[_id]["metadata"].copy(), + "values": records[_id]["values"].copy(), + } + return response + + def _filter( # noqa: C901,PLR0912 + self, + metadata: dict, + filters: Dict[str, Any] = None, + mode: Optional[str] = "$and", + top_level=False, + ) -> dict: + """ + Mock filtering function + """ + # This function has a very high McCabe cyclomatic complexity score of 38 + # (recommended is 10) and contains 55 branches (recommended is 12). 
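+        # Contract of the recursion: nested calls return {"$and": [...]} or {"$or": [...]}
+        # whose list holds booleans (and, for deeper nesting, further such dicts); only
+        # the call with top_level=True collapses everything into a single bool via all()/any().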
+ bools = [] + if type(filters) is list: + list_bools = [] + for _filter in filters: + res = self._filter(metadata, _filter, mode=mode) + for key, value in res.items(): + if key == "$and": + list_bools.append(all(value)) + else: + list_bools.append(any(value)) + if mode == "$and": + bools.append(all(list_bools)) + elif mode == "$or": + bools.append(any(list_bools)) + else: + for field, potential_value in filters.items(): + if field in ["$and", "$or"]: + bools.append(self._filter(metadata, potential_value, mode=field)) + mode = field + cond = field + else: + if type(potential_value) is dict: + sub_bool = [] + for cond, value in potential_value.items(): + if len(potential_value.keys()) > 1: + sub_filter = {field: {cond: value}} + bools.append(self._filter(metadata, sub_filter)) + if len(sub_bool) > 1: + if field == "$or": + bools.append(any(sub_bool)) + else: + bools.append(all(sub_bool)) + elif type(potential_value) is list: + cond = "$in" + value = potential_value + else: + cond = "$eq" + value = potential_value + # main chunk of condition checks + if cond == "$eq": + if field in metadata and metadata[field] == value: + bools.append(True) + else: + bools.append(False) + elif cond == "$ne": + if field in metadata and metadata[field] != value: + bools.append(True) + else: + bools.append(False) + elif cond == "$in": + if field in metadata and metadata[field] in value: + bools.append(True) + else: + bools.append(False) + elif cond == "$nin": + if field in metadata and metadata[field] not in value: + bools.append(True) + else: + bools.append(False) + elif cond == "$gt": + if field in metadata and metadata[field] > value: + bools.append(True) + else: + bools.append(False) + elif cond == "$lt": + if field in metadata and metadata[field] < value: + bools.append(True) + else: + bools.append(False) + elif cond == "$gte": + if field in metadata and metadata[field] >= value: + bools.append(True) + else: + bools.append(False) + elif cond == "$lte": + if field in metadata and metadata[field] <= value: + bools.append(True) + else: + bools.append(False) + if top_level: + final = [] + for item in bools: + if type(item) is dict: + for key, value in item.items(): + if key == "$and": + final.append(all(value)) + else: + final.append(any(value)) + else: + final.append(item) + if mode == "$and": + bools = all(final) + else: + bools = any(final) + else: + if mode == "$and": + return {"$and": bools} + else: + return {"$or": bools} + return bools + + def delete( + self, + ids: Optional[List[str]] = None, + namespace: str = "", + filters: Dict[str, Any] = None, + delete_all: bool = False, + ): + if filters: + # Get a filtered list of IDs + matches = self.query(filters=filters, namespace=namespace, include_values=False, include_metadata=False)[ + "vectors" + ] + filter_ids: List[str] = matches.keys() # .keys() returns an object that supports set operators already + elif delete_all: + self.index_config.namespaces[namespace] = {} + + if namespace not in self.index_config.namespaces: + pass + elif ids is not None: + id_list: List[str] = ids + if filters: + # We find the intersect between the IDs and filtered IDs + id_list = set(id_list).intersection(filter_ids) + records = self.index_config.namespaces[namespace] + for _id in list(records.keys()): # list() is needed to be able to del below + if _id in id_list: + del records[_id] + else: + # Delete all + self.index_config.namespaces[namespace] = {} + return {} + + def _get_config(self): + return self.index_config + + +# Mock core Pinecone client functions +def 
init(api_key: Optional[str] = None, environment: Optional[str] = None): + CONFIG["api_key"] = api_key + CONFIG["environment"] = environment + CONFIG["indexes"] = {} + + +def list_indexes(): + return list(CONFIG["indexes"].keys()) + + +def create_index( + name: str, + dimension: int, + metric: str = "cosine", + replicas: int = 1, + shards: int = 1, + metadata_config: Optional[dict] = None, +): + index_object = IndexObject( + api_key=CONFIG["api_key"], + environment=CONFIG["environment"], + index=name, + dimension=dimension, + metric=metric, + replicas=replicas, + shards=shards, + metadata_config=metadata_config, + ) + CONFIG["indexes"][name] = index_object + + +def delete_index(index: str): + del CONFIG["indexes"][index] diff --git a/document_stores/pinecone/tests/test_pinecone_document_store.py b/document_stores/pinecone/tests/test_pinecone_document_store.py new file mode 100644 index 000000000..4f279cd18 --- /dev/null +++ b/document_stores/pinecone/tests/test_pinecone_document_store.py @@ -0,0 +1,334 @@ +import os +from inspect import getmembers, isclass, isfunction +from typing import Any, Dict, List, Union +from unittest.mock import MagicMock +import numpy as np +import pytest + +from pinecone_haystack.document_store import PineconeDocumentStore +from pinecone_haystack.errors import ( + PineconeDocumentStoreError, + PineconeDocumentStoreFilterError, +) +from haystack.preview.dataclasses import Document +from haystack.preview.testing.document_store import DocumentStoreBaseTests +from tests import pinecone_mock +import pinecone + +class TestPineconeDocumentStore: + @pytest.fixture + def ds(self, monkeypatch, request) -> PineconeDocumentStore: + """ + This fixture provides an empty document store and takes care of cleaning up after each test + """ + + for fname, function in getmembers(pinecone_mock, isfunction): + monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) + for cname, class_ in getmembers(pinecone_mock, isclass): + monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False) + + return PineconeDocumentStore( + api_key=os.environ.get("PINECONE_API_KEY") or "pinecone-test-key", + embedding_dim=768, + embedding_field="embedding", + index="haystack_tests", + similarity="cosine", + recreate_index=True, + ) + + @pytest.fixture + def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStore: + """ + This fixture provides a pre-populated document store and takes care of cleaning up after each test + """ + documents = [Document( + text="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", + metadata={ + "target": "Lloyds", + "sentiment_score": -0.532, + "format": "headline", + }, + ), + Document( + text="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", + metadata={ + "target": "Glencore", + "sentiment_score": 0.037, + "format": "headline", + }, + ), + Document( + text="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. 
Shell challenges Exxon dominance with 47 billion-pound bid for BG", + metadata={ + "target": "Shell", + "sentiment_score": -0.345, + "format": "headline", + }, + ), + Document( + text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + metadata={ + "target": "TSLA", + "sentiment_score": 0.318, + "format": "post", + }, + ), + Document( + text="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", + metadata={ + "target": "HSBC", + "sentiment_score": 0.154, + "format": "post", + }, + ), + # Without meta + Document( + text="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." + ), + Document( + text="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." + ), + Document( + text="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." + ), + ] + ds.write_documents(documents) + return ds + + @pytest.fixture + def mocked_ds(self): + class DSMock(PineconeDocumentStore): + pass + + pinecone.init = MagicMock() + DSMock._create_index = MagicMock() + mocked_ds = DSMock(api_key="MOCK") + + return mocked_ds + + def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]: + return [ + # Document object + Document( + text="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", + metadata={ + "target": "Lloyds", + "sentiment_score": -0.532, + "format": "headline", + }, + ), + Document( + text="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", + metadata={ + "target": "Glencore", + "sentiment_score": 0.037, + "format": "headline", + }, + ), + Document( + text="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", + metadata={ + "target": "Shell", + "sentiment_score": -0.345, + "format": "headline", + }, + ), + Document( + text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + metadata={ + "target": "TSLA", + "sentiment_score": 0.318, + "format": "post", + }, + ), + Document( + text="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. 
Goldman Sachs, Barclays, HSBC downplay Brexit threat.", + metadata={ + "target": "HSBC", + "sentiment_score": 0.154, + "format": "post", + }, + ), + # Without meta + Document( + text="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." + ), + Document( + text="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." + ), + Document( + text="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." + ), + ] + + @pytest.mark.integration + def test_ne_filters(self, ds, documents): + ds.write_documents(documents) + + result = ds.get_filter_documents(filters={"format": {"$ne": "headline"}}) + assert len(result) == 2 + + @pytest.mark.integration + def test_filter_documents_with_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore): + + eq_docs = doc_store_with_docs.filter_documents(filters={"type": {"$eq": "article"}}) + normal_docs = doc_store_with_docs.filter_documents(filters={"type": "article"}) + assert eq_docs == normal_docs + + @pytest.mark.integration + def test_filter_documents_ids_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore): + retrieved_docs = doc_store_with_docs.filter_documents(filters={"target": {"$ne": "Glencore"}}) + assert all(d.meta.get("metadata", None) != "Glencore" for d in retrieved_docs) + + @pytest.mark.integration + def test_filter_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore): + retrieved_docs = doc_store_with_docs.filter_documents(filters={"format": {"$nin": ["target", "post"]}}) + assert {"target", "post"}.isdisjoint({d.metadata.get("metadata", None) for d in retrieved_docs}) + + @pytest.mark.integration + def test_filter_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore): + retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gt": 3.0}}) + assert all(d.metadata["sentiment_score"] > 3.0 for d in retrieved_docs) + + @pytest.mark.integration + def test_filter_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore): + retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gte": 3.0}}) + assert all(d.metadata["sentiment_score"] >= 3.0 for d in retrieved_docs) + + @pytest.mark.integration + def test_filter_documents_extended_filter_compound_and_other_field_simplified( + self, doc_store_with_docs: PineconeDocumentStore + ): + filters_simplified = { + "sentiment_score": {"$lte": 0.2, "$gte": 0.4}, + "target": ["Shell", "Glencore", "HSBC", "Lloyds", "TSLA"], + } + + with pytest.raises( + PineconeDocumentStoreFilterError, + match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", + ): + doc_store_with_docs.filter_documents(filters=filters_simplified) + + @pytest.mark.integration + def test_filter_documents_extended_filter_compound_and_or_explicit( + self, doc_store_with_docs: PineconeDocumentStore + ): + filters = { + "$and": { + "sentiment_score": {"$lte": 0.2, "$gte": 0.3}, + "target": { + "name": {"$in": ["HSBC", "Lloyds"]}, + "sentiment_score": {"$lte": 5.0}, + }, + } + } + + with pytest.raises( + PineconeDocumentStoreFilterError, + match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", + ): + 
doc_store_with_docs.filter_documents(filters=filters)
+
+    @pytest.mark.integration
+    def test_filter_documents_extended_filter_and_or_simplified(self, doc_store_with_docs: PineconeDocumentStore):
+        filters_simplified = {
+            "sentiment_score": {"$lte": 0.2, "$gte": 0.3},
+            "$or": {"format": ["headline", "post"], "sentiment_score": {"0.318"}},
+        }
+
+        with pytest.raises(
+            PineconeDocumentStoreFilterError,
+            match=r"Comparison value for '\$[l|g]te' operation must be a float or int.",
+        ):
+            doc_store_with_docs.filter_documents(filters=filters_simplified)
+
+    @pytest.mark.integration
+    def test_filter_documents_extended_filter_and_or_and_not_explicit(self, doc_store_with_docs: PineconeDocumentStore):
+        filters = {
+            "$and": {
+                "sentiment_score": {"$gte": 0.037},
+                "$or": {
+                    "target": {"$in": ["Lloyds", "Glencore", "HSBC", "TSLA", "Shell"]},
+                    "$and": {"format": {"$in": ["headline", "post"]}},
+                },
+            }
+        }
+        with pytest.raises(
+            PineconeDocumentStoreFilterError,
+            match=r"Comparison value for '\$[l|g]te' operation must be a float or int.",
+        ):
+            doc_store_with_docs.filter_documents(filters=filters)
+
+    @pytest.mark.integration
+    def test_filter_documents_extended_filter_and_or_and_not_simplified(
+        self, doc_store_with_docs: PineconeDocumentStore
+    ):
+        filters_simplified = {
+            "sentiment_score": {"$lte": "0.037"},
+            "$or": {
+                "target": ["Lloyds", "Glencore"],
+                "$and": {"format": {"$lte": "headline"}, "$not": {"format": "post"}},
+            },
+        }
+        with pytest.raises(
+            PineconeDocumentStoreFilterError,
+            match=r"Comparison value for '\$[l|g]te' operation must be a float or int.",
+        ):
+            doc_store_with_docs.filter_documents(filters=filters_simplified)
+
+    @pytest.mark.integration
+    def test_filter_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore):
+        # Test nested logical operations within "$not".
+        filters = {
+            "$not": {
+                "$or": {
+                    "$and": {"target": {"Lloyds"}},
+                    "$not": {"format": {"headline"}},
+                }
+            }
+        }
+        with pytest.raises(
+            PineconeDocumentStoreFilterError,
+            match=r"Comparison value for '\$[l|g]t' operation must be a float or int.",
+        ):
+            doc_store_with_docs.filter_documents(filters=filters)
+
+    @pytest.mark.integration
+    def test_filter_documents_extended_filter_compound_same_level_not(self, doc_store_with_docs: PineconeDocumentStore):
+        # Test same logical operator twice on the same level.
+        filters = {
+            "$or": [
+                {
+                    "$and": {
+                        "target": ["Lloyds", "Glencore", "TSLA", "Shell"],
+                        "format": {"$in": ["post"]},
+                    }
+                },
+                {
+                    "$and": {
+                        "target": ["Lloyds", "Glencore", "HSBC", "TSLA", "Shell"],
+                        "format": {"$in": ["headline"]},
+                    }
+                },
+            ]
+        }
+
+        with pytest.raises(
+            PineconeDocumentStoreFilterError,
+            match=r"Comparison value for '\$[l|g]te' operation must be a float or int.",
+        ):
+            doc_store_with_docs.filter_documents(filters=filters)
+
+    def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore):
+        """
+        We expect an embedding count of 1 because the documents already written to doc_store_with_docs contain no
+        embeddings.
+ """ + doc = Document( + text="Doc with embedding", + embedding=np.random.rand(768).astype(np.float32), + ) + doc_store_with_docs.write_documents([doc]) + assert doc_store_with_docs.get_embedding_count() == 1 + diff --git a/document_stores/pinecone/tests/test_retriever.py b/document_stores/pinecone/tests/test_retriever.py new file mode 100644 index 000000000..0f099ec1e --- /dev/null +++ b/document_stores/pinecone/tests/test_retriever.py @@ -0,0 +1,136 @@ +import os +from inspect import getmembers, isclass, isfunction +import pinecone +from typing import Any, Dict, List, Union +from unittest.mock import MagicMock +from unittest.mock import patch +import numpy as np +import pytest +from tests import pinecone_mock +from pinecone_haystack.document_store import PineconeDocumentStore +from pinecone_haystack.retriever import PineconeRetriever +from haystack.preview import ( + component, + Document, + default_to_dict, + default_from_dict, + DeserializationError, +) + +from haystack.preview.dataclasses import Document + +class TestPineconeDocumentStore: + @pytest.fixture + def ds(self, monkeypatch, request) -> PineconeDocumentStore: + """ + This fixture provides an empty document store and takes care of cleaning up after each test + """ + + for fname, function in getmembers(pinecone_mock, isfunction): + monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) + for cname, class_ in getmembers(pinecone_mock, isclass): + monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False) + + return PineconeDocumentStore( + api_key=os.environ.get("PINECONE_API_KEY") or "pinecone-test-key", + embedding_dim=768, + embedding_field="embedding", + index="haystack_tests", + similarity="cosine", + recreate_index=True, + ) + + @pytest.fixture + def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStore: + """ + This fixture provides a pre-populated document store and takes care of cleaning up after each test + """ + documents = [Document( + text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + metadata={ + "target": "Lloyds", + "sentiment_score": -0.532, + "format": "headline", + }), + ] + ds.write_documents(documents) + return ds + + @pytest.fixture + def mocked_ds(self): + class DSMock(PineconeDocumentStore): + pass + + pinecone.init = MagicMock() + DSMock._create_index = MagicMock() + mocked_ds = DSMock(api_key="MOCK") + + return mocked_ds + + +class TestPineconeRetriever: + @pytest.mark.integration + def test_init(self): + document_store = PineconeDocumentStore("pinecone-test-key") + retriever = PineconeRetriever(document_store=document_store) + assert retriever.document_store == document_store + assert retriever.filters == None + assert retriever.top_k == 10 + assert retriever.scale_score == True + assert retriever.return_embedding == False + + @pytest.mark.integration + def test_run(self): + document_store = PineconeDocumentStore("pinecone-test-key") + with patch.object(document_store, "query") as mock_query: + mock_query.return_value = Document( + text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. 
Shorts continue to pile in.",
+            metadata={
+                "target": "TSLA",
+                "sentiment_score": 0.318,
+                "format": "post",
+            })
+
+            results = self.retriever.run(["How many cars is TSLA recalling?"])
+
+            assert len(results["documents"]) == 1
+            assert results["documents"][0][0].text == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in."
+
+    @pytest.mark.integration
+    def test_to_dict(self):
+        document_store = PineconeDocumentStore("pinecone-test-key")
+        retriever = PineconeRetriever(document_store=document_store)
+        doc_dict = retriever.to_dict()
+        assert doc_dict == {
+            "init_parameters": {
+                "document_store": "test_document_store",
+                "filters": None,
+                "top_k": 10,
+                "scale_score": True,
+                "return_embedding": False,
+            }
+        }
+
+    @pytest.mark.integration
+    def test_from_dict(self):
+        """
+        Test deserialization of this component from a dictionary, using default initialization parameters.
+        """
+        retriever_component_dict = {
+            "type": "PineconeRetriever",
+            "init_parameters": {
+                "document_store": "test_document_store",
+                "filters": None,
+                "top_k": 10,
+                "scale_score": True,
+                "return_embedding": False,
+            }
+        }
+        retriever = PineconeRetriever.from_dict(retriever_component_dict)
+
+        assert retriever.document_store == "test_document_store"
+        assert retriever.filters is None
+        assert retriever.top_k == 10
+        assert retriever.scale_score is True
+        assert retriever.return_embedding is False
+

From 76ef43b046a6341e590401943533bc744dbb3bad Mon Sep 17 00:00:00 2001
From: anakin87
Date: Wed, 15 Nov 2023 16:57:46 +0100
Subject: [PATCH 02/38] adapt to Document refactoring

---
 .../src/pinecone_haystack/document_store.py   |  4 +-
 .../src/pinecone_haystack/filter_utils.py     |  7 +-
 .../tests/test_pinecone_document_store.py     | 72 +++++++++----------
 .../pinecone/tests/test_retriever.py          | 36 ++++++----
 4 files changed, 60 insertions(+), 59 deletions(-)

diff --git a/document_stores/pinecone/src/pinecone_haystack/document_store.py b/document_stores/pinecone/src/pinecone_haystack/document_store.py
index 16266e506..8b50598f8 100644
--- a/document_stores/pinecone/src/pinecone_haystack/document_store.py
+++ b/document_stores/pinecone/src/pinecone_haystack/document_store.py
@@ -641,8 +641,8 @@ def write_documents(
                 self._meta_for_pinecone(
                     {
                         TYPE_METADATA_FIELD: type_metadata,  # add `doc_type` in metadata
-                        "text": doc.text,
-                        "content_type": doc.metadata,
+                        "text": doc.content,
+                        "content_type": doc.meta,
                     }
                 )
                 for doc in documents[i : i + self.batch_size]
diff --git a/document_stores/pinecone/src/pinecone_haystack/filter_utils.py b/document_stores/pinecone/src/pinecone_haystack/filter_utils.py
index bcba26784..52a02233a 100644
--- a/document_stores/pinecone/src/pinecone_haystack/filter_utils.py
+++ b/document_stores/pinecone/src/pinecone_haystack/filter_utils.py
@@ -2,13 +2,12 @@
 from typing import Union, List, Dict
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from haystack.errors import FilterError
+from haystack.preview.errors import FilterError
 from pinecone_haystack.errors import PineconeDocumentStoreFilterError
 
 logger = logging.getLogger(__file__)
 
-
 def nested_defaultdict() -> defaultdict:
     """
     Data structure that recursively adds a dictionary as value if a key does not exist. 
Advantage: In nested dictionary @@ -132,7 +131,6 @@ def convert_to_pinecone(self): pass - class ComparisonOperation(ABC): def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]): self.field_name = field_name @@ -189,8 +187,6 @@ def invert(self) -> "ComparisonOperation": pass - - class NotOperation(LogicalFilterClause): """ Handles conversion of logical 'NOT' operations. @@ -401,7 +397,6 @@ def evaluate(self, fields) -> bool: return fields[self.field_name] < self.comparison_value - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: if not isinstance(self.comparison_value, (float, int)): raise PineconeDocumentStoreFilterError("Comparison value for '$lt' operation must be a float or int.") diff --git a/document_stores/pinecone/tests/test_pinecone_document_store.py b/document_stores/pinecone/tests/test_pinecone_document_store.py index 4f279cd18..089472a38 100644 --- a/document_stores/pinecone/tests/test_pinecone_document_store.py +++ b/document_stores/pinecone/tests/test_pinecone_document_store.py @@ -15,13 +15,14 @@ from tests import pinecone_mock import pinecone + class TestPineconeDocumentStore: @pytest.fixture def ds(self, monkeypatch, request) -> PineconeDocumentStore: """ This fixture provides an empty document store and takes care of cleaning up after each test """ - + for fname, function in getmembers(pinecone_mock, isfunction): monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) for cname, class_ in getmembers(pinecone_mock, isclass): @@ -41,41 +42,42 @@ def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStor """ This fixture provides a pre-populated document store and takes care of cleaning up after each test """ - documents = [Document( - text="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", - metadata={ + documents = [ + Document( + content="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", + meta={ "target": "Lloyds", "sentiment_score": -0.532, "format": "headline", }, ), Document( - text="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", - metadata={ + content="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", + meta={ "target": "Glencore", "sentiment_score": 0.037, "format": "headline", }, ), Document( - text="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", - metadata={ + content="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. 
Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", + meta={ "target": "Shell", "sentiment_score": -0.345, "format": "headline", }, ), Document( - text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - metadata={ + content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + meta={ "target": "TSLA", "sentiment_score": 0.318, "format": "post", }, ), Document( - text="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", - metadata={ + content="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", + meta={ "target": "HSBC", "sentiment_score": 0.154, "format": "post", @@ -83,13 +85,13 @@ def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStor ), # Without meta Document( - text="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." + content="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." ), Document( - text="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." + content="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." ), Document( - text="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." + content="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." ), ] ds.write_documents(documents) @@ -110,40 +112,40 @@ def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]: return [ # Document object Document( - text="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", - metadata={ + content="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. 
Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", + meta={ "target": "Lloyds", "sentiment_score": -0.532, "format": "headline", }, ), Document( - text="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", - metadata={ + content="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", + meta={ "target": "Glencore", "sentiment_score": 0.037, "format": "headline", }, ), Document( - text="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", - metadata={ + content="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", + meta={ "target": "Shell", "sentiment_score": -0.345, "format": "headline", }, ), Document( - text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - metadata={ + content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + meta={ "target": "TSLA", "sentiment_score": 0.318, "format": "post", }, ), Document( - text="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", - metadata={ + content="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", + meta={ "target": "HSBC", "sentiment_score": 0.154, "format": "post", @@ -151,26 +153,25 @@ def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]: ), # Without meta Document( - text="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." + content="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." ), Document( - text="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." + content="Anheuser-Busch InBev Increases Offer for Rival SABMiller. 
Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." ), Document( - text="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." + content="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." ), ] - + @pytest.mark.integration def test_ne_filters(self, ds, documents): ds.write_documents(documents) result = ds.get_filter_documents(filters={"format": {"$ne": "headline"}}) - assert len(result) == 2 + assert len(result) == 2 @pytest.mark.integration def test_filter_documents_with_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore): - eq_docs = doc_store_with_docs.filter_documents(filters={"type": {"$eq": "article"}}) normal_docs = doc_store_with_docs.filter_documents(filters={"type": "article"}) assert eq_docs == normal_docs @@ -183,17 +184,17 @@ def test_filter_documents_ids_extended_filter_ne(self, doc_store_with_docs: Pine @pytest.mark.integration def test_filter_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore): retrieved_docs = doc_store_with_docs.filter_documents(filters={"format": {"$nin": ["target", "post"]}}) - assert {"target", "post"}.isdisjoint({d.metadata.get("metadata", None) for d in retrieved_docs}) + assert {"target", "post"}.isdisjoint({d.meta.get("metadata", None) for d in retrieved_docs}) @pytest.mark.integration def test_filter_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore): retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gt": 3.0}}) - assert all(d.metadata["sentiment_score"] > 3.0 for d in retrieved_docs) + assert all(d.meta["sentiment_score"] > 3.0 for d in retrieved_docs) @pytest.mark.integration def test_filter_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore): retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gte": 3.0}}) - assert all(d.metadata["sentiment_score"] >= 3.0 for d in retrieved_docs) + assert all(d.meta["sentiment_score"] >= 3.0 for d in retrieved_docs) @pytest.mark.integration def test_filter_documents_extended_filter_compound_and_other_field_simplified( @@ -326,9 +327,8 @@ def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore): embeddings. 
""" doc = Document( - text="Doc with embedding", + content="Doc with embedding", embedding=np.random.rand(768).astype(np.float32), ) doc_store_with_docs.write_documents([doc]) assert doc_store_with_docs.get_embedding_count() == 1 - diff --git a/document_stores/pinecone/tests/test_retriever.py b/document_stores/pinecone/tests/test_retriever.py index 0f099ec1e..be42a2213 100644 --- a/document_stores/pinecone/tests/test_retriever.py +++ b/document_stores/pinecone/tests/test_retriever.py @@ -19,13 +19,14 @@ from haystack.preview.dataclasses import Document + class TestPineconeDocumentStore: @pytest.fixture def ds(self, monkeypatch, request) -> PineconeDocumentStore: """ This fixture provides an empty document store and takes care of cleaning up after each test """ - + for fname, function in getmembers(pinecone_mock, isfunction): monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) for cname, class_ in getmembers(pinecone_mock, isclass): @@ -45,13 +46,15 @@ def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStor """ This fixture provides a pre-populated document store and takes care of cleaning up after each test """ - documents = [Document( - text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - metadata={ + documents = [ + Document( + content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + meta={ "target": "Lloyds", "sentiment_score": -0.532, "format": "headline", - }), + }, + ), ] ds.write_documents(documents) return ds @@ -78,24 +81,28 @@ def test_init(self): assert retriever.top_k == 10 assert retriever.scale_score == True assert retriever.return_embedding == False - + @pytest.mark.integration def test_run(self): document_store = PineconeDocumentStore("pinecone-test-key") with patch.object(document_store, "query") as mock_query: mock_query.return_value = Document( - text="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - metadata={ + content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", + meta={ "target": "TSLA", "sentiment_score": 0.318, "format": "post", - }) + }, + ) results = self.retriever.run(["How many cars is TSLA recalling?"]) - + assert len(results["documents"]) == 1 - assert results["documents"][0][0].text == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in." - + assert ( + results["documents"][0][0].content + == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in." 
+ ) + @pytest.mark.integration def test_to_dict(self): document_store = PineconeDocumentStore("pinecone-test-key") @@ -110,7 +117,7 @@ def test_to_dict(self): "return_embedding": False, } } - + @pytest.mark.integration def test_from_dict(self): """ @@ -124,7 +131,7 @@ def test_from_dict(self): "top_k": 10, "scale_score": True, "return_embedding": False, - } + }, } retriever = PineconeRetriever.from_dict(retriever_component_dict) @@ -133,4 +140,3 @@ def test_from_dict(self): assert retriever.top_k == 10 assert retriever.scale_score is True assert retriever.return_embedding is False - From b449167528629d3fe27282c27eaf4521bedb301a Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 15 Nov 2023 17:17:55 +0100 Subject: [PATCH 03/38] start improving existing tests --- .../tests/test_pinecone_document_store.py | 2 +- .../pinecone/tests/test_retriever.py | 83 ++++--------------- 2 files changed, 17 insertions(+), 68 deletions(-) diff --git a/document_stores/pinecone/tests/test_pinecone_document_store.py b/document_stores/pinecone/tests/test_pinecone_document_store.py index 089472a38..6ab79ecab 100644 --- a/document_stores/pinecone/tests/test_pinecone_document_store.py +++ b/document_stores/pinecone/tests/test_pinecone_document_store.py @@ -16,7 +16,7 @@ import pinecone -class TestPineconeDocumentStore: +class TestPineconeDocumentStore(DocumentStoreBaseTests): @pytest.fixture def ds(self, monkeypatch, request) -> PineconeDocumentStore: """ diff --git a/document_stores/pinecone/tests/test_retriever.py b/document_stores/pinecone/tests/test_retriever.py index be42a2213..2f0756aea 100644 --- a/document_stores/pinecone/tests/test_retriever.py +++ b/document_stores/pinecone/tests/test_retriever.py @@ -2,7 +2,7 @@ from inspect import getmembers, isclass, isfunction import pinecone from typing import Any, Dict, List, Union -from unittest.mock import MagicMock +from unittest.mock import MagicMock, Mock from unittest.mock import patch import numpy as np import pytest @@ -20,88 +20,37 @@ from haystack.preview.dataclasses import Document -class TestPineconeDocumentStore: - @pytest.fixture - def ds(self, monkeypatch, request) -> PineconeDocumentStore: - """ - This fixture provides an empty document store and takes care of cleaning up after each test - """ - - for fname, function in getmembers(pinecone_mock, isfunction): - monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) - for cname, class_ in getmembers(pinecone_mock, isclass): - monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False) - - return PineconeDocumentStore( - api_key=os.environ.get("PINECONE_API_KEY") or "pinecone-test-key", - embedding_dim=768, - embedding_field="embedding", - index="haystack_tests", - similarity="cosine", - recreate_index=True, - ) - - @pytest.fixture - def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStore: - """ - This fixture provides a pre-populated document store and takes care of cleaning up after each test - """ - documents = [ - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. 
Shorts continue to pile in.", - meta={ - "target": "Lloyds", - "sentiment_score": -0.532, - "format": "headline", - }, - ), - ] - ds.write_documents(documents) - return ds - - @pytest.fixture - def mocked_ds(self): - class DSMock(PineconeDocumentStore): - pass - - pinecone.init = MagicMock() - DSMock._create_index = MagicMock() - mocked_ds = DSMock(api_key="MOCK") - - return mocked_ds - - class TestPineconeRetriever: - @pytest.mark.integration + @pytest.mark.unit def test_init(self): - document_store = PineconeDocumentStore("pinecone-test-key") - retriever = PineconeRetriever(document_store=document_store) - assert retriever.document_store == document_store + mock_store = Mock(spec=PineconeDocumentStore) + retriever = PineconeRetriever(document_store=mock_store) + assert retriever.document_store == mock_store assert retriever.filters == None assert retriever.top_k == 10 assert retriever.scale_score == True assert retriever.return_embedding == False - @pytest.mark.integration + @pytest.mark.unit def test_run(self): - document_store = PineconeDocumentStore("pinecone-test-key") - with patch.object(document_store, "query") as mock_query: - mock_query.return_value = Document( + mock_store = Mock(spec=PineconeDocumentStore) + mock_store.query_by_embedding.return_value = [Document( content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", meta={ "target": "TSLA", "sentiment_score": 0.318, "format": "post", }, - ) + )] - results = self.retriever.run(["How many cars is TSLA recalling?"]) + retriever = PineconeRetriever(document_store=mock_store) + results = retriever.run(["How many cars is TSLA recalling?"]) - assert len(results["documents"]) == 1 - assert ( - results["documents"][0][0].content - == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in." - ) + assert len(results["documents"]) == 1 + assert ( + results["documents"][0].content + == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in." 
+        )

From 7b26b2e42d56f153f708a9fc3bdc897aab812e84 Mon Sep 17 00:00:00 2001
From: anakin87
Date: Wed, 15 Nov 2023 17:34:28 +0100
Subject: [PATCH 04/38] try to setup a testing workflow

---
 .../workflows/document_stores_pinecone.yml    | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 .github/workflows/document_stores_pinecone.yml

diff --git a/.github/workflows/document_stores_pinecone.yml b/.github/workflows/document_stores_pinecone.yml
new file mode 100644
index 000000000..14202474f
--- /dev/null
+++ b/.github/workflows/document_stores_pinecone.yml
@@ -0,0 +1,49 @@
+# This workflow comes from https://github.com/ofek/hatch-mypyc
+# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
+name: Test / Document Stores / pinecone
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - "document_stores/pinecone/**"
+      - ".github/workflows/document_stores_pinecone.yml"
+
+concurrency:
+  group: document_stores_pinecone-${{ github.head_ref }}
+  cancel-in-progress: true
+
+env:
+  PYTHONUNBUFFERED: "1"
+  FORCE_COLOR: "1"
+
+jobs:
+  run:
+    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Hatch
+        run: pip install --upgrade hatch
+
+      - name: Lint
+        working-directory: document_stores/pinecone
+        if: matrix.python-version == '3.9'
+        run: hatch run lint:all
+
+      - name: Run tests
+        working-directory: document_stores/pinecone
+        run: hatch run cov

From 7d0cdd1d0b97cead52f3d4d198b6a553ebefdf7e Mon Sep 17 00:00:00 2001
From: anakin87
Date: Thu, 16 Nov 2023 09:41:56 +0100
Subject: [PATCH 05/38] fix some format errors

---
 .../src/pinecone_haystack/document_store.py   |  4 +--
 .../src/pinecone_haystack/filter_utils.py     |  6 +++--
 .../src/pinecone_haystack/retriever.py        |  9 ++++---
 .../pinecone/tests/pinecone_mock.py           |  6 ++---
 .../tests/test_pinecone_document_store.py     |  7 ++---
 .../pinecone/tests/test_retriever.py          | 26 ++++++++++---------
 6 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/document_stores/pinecone/src/pinecone_haystack/document_store.py b/document_stores/pinecone/src/pinecone_haystack/document_store.py
index 8b50598f8..e25cfa9aa 100644
--- a/document_stores/pinecone/src/pinecone_haystack/document_store.py
+++ b/document_stores/pinecone/src/pinecone_haystack/document_store.py
@@ -11,9 +11,7 @@
 from typing import Any, Dict, Generator, List, Literal, Optional, Set, Union
 
 import numpy as np
-from tqdm import tqdm
 import pinecone
-
 from haystack.preview.dataclasses import Document
 from haystack.preview.document_stores.decorator import document_store
 from haystack.preview.document_stores.errors import (
@@ -21,6 +19,7 @@
     MissingDocumentError,
 )
 from haystack.preview.document_stores.protocols import DuplicatePolicy
+from tqdm import tqdm
 
 from pinecone_haystack.errors import (
     PineconeDocumentStoreError,
@@ -28,7 +27,6 @@
 )
 from pinecone_haystack.filter_utils import LogicalFilterClause
 
-
 logger = logging.getLogger(__name__)
 
 
diff --git a/document_stores/pinecone/src/pinecone_haystack/filter_utils.py 
b/document_stores/pinecone/src/pinecone_haystack/filter_utils.py index 52a02233a..cbf256210 100644 --- a/document_stores/pinecone/src/pinecone_haystack/filter_utils.py +++ b/document_stores/pinecone/src/pinecone_haystack/filter_utils.py @@ -1,8 +1,10 @@ import logging -from typing import Union, List, Dict from abc import ABC, abstractmethod from collections import defaultdict +from typing import Dict, List, Union + from haystack.preview.errors import FilterError + from pinecone_haystack.errors import PineconeDocumentStoreFilterError logger = logging.getLogger(__file__) @@ -168,7 +170,7 @@ def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> elif isinstance(comparison_clause, list): comparison_operations.append(InOperation(field_name, comparison_clause)) else: - comparison_operations.append((EqOperation(field_name, comparison_clause))) + comparison_operations.append(EqOperation(field_name, comparison_clause)) return comparison_operations diff --git a/document_stores/pinecone/src/pinecone_haystack/retriever.py b/document_stores/pinecone/src/pinecone_haystack/retriever.py index 5f811980f..a4d89b7c9 100644 --- a/document_stores/pinecone/src/pinecone_haystack/retriever.py +++ b/document_stores/pinecone/src/pinecone_haystack/retriever.py @@ -1,16 +1,17 @@ # SPDX-FileCopyrightText: 2023-present John Doe # # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Any, Optional +from typing import Any, Dict, List, Optional from haystack.preview import ( - component, + DeserializationError, Document, - default_to_dict, + component, default_from_dict, - DeserializationError, + default_to_dict, ) from haystack.preview.dataclasses import Document + from pinecone_haystack.document_store import PineconeDocumentStore diff --git a/document_stores/pinecone/tests/pinecone_mock.py b/document_stores/pinecone/tests/pinecone_mock.py index 882531acf..215b0e428 100644 --- a/document_stores/pinecone/tests/pinecone_mock.py +++ b/document_stores/pinecone/tests/pinecone_mock.py @@ -1,7 +1,5 @@ -from typing import Optional, List, Union, Dict, Any - import logging - +from typing import Any, Dict, List, Optional, Union logger = logging.getLogger(__name__) @@ -146,7 +144,7 @@ def fetch(self, ids: List[str], namespace: str = ""): } return response - def _filter( # noqa: C901,PLR0912 + def _filter( self, metadata: dict, filters: Dict[str, Any] = None, diff --git a/document_stores/pinecone/tests/test_pinecone_document_store.py b/document_stores/pinecone/tests/test_pinecone_document_store.py index 6ab79ecab..8d0f1b097 100644 --- a/document_stores/pinecone/tests/test_pinecone_document_store.py +++ b/document_stores/pinecone/tests/test_pinecone_document_store.py @@ -2,18 +2,19 @@ from inspect import getmembers, isclass, isfunction from typing import Any, Dict, List, Union from unittest.mock import MagicMock + import numpy as np +import pinecone import pytest +from haystack.preview.dataclasses import Document +from haystack.preview.testing.document_store import DocumentStoreBaseTests from pinecone_haystack.document_store import PineconeDocumentStore from pinecone_haystack.errors import ( PineconeDocumentStoreError, PineconeDocumentStoreFilterError, ) -from haystack.preview.dataclasses import Document -from haystack.preview.testing.document_store import DocumentStoreBaseTests from tests import pinecone_mock -import pinecone class TestPineconeDocumentStore(DocumentStoreBaseTests): diff --git a/document_stores/pinecone/tests/test_retriever.py b/document_stores/pinecone/tests/test_retriever.py 
index 2f0756aea..4f21d357a 100644
--- a/document_stores/pinecone/tests/test_retriever.py
+++ b/document_stores/pinecone/tests/test_retriever.py
@@ -1,24 +1,24 @@
 import os
 from inspect import getmembers, isclass, isfunction
-import pinecone
 from typing import Any, Dict, List, Union
-from unittest.mock import MagicMock, Mock
-from unittest.mock import patch
+from unittest.mock import MagicMock, Mock, patch
+
 import numpy as np
+import pinecone
 import pytest
-from tests import pinecone_mock
-from pinecone_haystack.document_store import PineconeDocumentStore
-from pinecone_haystack.retriever import PineconeRetriever
 from haystack.preview import (
-    component,
+    DeserializationError,
     Document,
-    default_to_dict,
+    component,
     default_from_dict,
-    DeserializationError,
+    default_to_dict,
 )
-
 from haystack.preview.dataclasses import Document
+
+from pinecone_haystack.document_store import PineconeDocumentStore
+from pinecone_haystack.retriever import PineconeRetriever
+from tests import pinecone_mock
+

 class TestPineconeRetriever:
     @pytest.mark.unit
@@ -34,14 +34,16 @@ def test_init(self):
     @pytest.mark.unit
     def test_run(self):
         mock_store = Mock(spec=PineconeDocumentStore)
-        mock_store.query_by_embedding.return_value = [Document(
+        mock_store.query_by_embedding.return_value = [
+            Document(
                 content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.",
                 meta={
                     "target": "TSLA",
                     "sentiment_score": 0.318,
                     "format": "post",
                 },
-        )]
+            )
+        ]

         retriever = PineconeRetriever(document_store=mock_store)
         results = retriever.run(["How many cars is TSLA recalling?"])

From 31c1e43a87eccb2414ddc54a6519376b97f44202 Mon Sep 17 00:00:00 2001
From: anakin87 
Date: Wed, 6 Dec 2023 13:00:09 +0100
Subject: [PATCH 06/38] adapt to new structure

---
 .../{document_stores_pinecone.yml => pinecone.yml} | 10 +++++-----
 {document_stores => integrations}/pinecone/README.md | 0
 .../pinecone/pyproject.toml | 0
 .../pinecone/src/pinecone_haystack/__about__.py | 0
 .../pinecone/src/pinecone_haystack/__init__.py | 0
 .../pinecone/src/pinecone_haystack/document_store.py | 0
 .../pinecone/src/pinecone_haystack/errors.py | 0
 .../pinecone/src/pinecone_haystack/filter_utils.py | 0
 .../pinecone/src/pinecone_haystack/retriever.py | 0
 .../pinecone/tests/__init__.py | 0
 .../pinecone/tests/pinecone_mock.py | 0
 .../pinecone/tests/test_pinecone_document_store.py | 0
 .../pinecone/tests/test_retriever.py | 0
 13 files changed, 5 insertions(+), 5 deletions(-)
 rename .github/workflows/{document_stores_pinecone.yml => pinecone.yml} (82%)
 rename {document_stores => integrations}/pinecone/README.md (100%)
 rename {document_stores => integrations}/pinecone/pyproject.toml (100%)
 rename {document_stores => integrations}/pinecone/src/pinecone_haystack/__about__.py (100%)
 rename {document_stores => integrations}/pinecone/src/pinecone_haystack/__init__.py (100%)
 rename {document_stores => integrations}/pinecone/src/pinecone_haystack/document_store.py (100%)
 rename {document_stores => integrations}/pinecone/src/pinecone_haystack/errors.py (100%)
 rename {document_stores => integrations}/pinecone/src/pinecone_haystack/filter_utils.py (100%)
 rename {document_stores => integrations}/pinecone/src/pinecone_haystack/retriever.py (100%)
 rename {document_stores => integrations}/pinecone/tests/__init__.py (100%)
 rename {document_stores => integrations}/pinecone/tests/pinecone_mock.py (100%)
 rename {document_stores =>
integrations}/pinecone/tests/test_pinecone_document_store.py (100%) rename {document_stores => integrations}/pinecone/tests/test_retriever.py (100%) diff --git a/.github/workflows/document_stores_pinecone.yml b/.github/workflows/pinecone.yml similarity index 82% rename from .github/workflows/document_stores_pinecone.yml rename to .github/workflows/pinecone.yml index 14202474f..78eb27937 100644 --- a/.github/workflows/document_stores_pinecone.yml +++ b/.github/workflows/pinecone.yml @@ -1,17 +1,17 @@ # This workflow comes from https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / Document Stores / pinecone +name: Test / pinecone on: schedule: - cron: "0 0 * * *" pull_request: paths: - - "document_stores/pinecone/**" + - "integrations/pinecone/**" - ".github/workflows/pinecone.yml" concurrency: - group: document_stores_pinecone-${{ github.head_ref }} + group: pinecone-${{ github.head_ref }} cancel-in-progress: true env: @@ -40,10 +40,10 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: document_stores/pinecone + working-directory: pinecone if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests - working-directory: document_stores/pinecone + working-directory: pinecone run: hatch run cov diff --git a/document_stores/pinecone/README.md b/integrations/pinecone/README.md similarity index 100% rename from document_stores/pinecone/README.md rename to integrations/pinecone/README.md diff --git a/document_stores/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml similarity index 100% rename from document_stores/pinecone/pyproject.toml rename to integrations/pinecone/pyproject.toml diff --git a/document_stores/pinecone/src/pinecone_haystack/__about__.py b/integrations/pinecone/src/pinecone_haystack/__about__.py similarity index 100% rename from document_stores/pinecone/src/pinecone_haystack/__about__.py rename to integrations/pinecone/src/pinecone_haystack/__about__.py diff --git a/document_stores/pinecone/src/pinecone_haystack/__init__.py b/integrations/pinecone/src/pinecone_haystack/__init__.py similarity index 100% rename from document_stores/pinecone/src/pinecone_haystack/__init__.py rename to integrations/pinecone/src/pinecone_haystack/__init__.py diff --git a/document_stores/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py similarity index 100% rename from document_stores/pinecone/src/pinecone_haystack/document_store.py rename to integrations/pinecone/src/pinecone_haystack/document_store.py diff --git a/document_stores/pinecone/src/pinecone_haystack/errors.py b/integrations/pinecone/src/pinecone_haystack/errors.py similarity index 100% rename from document_stores/pinecone/src/pinecone_haystack/errors.py rename to integrations/pinecone/src/pinecone_haystack/errors.py diff --git a/document_stores/pinecone/src/pinecone_haystack/filter_utils.py b/integrations/pinecone/src/pinecone_haystack/filter_utils.py similarity index 100% rename from document_stores/pinecone/src/pinecone_haystack/filter_utils.py rename to integrations/pinecone/src/pinecone_haystack/filter_utils.py diff --git a/document_stores/pinecone/src/pinecone_haystack/retriever.py b/integrations/pinecone/src/pinecone_haystack/retriever.py similarity index 100% rename from document_stores/pinecone/src/pinecone_haystack/retriever.py rename to integrations/pinecone/src/pinecone_haystack/retriever.py diff --git 
a/document_stores/pinecone/tests/__init__.py b/integrations/pinecone/tests/__init__.py
similarity index 100%
rename from document_stores/pinecone/tests/__init__.py
rename to integrations/pinecone/tests/__init__.py
diff --git a/document_stores/pinecone/tests/pinecone_mock.py b/integrations/pinecone/tests/pinecone_mock.py
similarity index 100%
rename from document_stores/pinecone/tests/pinecone_mock.py
rename to integrations/pinecone/tests/pinecone_mock.py
diff --git a/document_stores/pinecone/tests/test_pinecone_document_store.py b/integrations/pinecone/tests/test_pinecone_document_store.py
similarity index 100%
rename from document_stores/pinecone/tests/test_pinecone_document_store.py
rename to integrations/pinecone/tests/test_pinecone_document_store.py
diff --git a/document_stores/pinecone/tests/test_retriever.py b/integrations/pinecone/tests/test_retriever.py
similarity index 100%
rename from document_stores/pinecone/tests/test_retriever.py
rename to integrations/pinecone/tests/test_retriever.py

From 9c39509a218bf79bb3fccf9ab9b1422c263ca7af Mon Sep 17 00:00:00 2001
From: anakin87 
Date: Mon, 18 Dec 2023 12:07:01 +0100
Subject: [PATCH 07/38] adapt pyproject; rm about

---
 integrations/pinecone/pyproject.toml | 21 +++++++++++--------
 .../src/pinecone_haystack/__about__.py | 4 ----
 2 files changed, 12 insertions(+), 13 deletions(-)
 delete mode 100644 integrations/pinecone/src/pinecone_haystack/__about__.py

diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
index b8f3f40c0..e40e9f16c 100644
--- a/integrations/pinecone/pyproject.toml
+++ b/integrations/pinecone/pyproject.toml
@@ -7,7 +7,7 @@ name = "pinecone_haystack"
 dynamic = ["version"]
 description = ''
 readme = "README.md"
-requires-python = ">=3.7"
+requires-python = ">=3.8"
 license = "Apache-2.0"
 keywords = []
 authors = [
@@ -16,7 +16,6 @@ authors = [
 classifiers = [
   "Development Status :: 4 - Beta",
   "Programming Language :: Python",
-  "Programming Language :: Python :: 3.7",
   "Programming Language :: Python :: 3.8",
   "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
@@ -25,23 +24,28 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  # we distribute the preview version of Haystack 2.0 under the package "haystack-ai"
   "haystack-ai",
   "pinecone-client",
]

 [project.urls]
-Documentation = "https://github.com/unknown/example-store#readme"
-Issues = "https://github.com/unknown/example-store/issues"
-Source = "https://github.com/unknown/example-store"
+Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone#readme"
+Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues"
+Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/pinecone"

 [tool.hatch.version]
-path = "src/pinecone_haystack/__about__.py"
+source = "vcs"
+tag-pattern = 'integrations\/pinecone-v(?P<version>.*)'
+
+[tool.hatch.version.raw-options]
+root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/pinecone-v[0-9]*"' [tool.hatch.envs.default] dependencies = [ "coverage[toml]>=6.5", "pytest", + "pytest-xdist", ] [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" @@ -56,7 +60,7 @@ cov = [ ] [[tool.hatch.envs.all.matrix]] -python = ["3.7", "3.8", "3.9", "3.10", "3.11"] +python = ["3.8", "3.9", "3.10", "3.11"] [tool.hatch.envs.lint] detached = true @@ -150,7 +154,6 @@ source_pkgs = ["pinecone_haystack", "tests"] branch = true parallel = true omit = [ - "src/pinecone_haystack/__about__.py", "example" ] diff --git a/integrations/pinecone/src/pinecone_haystack/__about__.py b/integrations/pinecone/src/pinecone_haystack/__about__.py deleted file mode 100644 index 6294ccfe2..000000000 --- a/integrations/pinecone/src/pinecone_haystack/__about__.py +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present John Doe -# -# SPDX-License-Identifier: Apache-2.0 -__version__ = "0.0.1" From fe2168ba7ae2cf19259cd8cbd9f5b277102f000f Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 18 Dec 2023 12:17:36 +0100 Subject: [PATCH 08/38] fix workflow --- .github/workflows/pinecone.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pinecone.yml b/.github/workflows/pinecone.yml index 78eb27937..8f91c4a71 100644 --- a/.github/workflows/pinecone.yml +++ b/.github/workflows/pinecone.yml @@ -32,7 +32,7 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} @@ -40,10 +40,10 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: pinecone + working-directory: integrations/pinecone if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests - working-directory: pinecone + working-directory: integrations/pinecone run: hatch run cov From 2d9d215d12a047554be75d59eba00ce1d9564253 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Mon, 18 Dec 2023 12:21:38 +0100 Subject: [PATCH 09/38] add hatch-vcs --- integrations/pinecone/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index e40e9f16c..1f8b1f328 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" [project] From 7ee262cf3a6733fb331b65b7465344de0c960180 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 19 Dec 2023 16:42:24 +0100 Subject: [PATCH 10/38] simplification - first draft --- .../src/pinecone_haystack/document_store.py | 1098 +++-------------- .../pinecone/src/pinecone_haystack/errors.py | 4 +- .../src/pinecone_haystack/filter_utils.py | 432 ------- .../src/pinecone_haystack/retriever.py | 128 -- 4 files changed, 175 insertions(+), 1487 deletions(-) delete mode 100644 integrations/pinecone/src/pinecone_haystack/filter_utils.py delete mode 100644 integrations/pinecone/src/pinecone_haystack/retriever.py diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index e25cfa9aa..ae812e7ec 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -1,988 +1,236 @@ -# SPDX-FileCopyrightText: 2023-present John Doe +# 
SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 - -import copy -import json +import io import logging -import operator -from functools import reduce -from itertools import islice -from typing import Any, Dict, Generator, List, Literal, Optional, Set, Union +import os +from typing import Any, Dict, List, Optional -import numpy as np +import pandas as pd import pinecone -from haystack.preview.dataclasses import Document -from haystack.preview.document_stores.decorator import document_store -from haystack.preview.document_stores.errors import ( - DuplicateDocumentError, - MissingDocumentError, -) -from haystack.preview.document_stores.protocols import DuplicatePolicy -from tqdm import tqdm - -from pinecone_haystack.errors import ( - PineconeDocumentStoreError, - PineconeDocumentStoreFilterError, -) -from pinecone_haystack.filter_utils import LogicalFilterClause +from haystack import default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores import DuplicatePolicy logger = logging.getLogger(__name__) - -TYPE_METADATA_FIELD = "doc_type" -DOCUMENT_WITH_EMBEDDING = "vector" -DOCUMENT_WITHOUT_EMBEDDING = "no-vector" -LABEL = "label" - -AND_OPERATOR = "$and" -IN_OPERATOR = "$in" -EQ_OPERATOR = "$eq" - -DocTypeMetadata = Literal["vector", "no-vector", "label"] - - -def _sanitize_index(index: Optional[str]) -> Optional[str]: - if index: - return index.replace("_", "-").lower() - return None - - -def _get_by_path(root, items): - """Access a nested object in root by item sequence.""" - return reduce(operator.getitem, items, root) +# Pinecone has a limit of 1000 documents that can be returned in a query +# with include_metadata=True or include_data=True +# https://docs.pinecone.io/docs/limits +TOP_K_LIMIT = 1_000 -def _set_by_path(root, items, value): - """Set a value in a nested object in root by item sequence.""" - _get_by_path(root, items[:-1])[items[-1]] = value - - -@document_store class PineconeDocumentStore: - """ - It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) - to perform similarity search on vectors. In order to use this document store, you need an API key that you can - obtain by creating an account on the [Pinecone website](https://www.pinecone.io). - - This is a hosted document store, - this means that your vectors will not be stored locally but in the cloud. This means that the similarity - search will be run on the cloud as well. - """ - - top_k_limit = 10_000 - top_k_limit_vectors = 1_000 - def __init__( self, - api_key: str, + *, + api_key: Optional[str] = None, environment: str = "us-west1-gcp", - pinecone_index: Optional["pinecone.Index"] = None, - embedding_dim: int = 768, + index: str = "default", + namespace: str = "default", batch_size: int = 100, - return_embedding: bool = False, - index: str = "document", - similarity: str = "cosine", - replicas: int = 1, - shards: int = 1, - namespace: Optional[str] = None, - embedding_field: str = "embedding", - progress_bar: bool = True, - duplicate_documents: str = "overwrite", - recreate_index: bool = False, - metadata_config: Optional[Dict] = None, - validate_index_sync: bool = True, + dimension: int = 768, + **index_creation_kwargs, ): """ - :param api_key: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). - :param environment: Pinecone cloud environment uses `"us-west1-gcp"` by default. 
Other GCP and AWS - regions are supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required. - :param pinecone_index: pinecone-client Index object, an index will be initialized or loaded if not specified. - :param embedding_dim: The embedding vector size. - :param batch_size: The batch size to be used when writing documents to the document store. - :param return_embedding: Whether to return document embeddings. - :param index: Name of index in document store to use. - :param similarity: The similarity function used to compare document vectors. `"cosine"` is the default - and is recommended if you are using a Sentence-Transformer model. `"dot_product"` is more performant - with DPR embeddings. - In both cases, the returned values in Document.score are normalized to be in range [0,1]: - - For `"dot_product"`: `expit(np.asarray(raw_score / 100))` - - For `"cosine"`: `(raw_score + 1) / 2` - :param replicas: The number of replicas. Replicas duplicate the index. They provide higher availability and - throughput. - :param shards: The number of shards to be used in the index. We recommend to use 1 shard per 1GB of data. - :param namespace: Optional namespace. If not specified, None is default. - :param embedding_field: Name of field containing an embedding vector. - :param progress_bar: Whether to show a tqdm progress bar or not. - Can be helpful to disable in production deployments to keep the logs clean. - :param duplicate_documents: Handle duplicate documents based on parameter options.\ - Parameter options: - - `"skip"`: Ignore the duplicate documents. - - `"overwrite"`: Update any existing documents with the same ID when adding documents. - - `"fail"`: An error is raised if the document ID of the document being added already exists. - :param recreate_index: If set to True, an existing Pinecone index will be deleted and a new one will be - created using the config you are using for initialization. Be aware that all data in the old index will be - lost if you choose to recreate the index. Be aware that both the document_index and the label_index will - be recreated. - :param metadata_config: Which metadata fields should be indexed, part of the - [selective metadata filtering](https://www.pinecone.io/docs/manage-indexes/#selective-metadata-indexing) feature. - Should be in the format `{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. By default, - no fields are indexed. - """ - - if metadata_config is None: - metadata_config = {"indexed": []} - # Connect to Pinecone server using python client binding - if not api_key: - raise PineconeDocumentStoreError( - "Pinecone requires an API key, please provide one. https://app.pinecone.io" - ) + Creates a new PineconeDocumentStore instance. + It is meant to be connected to a Pinecone index and namespace. + + :param api_key: The Pinecone API key. It can be explicitly provided or automatically read from the + environment variable PINECONE_API_KEY (recommended). + :param environment: The Pinecone environment to connect to. Defaults to "us-west1-gcp". + :param index: The Pinecone index to connect to. If the index does not exist, it will be created. + Defaults to "default". + :param namespace: The Pinecone namespace to connect to. If the namespace does not exist, it will be created + at the first write. Defaults to "default". + :param batch_size: The number of documents to write in a single batch. Defaults to 100, as recommended by + Pinecone. + :param dimension: The dimension of the embeddings. 
This parameter is only used when creating a new index. + Defaults to 768. + :param index_creation_kwargs: Additional keyword arguments to pass to the index creation method. + For example, you can specify `metric`, `pods`, `replicas`... + You can find the full list of supported arguments in the + [API reference](https://docs.pinecone.io/reference/create_index-1). + + """ + if api_key is None: + try: + api_key = os.environ["PINECONE_API_KEY"] + except KeyError as e: + msg = ( + "PineconeDocumentStore expects a Pinecone API key. " + "Set the PINECONE_API_KEY environment variable (recommended) or pass it explicitly." + ) + raise ValueError(msg) from e pinecone.init(api_key=api_key, environment=environment) - self._api_key = api_key - # Format similarity string - self._set_similarity_metric(similarity) + if index not in pinecone.list_indexes(): + pinecone.create_index(name=index, dimension=dimension, **index_creation_kwargs) - self.similarity = similarity - self.index: str = self._index(index) - self.embedding_dim = embedding_dim - self.batch_size = batch_size - self.return_embedding = return_embedding - self.embedding_field = embedding_field - self.progress_bar = progress_bar - self.duplicate_documents = duplicate_documents - - # Pinecone index params - self.replicas = replicas - self.shards = shards + self._index = pinecone.Index(index_name=index) + self.dimension = self._index.describe_index_stats()["dimension"] + self._dummy_vector = [0.0] * self.dimension + self.environment = environment + self.index = index self.namespace = namespace - - # Add necessary metadata fields to metadata_config - fields = ["label-id", "query", TYPE_METADATA_FIELD] - metadata_config["indexed"] += fields - self.metadata_config = metadata_config - - # Initialize dictionary of index connections - self.pinecone_indexes: Dict[str, pinecone.Index] = {} - self.return_embedding = return_embedding - self.embedding_field = embedding_field - - # Initialize dictionary to store temporary set of document IDs - self.all_ids: dict = {} - - # Dummy query to be used during searches - self.dummy_query = [0.0] * self.embedding_dim - - if pinecone_index: - if not isinstance(pinecone_index, pinecone.Index): - raise PineconeDocumentStoreError( - f"The parameter `pinecone_index` needs to be a " - f"`pinecone.Index` object. You provided an object of " - f"type `{type(pinecone_index)}`." - ) - self.pinecone_indexes[self.index] = pinecone_index - else: - self.pinecone_indexes[self.index] = self._create_index( - embedding_dim=self.embedding_dim, - index=self.index, - metric_type=self.metric_type, - replicas=self.replicas, - shards=self.shards, - recreate_index=recreate_index, - metadata_config=self.metadata_config, - ) - - super().__init__() - - def _index(self, index) -> str: - index = _sanitize_index(index) or self.index - return index - - def _create_index( - self, - embedding_dim: int, - index: Optional[str] = None, - metric_type: Optional[str] = "cosine", - replicas: Optional[int] = 1, - shards: Optional[int] = 1, - recreate_index: bool = False, - metadata_config: Optional[Dict] = None, - ) -> "pinecone.Index": - """ - Create a new index for storing documents in case an index with the name - doesn't exist already. 
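For context, a minimal sketch of how the new constructor is used. It assumes the `PINECONE_API_KEY` environment variable is set; `my-index` and the `metric` keyword are illustrative values, not package defaults:

```python
from pinecone_haystack.document_store import PineconeDocumentStore

# Assumes PINECONE_API_KEY is exported in the environment (the recommended path).
# `metric` is forwarded to pinecone.create_index() via **index_creation_kwargs.
document_store = PineconeDocumentStore(
    environment="us-west1-gcp",  # default environment
    index="my-index",            # created on the fly if it does not exist
    namespace="default",
    dimension=768,               # only honored when the index is first created
    metric="cosine",             # illustrative index-creation kwarg
)
```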
- """ - if metadata_config is None: - metadata_config = {"indexed": []} - - if recreate_index: - self.delete_index(index) - - # Skip if already exists - if index in self.pinecone_indexes: - index_connection = self.pinecone_indexes[index] - else: - # Search pinecone hosted indexes and create an index if it does not exist - if index not in pinecone.list_indexes(): - pinecone.create_index( - name=index, - dimension=embedding_dim, - metric=metric_type, - replicas=replicas, - shards=shards, - metadata_config=metadata_config, - ) - index_connection = pinecone.Index(index) - - # return index connection - return index_connection - - def get_index_stats(self): - stats = self.pinecone_indexes[self.index] - self.index_stats = stats - # Get index statistics - dims = stats["dimension"] - count = stats["namespaces"][""]["vector_count"] if stats["namespaces"].get("") else 0 - logger.info( - "Index statistics: name: %s embedding dimensions: %s, record count: %s", - self.index, - dims, - count, + self.batch_size = batch_size + self.index_creation_kwargs = index_creation_kwargs + + def to_dict(self) -> Dict[str, Any]: + return default_to_dict( + self, + environment=self.environment, + index=self.index, + dimension=self.dimension, + namespace=self.namespace, + batch_size=self.batch_size, + **self.index_creation_kwargs, ) - return stats, dims, count + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore": + return default_from_dict(cls, data) - def _index_connection_exists(self, index: str, create: bool = False) -> Optional["pinecone.Index"]: + def count_documents(self) -> int: """ - Check if the index connection exists. If specified, create an index if it does not exist yet. - - :param index: Index name. - :param create: Indicates if an index needs to be created or not. If set to `True`, create an index - and return connection to it, otherwise raise `PineconeDocumentStoreError` error. - :raises PineconeDocumentStoreError: Exception trigger when index connection not found. + Returns how many documents are present in the document store. """ - if index not in self.pinecone_indexes: - if create: - return self._create_index( - embedding_dim=self.embedding_dim, - index=index, - metric_type=self.metric_type, - replicas=self.replicas, - shards=self.shards, - recreate_index=False, - metadata_config=self.metadata_config, - ) - raise PineconeDocumentStoreError( - f"Index named '{index}' does not exist. Try reinitializing PineconeDocumentStore() and running " - f"'update_embeddings()' to create and populate an index." - ) - return None + try: + count = self._index.describe_index_stats()["namespaces"][self.namespace]["vector_count"] + except KeyError: + count = 0 + return count - def _set_similarity_metric(self, similarity: str): + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: """ - Set vector similarity metric. - """ - if similarity == "cosine": - self.metric_type = similarity - elif similarity == "dot_product": - self.metric_type = "dotproduct" - elif similarity in ["l2", "euclidean"]: - self.metric_type = "euclidean" - else: - raise ValueError( - "The Pinecone document store can currently only support dot_product, cosine and euclidean metrics. " - "Please set similarity to one of the above." - ) + Writes Documents to Pinecone. - def _add_local_ids(self, index: str, ids: List[str]): - """ - Add all document IDs to the set of all IDs. 
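The `to_dict`/`from_dict` pair introduced above round-trips only the store's configuration, never the stored vectors; a brief sketch, reusing the hypothetical store from the previous example:

```python
# Serialize configuration only; the documents themselves stay in Pinecone.
data = document_store.to_dict()

# Recreate an equivalent store from the serialized configuration.
same_store = PineconeDocumentStore.from_dict(data)
assert same_store.index == document_store.index
```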
- """ - if index not in self.all_ids: - self.all_ids[index] = set() - self.all_ids[index] = self.all_ids[index].union(set(ids)) + :param documents: A list of Documents to write to the document store. + :param policy: The duplicate policy to use when writing documents. + PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`. - def _add_type_metadata_filter( - self, filters: Dict[str, Any], type_value: Optional[DocTypeMetadata] - ) -> Dict[str, Any]: + :return: The number of documents written to the document store. """ - Add new filter for `doc_type` metadata field. - """ - if type_value: - new_type_filter = {TYPE_METADATA_FIELD: {EQ_OPERATOR: type_value}} - if AND_OPERATOR not in filters and TYPE_METADATA_FIELD not in filters: - # extend filters with new `doc_type` filter and add $and operator - filters.update(new_type_filter) - all_filters = filters - return {AND_OPERATOR: all_filters} - - filters_content = filters[AND_OPERATOR] if AND_OPERATOR in filters else filters - if TYPE_METADATA_FIELD in filters_content: # type: ignore - current_type_filter = filters_content[TYPE_METADATA_FIELD] # type: ignore - type_values = {type_value} - if isinstance(current_type_filter, str): - type_values.add(current_type_filter) # type: ignore - elif isinstance(current_type_filter, dict): - if EQ_OPERATOR in current_type_filter: - # current `doc_type` filter has single value - type_values.add(current_type_filter[EQ_OPERATOR]) - else: - # current `doc_type` filter has multiple values - type_values.update(set(current_type_filter[IN_OPERATOR])) - new_type_filter = {TYPE_METADATA_FIELD: {IN_OPERATOR: list(type_values)}} # type: ignore - filters_content.update(new_type_filter) # type: ignore - - return filters - - def _get_default_type_metadata(self, index: Optional[str], namespace: Optional[str] = None) -> str: - """ - Get default value for `doc_type` metadata filed. If there is at least one embedding, default value - will be `vector`, otherwise it will be `no-vector`. - """ - if self.get_embedding_count(index=index, namespace=namespace) > 0: - return DOCUMENT_WITH_EMBEDDING - return DOCUMENT_WITHOUT_EMBEDDING + if len(documents) > 0: + if not isinstance(documents[0], Document): + msg = "param 'documents' must contain a list of objects of type Document" + raise ValueError(msg) - def _get_vector_count( - self, - index: str, - filters: Optional[Dict[str, Any]], - namespace: Optional[str], - ) -> int: - res = self.pinecone_indexes[index].query( - self.dummy_query, - top_k=self.top_k_limit, - include_values=False, - include_metadata=False, - filter=filters, - namespace=namespace, - ) - return len(res["matches"]) + if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]: + logger.warning( + f"PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`" + f"but got {policy}. Overwriting duplicates is enabled by default." + ) - def get_document_count( - self, - filters: Dict[str, Any] = None, - index: Optional[str] = None, - only_documents_without_embedding: bool = False, - headers: Optional[Dict[str, str]] = None, - namespace: Optional[str] = None, - type_metadata: Optional[DocTypeMetadata] = None, - ) -> int: - """ - Return the count of documents in the document store. - - :param filters: Optional filters to narrow down the documents which will be counted. - Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical - operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, - `"$gte"`, `"$lt"`, `"$lte"`), or a metadata field name. - Logical operator keys take a dictionary of metadata field names or logical operators as - value. Metadata field names take a dictionary of comparison operators as value. Comparison - operator keys take a single value or (in case of `"$in"`) a list of values as value. - If no logical operator is provided, `"$and"` is used as default operation. If no comparison - operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default - operation. - __Example__: - - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - ``` - :param index: Optional index name to use for the query. If not provided, the default index name is used. - :param only_documents_without_embedding: If set to `True`, only documents without embeddings are counted. - :param headers: PineconeDocumentStore does not support headers. - :param namespace: Optional namespace to count documents from. If not specified, None is default. - :param type_metadata: Optional value for `doc_type` metadata to reference documents that need to be counted. - Parameter options: - - `"vector"`: Documents with embedding. - - `"no-vector"`: Documents without embedding (dummy embedding only). - - `"label"`: Labels. - """ - if headers: - raise NotImplementedError("PineconeDocumentStore does not support headers.") - - index = self._index(index) - self._index_connection_exists(index) - - filters = filters or {} - if not type_metadata: - # add filter for `doc_type` metadata related to documents without embeddings - filters = self._add_type_metadata_filter(filters, type_value=DOCUMENT_WITHOUT_EMBEDDING) # type: ignore - if not only_documents_without_embedding: - # add filter for `doc_type` metadata related to documents with embeddings - filters = self._add_type_metadata_filter(filters, type_value=DOCUMENT_WITH_EMBEDDING) # type: ignore - else: - # if value for `doc_type` metadata is specified, add filter with given value - filters = self._add_type_metadata_filter(filters, type_value=type_metadata) - - pinecone_syntax_filter = LogicalFilterClause.parse(filters).convert_to_pinecone() if filters else None - return self._get_vector_count(index, filters=pinecone_syntax_filter, namespace=namespace) - - def get_embedding_count( - self, - filters: Optional[Dict[str, Any]] = None, - index: Optional[str] = None, - namespace: Optional[str] = None, - ) -> int: - """ - Return the count of embeddings in the document store. + documents_for_pinecone = [] + for document in documents: + if document.embedding is None: + logger.warning( + f"Document {document.id} has no embedding. Pinecone is a purely vector database. " + "A dummy embedding will be used, but this can affect the search results. 
" + ) + document.embedding = self._dummy_vector + doc_for_pinecone = {"id": document.id, "values": document.embedding, "metadata": document.meta} + + # we save content/dataframe as metadata + # currently, storing blob in Pinecone is not supported + if document.content is not None: + doc_for_pinecone["metadata"]["content"] = document.content + if document.dataframe is not None: + doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json() + if document.blob is not None: + logger.warning( + f"Document {document.id} has a blob. Currently, storing blob in Pinecone is not supported. " + "The blob will be ignored." + ) - :param index: Optional index name to retrieve all documents from. - :param filters: Filters are not supported for `get_embedding_count` in Pinecone. - :param namespace: Optional namespace to count embeddings from. If not specified, None is default. - """ - if filters: - raise NotImplementedError("Filters are not supported for get_embedding_count in PineconeDocumentStore") + documents_for_pinecone.append(doc_for_pinecone) - index = self._index(index) - self._index_connection_exists(index) + result = self._index.upsert( + vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size + ) - pinecone_filters = self._meta_for_pinecone({TYPE_METADATA_FIELD: DOCUMENT_WITH_EMBEDDING}) - return self._get_vector_count(index, filters=pinecone_filters, namespace=namespace) + written_docs = result["upserted_count"] + return written_docs - def _meta_for_pinecone(self, meta: Dict[str, Any], parent_key: str = "", labels: bool = False) -> Dict[str, Any]: - """ - Converts the meta dictionary to a format that can be stored in Pinecone. - :param meta: Metadata dictionary to be converted. - :param parent_key: Optional, used for recursive calls to keep track of parent keys, for example: - ``` - {"parent1": {"parent2": {"child": "value"}}} - ``` - On the second recursive call, parent_key would be "parent1", and the final key would be "parent1.parent2.child". - :param labels: Optional, used to indicate whether the metadata is being stored as a label or not. If True the - the flattening of dictionaries is not required. - """ - items: list = [] - if labels: - # Replace any None values with empty strings - for key, value in meta.items(): - if value is None: - meta[key] = "" - else: - # Explode dict of dicts into single flattened dict - for key, value in meta.items(): - # Replace any None values with empty strings - if value is None: - value = "" - if key == "_split_overlap": - value = json.dumps(value) - # format key - new_key = f"{parent_key}.{key}" if parent_key else key - # if value is dict, expand - if isinstance(value, dict): - items.extend(self._meta_for_pinecone(value, parent_key=new_key).items()) - else: - items.append((new_key, value)) - # Create new flattened dictionary - meta = dict(items) - return meta - - def _pinecone_meta_format(self, meta: Dict[str, Any], labels: bool = False) -> Dict[str, Any]: - """ - Converts the meta extracted from Pinecone into a better format for Python. - :param meta: Metadata dictionary to be converted. - :param labels: Optional, used to indicate whether the metadata is being stored as a label or not. If True the - the flattening of dictionaries is not required. 
- """ - new_meta: Dict[str, Any] = {} - - if labels: - # Replace any empty strings with None values - for key, value in meta.items(): - if value == "": - meta[key] = None - return meta - else: - for key, value in meta.items(): - # Replace any empty strings with None values - if value == "": - value = None - if "." in key: - # We must split into nested dictionary - keys = key.split(".") - # Iterate through each dictionary level - for i in range(len(keys)): - path = keys[: i + 1] - # Check if path exists - try: - _get_by_path(new_meta, path) - except KeyError: - # Create path - if i == len(keys) - 1: - _set_by_path(new_meta, path, value) - else: - _set_by_path(new_meta, path, {}) - else: - new_meta[key] = value - return new_meta - - def _validate_index_sync(self, index: Optional[str] = None): - """ - This check ensures the correct number of documents with embeddings and embeddings are found in the - Pinecone database. - """ - if self.get_document_count( - index=index, type_metadata=DOCUMENT_WITH_EMBEDDING # type: ignore - ) != self.get_embedding_count(index=index): - raise PineconeDocumentStoreError( - f"The number of documents present in Pinecone ({self.get_document_count(index=index)}) " - "does not match the number of embeddings in Pinecone " - f" ({self.get_embedding_count(index=index)}). This can happen if a document store " - "instance is deleted during write operations. Call " - "the `update_documents` method to fix it." - ) + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + if not filters: + # in this case, we try to return all documents but Pinecone has some limits + documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT) + + total_docs_number = self.count_documents() + if total_docs_number > TOP_K_LIMIT: + logger.warning( + f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. " + f"However, there are {total_docs_number} documents in the namespace. " + ) + return documents - def count_documents(self) -> int: - """ - Returns how many documents are present in the document store. - """ - count = self.index_stats["namespaces"][""]["vector_count"] if self.index_stats["namespaces"].get("") else 0 - return count + return [] - def write_documents( - self, - documents: List[Document], - policy: DuplicatePolicy = "fail", - ) -> None: - """ - Writes (or overwrites) documents into the store. - - :param documents: a list of documents. - :param policy: documents with the same ID count as duplicates. When duplicates are met, - the store can: - - skip: keep the existing document and ignore the new one. - - overwrite: remove the old document and write the new one. - - fail: an error is raised - :raises DuplicateDocumentError: Exception trigger on duplicate document if `policy=DuplicatePolicy.FAIL` - :return: None + def delete_documents(self, document_ids: List[str]) -> None: """ - if not isinstance(documents, list): - msg = "Documents must be a list" - raise ValueError(msg) + Deletes all documents with a matching document_ids from the document store. 
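The write, filter, and delete operations above compose into a simple lifecycle; a hedged sketch, reusing the `document_store` from the earlier constructor example and fabricated 768-dimensional embeddings:

```python
from haystack.dataclasses import Document

# Embeddings normally come from an embedder component; constants are used here
# purely for illustration. The second document has no embedding at all, so it
# triggers the dummy-embedding warning and is stored with the dummy vector.
docs = [
    Document(content="First document", embedding=[0.1] * 768),
    Document(content="Second document"),
]
written = document_store.write_documents(docs)  # returns the upserted count

# With no filters, filter_documents() falls back to a dummy-vector query,
# so at most TOP_K_LIMIT (1,000) documents come back in a single call.
all_docs = document_store.filter_documents()
print(f"retrieved {len(all_docs)} of {document_store.count_documents()} stored documents")

# Deletion is id-based; the ids here come from the documents just fetched.
document_store.delete_documents(document_ids=[doc.id for doc in all_docs])
```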
- index = self._index(self.index) - index_connection = self._index_connection_exists(index, create=True) - if index_connection: - self.pinecone_indexes[index] = index_connection - - duplicate_documents = policy or self.duplicate_documents - policy_options = ["skip", "overwrite", "fail"] - assert ( - duplicate_documents in policy_options - ), f"duplicate_documents parameter must be {', '.join(policy_options)}" - - add_vectors = documents[0].embedding is not None - type_metadata = DOCUMENT_WITH_EMBEDDING if add_vectors else DOCUMENT_WITHOUT_EMBEDDING - - if not add_vectors: - # To store documents in Pinecone, we use dummy embeddings (to be replaced with real embeddings later) - embeddings_to_index = np.zeros((self.batch_size, self.embedding_dim), dtype="float32") - # Convert embeddings to list objects - embeddings = [embed.tolist() if embed is not None else None for embed in embeddings_to_index] - - with tqdm( - total=len(documents), - disable=not self.progress_bar, - position=0, - desc="Writing Documents", - ) as progress_bar: - for i in range(0, len(documents), self.batch_size): - document_batch = documents[i : i + self.batch_size] - ids = [doc.id for doc in document_batch] - # If duplicate_documents set to `skip` or `fail`, we need to check for existing documents - if duplicate_documents in ["skip", "fail"]: - existing_documents = self.get_documents_by_id( - ids=ids, - index=index, - namespace=self.namespace, - include_type_metadata=True, - ) - # First check for documents in current batch that exist in the index - if existing_documents: - if duplicate_documents == "skip": - # If we should skip existing documents, we drop the ids that already exist - skip_ids = [doc.id for doc in existing_documents] - # We need to drop the affected document objects from the batch - document_batch = [doc for doc in document_batch if doc.id not in skip_ids] - # Now rebuild the ID list - ids = [doc.id for doc in document_batch] - progress_bar.update(len(skip_ids)) - elif duplicate_documents == "fail": - # Otherwise, we raise an error - raise DuplicateDocumentError( - f"Document ID {existing_documents[0].id} already exists in index {index}" - ) - # Now check for duplicate documents within the batch itself - if len(ids) != len(set(ids)): - if duplicate_documents == "skip": - # We just keep the first instance of each duplicate document - ids = [] - temp_document_batch = [] - for doc in document_batch: - if doc.id not in ids: - ids.append(doc.id) - temp_document_batch.append(doc) - document_batch = temp_document_batch - elif duplicate_documents == "fail": - # Otherwise, we raise an error - raise DuplicateDocumentError(f"Duplicate document IDs found in batch: {ids}") - metadata = [ - self._meta_for_pinecone( - { - TYPE_METADATA_FIELD: type_metadata, # add `doc_type` in metadata - "text": doc.content, - "content_type": doc.meta, - } - ) - for doc in documents[i : i + self.batch_size] - ] - if add_vectors: - embeddings = [doc.embedding for doc in documents[i : i + self.batch_size]] - embeddings_to_index = np.array(embeddings, dtype="float32") - - # Convert embeddings to list objects - embeddings = [embed.tolist() if embed is not None else None for embed in embeddings_to_index] - data_to_write_to_pinecone = zip(ids, embeddings, metadata) - # Metadata fields and embeddings are stored in Pinecone - self.pinecone_indexes[index].upsert(vectors=data_to_write_to_pinecone, namespace=self.namespace) - # Add IDs to ID list - self._add_local_ids(index, ids) - progress_bar.update(self.batch_size) - progress_bar.close() - - def 
_limit_check(self, top_k: int, include_values: Optional[bool] = None):
         """
-        Confirms the top_k value does not exceed Pinecone vector database limits.
-        """
-        if include_values:
-            if top_k > self.top_k_limit_vectors:
-                raise PineconeDocumentStoreError(
-                    f"PineconeDocumentStore allows requests of no more than {self.top_k_limit_vectors} records "
-                    f"when returning embedding values. This request is attempting to return {top_k} records."
-                )
-        else:
-            if top_k > self.top_k_limit:
-                raise PineconeDocumentStoreError(
-                    f"PineconeDocumentStore allows requests of no more than {self.top_k_limit} records. "
-                    f"This request is attempting to return {top_k} records."
-                )
+        self._index.delete(ids=document_ids, namespace=self.namespace)

-    def query_by_embedding(
+    def _embedding_retrieval(
         self,
         query_embedding: List[float],
-        filters: Optional[Dict[str, Any]] = None,
+        *,
+        filters: Optional[Dict[str, Any]] = None,  # noqa: ARG002
         top_k: int = 10,
-        scale_score: bool = True,
-        return_embedding: Optional[bool] = None,
     ) -> List[Document]:
         """
-        Find the document that is most similar to the provided `query_embedding` by using a vector similarity metric.
-
-        :param query_embedding: Embedding of the query.
-        :param filters: A dictionary with filters to narrow down the search space.
-        :param top_k: The maximum number of documents to return.
-        :param scale_score: Whether to scale the scores of the retrieved documents or not.
-        :param return_embedding: Whether to return the embedding of the retrieved Documents.
-        :return: The retrieved documents.
-        """
-        if return_embedding is None:
-            return_embedding = self.return_embedding
+        Retrieves documents that are most similar to the query embedding using a vector similarity metric.

-        self._limit_check(top_k, include_values=return_embedding)
+        This method is not meant to be part of the public interface of
+        `PineconeDocumentStore`, nor to be called directly.
+        `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.

-        index = self._index(self.index)
-        self._index_connection_exists(index)
-
-        type_metadata = DOCUMENT_WITH_EMBEDDING  # type: ignore
+        :param query_embedding: Embedding of the query.
+        :param filters: Filters applied to the retrieved Documents. Defaults to None.
+ :param top_k: Maximum number of Documents to return, defaults to 10 - filters = filters or {} - filters = self._add_type_metadata_filter(filters, type_metadata) + :return: List of Document that are most similar to `query_embedding` + """ - pinecone_syntax_filter = LogicalFilterClause.parse(filters).convert_to_pinecone() if filters else None + if not query_embedding: + msg = "query_embedding must be a non-empty list of floats" + raise ValueError(msg) - res = self.pinecone_indexes[index].query( - query_embedding, - namespace=self.namespace, + result = self._index.query( + vector=query_embedding, top_k=top_k, - include_values=return_embedding, + namespace=self.namespace, + include_values=True, include_metadata=True, - filter=pinecone_syntax_filter, - ) - - score_matrix = [] - vector_id_matrix = [] - meta_matrix = [] - embedding_matrix = [] - for match in res["matches"]: - score_matrix.append(match["score"]) - vector_id_matrix.append(match["id"]) - meta_matrix.append(match["metadata"]) - if return_embedding: - embedding_matrix.append(match["values"]) - if return_embedding: - values = embedding_matrix - else: - values = None - documents = self._get_documents_by_meta( - vector_id_matrix, - meta_matrix, - values=values, - index=index, - return_embedding=return_embedding, - ) - - # assign query score to each document - scores_for_vector_ids: Dict[str, float] = {str(v_id): s for v_id, s in zip(vector_id_matrix, score_matrix)} - return_documents = [] - for doc in documents: - score = scores_for_vector_ids[doc.id] - if scale_score: - if self.similarity == "cosine": - score = (score + 1) / 2 - else: - score = float(1 / (1 + np.exp(-score / 100))) - doc.score = score - return_document = copy.copy(doc) - return_documents.append(return_document) - - return return_documents - - def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: - """ - Returns the documents that match the filters provided. - - Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical operator (`"$and"`, - `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. - - Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata - field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or - (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default - operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used - as default operation. - - Example: - - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" - } - } - ``` - - To use the same logical operator multiple times on the same level, logical operators can take a list of - dictionaries as value. 
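In normal use this retrieval path is driven by the retriever component; a sketch of calling it directly, with a placeholder query vector standing in for a real embedding:

```python
# A real query embedding would come from a text embedder with dimension 768.
query_embedding = [0.05] * 768

# Internal entry point used by the retriever component.
results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=5)
for doc in results:
    print(round(doc.score, 3), doc.content)  # score is copied from the Pinecone match
```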
- - Example: - - ```python - filters = { - "$or": [ - { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } - }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] - } - ``` - - :param filters: the filters to apply to the document list. - :return: a list of Documents that match the given filters. - """ - docs = self.query_by_embedding( - query_embedding=self.dummy_query, - filters=filters, - top_k=10, - scale_score=True, - return_embedding=True, ) - return docs - - def _attach_embedding_to_document(self, document: Document, index: str): - """ - Fetches the Document's embedding from the specified Pinecone index and attaches it to the Document's - embedding field. - """ - result = self.pinecone_indexes[index].fetch(ids=[document.id]) - if result["vectors"].get(document.id, False): - embedding = result["vectors"][document.id].get("values", None) - document.embedding = np.asarray(embedding, dtype=np.float32) - - def _get_documents_by_meta( - self, - ids: List[str], - metadata: List[dict], - values: Optional[List[List[float]]] = None, - index: Optional[str] = None, - return_embedding: Optional[bool] = None, - ) -> List[Document]: - if return_embedding is None: - return_embedding = self.return_embedding - - index = self._index(index) - - # extract ID, content, and metadata to create Documents - documents = [] - for _id, meta in zip(ids, metadata): - content = meta.pop("content") - content_type = meta.pop("content_type") - if "_split_overlap" in meta: - meta["_split_overlap"] = json.loads(meta["_split_overlap"]) - doc = Document(id=_id, content=content, content_type=content_type, meta=meta) - documents.append(doc) - if return_embedding: - if values is None: - # If no embedding values are provided, we must request the embeddings from Pinecone - for doc in documents: - self._attach_embedding_to_document(document=doc, index=index) - else: - # If embedding values are given, we just add - for doc, embedding in zip(documents, values): - doc.embedding = np.asarray(embedding, dtype=np.float32) - - return documents - - def get_documents_by_id( - self, - ids: List[str], - index: Optional[str] = None, - batch_size: int = 100, - return_embedding: Optional[bool] = None, - namespace: Optional[str] = None, - include_type_metadata: Optional[bool] = False, - ) -> List[Document]: - """ - Retrieves all documents in the index using their IDs. - - :param ids: List of IDs to retrieve. - :param index: Optional index name to retrieve all documents from. - :param batch_size: Number of documents to retrieve at a time. When working with large number of documents, - batching can help reduce memory footprint. - :param headers: Pinecone does not support headers. - :param return_embedding: Optional flag to return the embedding of the document. - :param namespace: Optional namespace to retrieve document from. If not specified, None is default. - :param include_type_metadata: Indicates if `doc_type` value will be included in document metadata or not. - If not specified, `doc_type` field will be dropped from document metadata. 
- """ - - if return_embedding is None: - return_embedding = self.return_embedding - - index = self._index(index) - self._index_connection_exists(index) + return self._convert_query_result_to_documents(result) + def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> List[Document]: + pinecone_docs = query_result["matches"] documents = [] - for i in range(0, len(ids), batch_size): - i_end = min(len(ids), i + batch_size) - id_batch = ids[i:i_end] - result = self.pinecone_indexes[index].fetch(ids=id_batch, namespace=namespace) - - vector_id_matrix = [] - meta_matrix = [] - embedding_matrix = [] - for _id in result["vectors"]: - vector_id_matrix.append(_id) - metadata = result["vectors"][_id]["metadata"] - if not include_type_metadata and TYPE_METADATA_FIELD in metadata: - metadata.pop(TYPE_METADATA_FIELD) - meta_matrix.append(self._pinecone_meta_format(metadata)) - if return_embedding: - embedding_matrix.append(result["vectors"][_id]["values"]) - if return_embedding: - values = embedding_matrix - else: - values = None - document_batch = self._get_documents_by_meta( - vector_id_matrix, - meta_matrix, - values=values, - index=index, - return_embedding=return_embedding, + for pinecone_doc in pinecone_docs: + content = pinecone_doc["metadata"].pop("content", None) + + dataframe = None + dataframe_string = pinecone_doc["metadata"].pop("dataframe", None) + if dataframe_string: + dataframe = pd.read_json(io.StringIO(dataframe_string)) + + doc = Document( + id=pinecone_doc["id"], + content=content, + dataframe=dataframe, + meta=pinecone_doc["metadata"], + embedding=pinecone_doc["values"], + score=pinecone_doc["score"], ) - documents.extend(document_batch) + documents.append(doc) return documents - - def delete_documents(self, document_ids: List[str]) -> None: - """ - Deletes all documents with a matching document_ids from the document store. - Fails with `MissingDocumentError` if no document with this id is present in the store. - - :param document_ids: the document_ids to delete - """ - for doc_id in document_ids: - msg = f"ID '{doc_id}' not found, cannot delete it." - document_ids.remove(doc_id) - raise MissingDocumentError(msg) - - index = self._index(self.index) - self._index_connection_exists(index) - - if index not in self.all_ids: - self.all_ids[index] = set() - if document_ids is None: - # If no IDs we delete everything - self.pinecone_indexes[index].delete(delete_all=True, namespace=self.namespace) - id_values = list(self.all_ids[index]) - else: - id_values = document_ids - self.pinecone_indexes[index].delete(ids=document_ids, namespace=self.namespace) - - # Remove deleted ids from all_ids - self.all_ids[index] = self.all_ids[index].difference(set(id_values)) - - def delete_index(self, index: Optional[str]): - """ - Delete an existing index. The index including all data will be removed. - - :param index: The name of the index to delete. 
- :return: None - """ - index = self._index(index) - - if index in pinecone.list_indexes(): - pinecone.delete_index(index) - logger.info("Index '%s' deleted.", index) - if index in self.pinecone_indexes: - del self.pinecone_indexes[index] - if index in self.all_ids: - self.all_ids[index] = set() diff --git a/integrations/pinecone/src/pinecone_haystack/errors.py b/integrations/pinecone/src/pinecone_haystack/errors.py index 08d71b35a..994f34cf0 100644 --- a/integrations/pinecone/src/pinecone_haystack/errors.py +++ b/integrations/pinecone/src/pinecone_haystack/errors.py @@ -1,5 +1,5 @@ -from haystack.preview.document_stores.errors import DocumentStoreError -from haystack.preview.errors import FilterError +from haystack.document_stores.errors import DocumentStoreError +from haystack.errors import FilterError class PineconeDocumentStoreError(DocumentStoreError): diff --git a/integrations/pinecone/src/pinecone_haystack/filter_utils.py b/integrations/pinecone/src/pinecone_haystack/filter_utils.py deleted file mode 100644 index cbf256210..000000000 --- a/integrations/pinecone/src/pinecone_haystack/filter_utils.py +++ /dev/null @@ -1,432 +0,0 @@ -import logging -from abc import ABC, abstractmethod -from collections import defaultdict -from typing import Dict, List, Union - -from haystack.preview.errors import FilterError - -from pinecone_haystack.errors import PineconeDocumentStoreFilterError - -logger = logging.getLogger(__file__) - - -def nested_defaultdict() -> defaultdict: - """ - Data structure that recursively adds a dictionary as value if a key does not exist. Advantage: In nested dictionary - structures, we don't need to check if a key already exists (which can become hard to maintain in nested dictionaries - with many levels) but access the existing value if a key exists and create an empty dictionary if a key does not - exist. - """ - return defaultdict(nested_defaultdict) - - -class LogicalFilterClause(ABC): - """ - Class that is able to parse a filter and convert it to the format that the underlying databases of our - DocumentStores require. - - Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical - operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. - Logical operator keys take a dictionary of metadata field names and/or logical operators as - value. Metadata field names take a dictionary of comparison operators as value. Comparison - operator keys take a single value or (in case of `"$in"`) a list of values as value. - If no logical operator is provided, `"$and"` is used as default operation. If no comparison - operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default - operation. - Example: - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" - } - } - ``` - - To use the same logical operator multiple times on the same level, logical operators take optionally a list of - dictionaries as value. 
- - Example: - ```python - filters = { - "$or": [ - { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } - }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] - } - ``` - - """ - - def __init__(self, conditions: List[Union["LogicalFilterClause", "ComparisonOperation"]]): - self.conditions = conditions - - @abstractmethod - def evaluate(self, fields) -> bool: - pass - - @classmethod - def parse(cls, filter_term: Union[dict, List[dict]]) -> Union["LogicalFilterClause", "ComparisonOperation"]: - """ - Parses a filter dictionary/list and returns a LogicalFilterClause instance. - - :param filter_term: Dictionary or list that contains the filter definition. - """ - conditions: List[Union[LogicalFilterClause, ComparisonOperation]] = [] - - if isinstance(filter_term, dict): - filter_term = [filter_term] - for item in filter_term: - for key, value in item.items(): - if key == "$not": - conditions.append(NotOperation.parse(value)) - elif key == "$and": - conditions.append(AndOperation.parse(value)) - elif key == "$or": - conditions.append(OrOperation.parse(value)) - # Key needs to be a metadata field - else: - conditions.extend(ComparisonOperation.parse(key, value)) - - if cls == LogicalFilterClause: - if len(conditions) == 1: - return conditions[0] - else: - return AndOperation(conditions) - else: - return cls(conditions) - - def convert_to_pinecone(self): - """ - Converts the LogicalFilterClause instance to a Pinecone filter. - """ - pass - - -class ComparisonOperation(ABC): - def __init__(self, field_name: str, comparison_value: Union[str, int, float, bool, List]): - self.field_name = field_name - self.comparison_value = comparison_value - - @abstractmethod - def evaluate(self, fields) -> bool: - pass - - @classmethod - def parse(cls, field_name, comparison_clause: Union[Dict, List, str, float]) -> List["ComparisonOperation"]: - comparison_operations: List[ComparisonOperation] = [] - - if isinstance(comparison_clause, dict): - for comparison_operation, comparison_value in comparison_clause.items(): - if comparison_operation == "$eq": - comparison_operations.append(EqOperation(field_name, comparison_value)) - elif comparison_operation == "$in": - comparison_operations.append(InOperation(field_name, comparison_value)) - elif comparison_operation == "$ne": - comparison_operations.append(NeOperation(field_name, comparison_value)) - elif comparison_operation == "$nin": - comparison_operations.append(NinOperation(field_name, comparison_value)) - elif comparison_operation == "$gt": - comparison_operations.append(GtOperation(field_name, comparison_value)) - elif comparison_operation == "$gte": - comparison_operations.append(GteOperation(field_name, comparison_value)) - elif comparison_operation == "$lt": - comparison_operations.append(LtOperation(field_name, comparison_value)) - elif comparison_operation == "$lte": - comparison_operations.append(LteOperation(field_name, comparison_value)) - - # No comparison operator is given, so we use the default operators "$in" if the comparison value is a list and - # "$eq" in every other case - elif isinstance(comparison_clause, list): - comparison_operations.append(InOperation(field_name, comparison_clause)) - else: - comparison_operations.append(EqOperation(field_name, comparison_clause)) - - return comparison_operations - - def convert_to_pinecone(self): - """ - Converts the ComparisonOperation instance to a Pinecone comparison operator. 
- """ - pass - - def invert(self) -> "ComparisonOperation": - """ - Inverts the ComparisonOperation. - Necessary for Weaviate as Weaviate doesn't seem to support the 'Not' operator anymore. - (https://github.com/semi-technologies/weaviate/issues/1717) - """ - pass - - -class NotOperation(LogicalFilterClause): - """ - Handles conversion of logical 'NOT' operations. - """ - - def evaluate(self, fields) -> bool: - return not any(condition.evaluate(fields) for condition in self.conditions) - - def convert_to_pinecone(self) -> Dict[str, Union[str, int, float, bool, List[Dict]]]: - conditions = [condition.invert().convert_to_pinecone() for condition in self.conditions] - if len(conditions) > 1: - # Conditions in self.conditions are by default combined with AND which becomes OR according to DeMorgan - return {"$or": conditions} - else: - return conditions[0] - - def invert(self) -> Union[LogicalFilterClause, ComparisonOperation]: - # This method is called when a "$not" operation is embedded in another "$not" operation. Therefore, we don't - # invert the operations here, as two "$not" operation annihilate each other. - # (If we have more than one condition, we return an AndOperation, the default logical operation for combining - # multiple conditions.) - if len(self.conditions) > 1: - return AndOperation(self.conditions) - else: - return self.conditions[0] - - -class AndOperation(LogicalFilterClause): - """ - Handles conversion of logical 'AND' operations. - """ - - def evaluate(self, fields) -> bool: - return all(condition.evaluate(fields) for condition in self.conditions) - - def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: - conditions = [condition.convert_to_pinecone() for condition in self.conditions] - return {"$and": conditions} - - def invert(self) -> "OrOperation": - return OrOperation([condition.invert() for condition in self.conditions]) - - -class OrOperation(LogicalFilterClause): - """ - Handles conversion of logical 'OR' operations. - """ - - def evaluate(self, fields) -> bool: - return any(condition.evaluate(fields) for condition in self.conditions) - - def convert_to_pinecone(self) -> Dict[str, Union[str, List[Dict]]]: - conditions = [condition.convert_to_pinecone() for condition in self.conditions] - return {"$or": conditions} - - def invert(self) -> AndOperation: - return AndOperation([condition.invert() for condition in self.conditions]) - - -class EqOperation(ComparisonOperation): - """ - Handles conversion of the '$eq' comparison operation. - """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - return fields[self.field_name] == self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[List[str], str, int, float, bool]]]: - return {self.field_name: {"$eq": self.comparison_value}} - - def invert(self) -> "NeOperation": - return NeOperation(self.field_name, self.comparison_value) - - -class InOperation(ComparisonOperation): - """ - Handles conversion of the '$in' comparison operation. 
- """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - - if not isinstance(self.comparison_value, list): - raise PineconeDocumentStoreFilterError("'$in' operation requires comparison value to be a list.") - - # If the document field is a list, check if any of its values are in the comparison value - if isinstance(fields[self.field_name], list): - return any(field in self.comparison_value for field in fields[self.field_name]) - - return fields[self.field_name] in self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: - if not isinstance(self.comparison_value, list): - raise PineconeDocumentStoreFilterError("'$in' operation requires comparison value to be a list.") - return {self.field_name: {"$in": self.comparison_value}} - - def invert(self) -> "NinOperation": - return NinOperation(self.field_name, self.comparison_value) - - -class NeOperation(ComparisonOperation): - """ - Handles conversion of the '$ne' comparison operation. - """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - return fields[self.field_name] != self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[List[str], str, int, float, bool]]]: - return {self.field_name: {"$ne": self.comparison_value}} - - def invert(self) -> "EqOperation": - return EqOperation(self.field_name, self.comparison_value) - - -class NinOperation(ComparisonOperation): - """ - Handles conversion of the '$nin' comparison operation. - """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return True - - if not isinstance(self.comparison_value, list): - raise PineconeDocumentStoreFilterError("'$nin' operation requires comparison value to be a list.") - - # If the document field is a list, check if any of its values are in the comparison value - if isinstance(fields[self.field_name], list): - return not any(field in self.comparison_value for field in fields[self.field_name]) - - return fields[self.field_name] not in self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, List]]: - if not isinstance(self.comparison_value, list): - raise PineconeDocumentStoreFilterError("'$in' operation requires comparison value to be a list.") - return {self.field_name: {"$nin": self.comparison_value}} - - def invert(self) -> "InOperation": - return InOperation(self.field_name, self.comparison_value) - - -class GtOperation(ComparisonOperation): - """ - Handles conversion of the '$gt' comparison operation. - """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - - # If the document field is a list, check if any of its values are greater than the comparison value - if isinstance(fields[self.field_name], list): - return any(field > self.comparison_value for field in fields[self.field_name]) - - return fields[self.field_name] > self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - if not isinstance(self.comparison_value, (float, int)): - raise PineconeDocumentStoreFilterError("Comparison value for '$gt' operation must be a float or int.") - return {self.field_name: {"$gt": self.comparison_value}} - - def invert(self) -> "LteOperation": - return LteOperation(self.field_name, self.comparison_value) - - -class GteOperation(ComparisonOperation): - """ - Handles conversion of the '$gte' comparison operation. 
- """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - - # If the document field is a list, check if any of its values are greater than or equal to the comparison value - if isinstance(fields[self.field_name], list): - return any(field >= self.comparison_value for field in fields[self.field_name]) - - return fields[self.field_name] >= self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - if not isinstance(self.comparison_value, (float, int)): - raise PineconeDocumentStoreFilterError("Comparison value for '$gte' operation must be a float or int.") - return {self.field_name: {"$gte": self.comparison_value}} - - def invert(self) -> "LtOperation": - return LtOperation(self.field_name, self.comparison_value) - - -class LtOperation(ComparisonOperation): - """ - Handles conversion of the '$lt' comparison operation. - """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - - # If the document field is a list, check if any of its values are less than the comparison value - if isinstance(fields[self.field_name], list): - return any(field < self.comparison_value for field in fields[self.field_name]) - - return fields[self.field_name] < self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - if not isinstance(self.comparison_value, (float, int)): - raise PineconeDocumentStoreFilterError("Comparison value for '$lt' operation must be a float or int.") - return {self.field_name: {"$lt": self.comparison_value}} - - def invert(self) -> "GteOperation": - return GteOperation(self.field_name, self.comparison_value) - - -class LteOperation(ComparisonOperation): - """ - Handles conversion of the '$lte' comparison operation. - """ - - def evaluate(self, fields) -> bool: - if self.field_name not in fields: - return False - - # If the document field is a list, check if any of its values are less than or equal to the comparison value - if isinstance(fields[self.field_name], list): - return any(field <= self.comparison_value for field in fields[self.field_name]) - - return fields[self.field_name] <= self.comparison_value - - def convert_to_pinecone(self) -> Dict[str, Dict[str, Union[float, int]]]: - if not isinstance(self.comparison_value, (float, int)): - raise PineconeDocumentStoreFilterError("Comparison value for '$lte' operation must be a float or int.") - return {self.field_name: {"$lte": self.comparison_value}} - - def invert(self) -> "GtOperation": - return GtOperation(self.field_name, self.comparison_value) diff --git a/integrations/pinecone/src/pinecone_haystack/retriever.py b/integrations/pinecone/src/pinecone_haystack/retriever.py deleted file mode 100644 index a4d89b7c9..000000000 --- a/integrations/pinecone/src/pinecone_haystack/retriever.py +++ /dev/null @@ -1,128 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present John Doe -# -# SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional - -from haystack.preview import ( - DeserializationError, - Document, - component, - default_from_dict, - default_to_dict, -) -from haystack.preview.dataclasses import Document - -from pinecone_haystack.document_store import PineconeDocumentStore - - -@component -class PineconeRetriever: - """ - A component for retrieving documents from an PineconeDocumentStore using a vector similarity metric. - - Needs to be connected to a PineconeDocumentStore to run. 
- """ - - def __init__( - self, - document_store: PineconeDocumentStore, - filters: Optional[Dict[str, Any]] = None, - top_k: int = 10, - scale_score: bool = True, - return_embedding: bool = False, - ): - """ - Create a PineconeRetriever component. - - :param document_store: An instance of PineconeDocumentStore. - :param filters: A dictionary with filters to narrow down the search space. Default is None. - :param top_k: The maximum number of documents to retrieve. Default is 10. - :param scale_score: Whether to scale the scores of the retrieved documents or not. Default is True. - :param return_embedding: Whether to return the embedding of the retrieved Documents. Default is False. - - :raises ValueError: If the specified top_k is not > 0. - """ - if not isinstance(document_store, PineconeDocumentStore): - raise ValueError("document_store must be an instance of PineconeDocumentStore") - - self.document_store = document_store - - if top_k <= 0: - raise ValueError(f"top_k must be > 0, but got {top_k}") - - self.filters = filters - self.top_k = top_k - self.scale_score = scale_score - self.return_embedding = return_embedding - - def to_dict(self) -> Dict[str, Any]: - """ - Serialize this component to a dictionary. - """ - docstore = self.document_store.to_dict() - return default_to_dict( - self, - document_store=docstore, - filters=self.filters, - top_k=self.top_k, - scale_score=self.scale_score, - return_embedding=self.return_embedding, - ) - - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "PineconeRetriever": - """ - Deserialize this component from a dictionary. - """ - init_params = data.get("init_parameters", {}) - if "document_store" not in init_params: - raise DeserializationError("Missing 'document_store' in serialization data") - if "type" not in init_params["document_store"]: - raise DeserializationError("Missing 'type' in document store's serialization data") - if init_params["document_store"]["type"] not in document_store.registry: - raise DeserializationError(f"DocumentStore type '{init_params['document_store']['type']}' not found") - - docstore_class = document_store.registry[init_params["document_store"]["type"]] - docstore = docstore_class.from_dict(init_params["document_store"]) - data["init_parameters"]["document_store"] = docstore - return default_from_dict(cls, data) - - @component.output_types(documents=List[Document]) - def run( - self, - query_embedding: List[float], - filters: Optional[Dict[str, Any]] = None, - top_k: Optional[int] = None, - scale_score: Optional[bool] = None, - return_embedding: Optional[bool] = None, - ): - """ - Run the Embedding Retriever on the given input data. - - :param query_embedding: Embedding of the query. - :param filters: A dictionary with filters to narrow down the search space. - :param top_k: The maximum number of documents to return. - :param scale_score: Whether to scale the scores of the retrieved documents or not. - :param return_embedding: Whether to return the embedding of the retrieved Documents. - :return: The retrieved documents. - - :raises ValueError: If the specified DocumentStore is not found or is not a MemoryDocumentStore instance. 
- """ - if filters is None: - filters = self.filters - if top_k is None: - top_k = self.top_k - if scale_score is None: - scale_score = self.scale_score - if return_embedding is None: - return_embedding = self.return_embedding - - docs = self.document_store.query_by_embedding( - query_embedding=query_embedding, - filters=filters, - top_k=top_k, - scale_score=scale_score, - return_embedding=return_embedding, - ) - - return {"documents": docs} From f5c5028cb4d45f3f47272b7ddb02b3ba1e66d3d9 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 19 Dec 2023 17:13:02 +0100 Subject: [PATCH 11/38] simplified tests --- .github/workflows/pinecone.yml | 3 +- .../src/pinecone_haystack/__init__.py | 2 +- .../src/pinecone_haystack/document_store.py | 2 +- integrations/pinecone/tests/__init__.py | 2 +- integrations/pinecone/tests/conftest.py | 46 +++ integrations/pinecone/tests/pinecone_mock.py | 329 ----------------- integrations/pinecone/tests/test_count.py | 7 + integrations/pinecone/tests/test_delete.py | 7 + .../pinecone/tests/test_document_store.py | 20 ++ .../tests/test_pinecone_document_store.py | 335 ------------------ integrations/pinecone/tests/test_retriever.py | 93 ----- integrations/pinecone/tests/test_write.py | 40 +++ 12 files changed, 125 insertions(+), 761 deletions(-) create mode 100644 integrations/pinecone/tests/conftest.py delete mode 100644 integrations/pinecone/tests/pinecone_mock.py create mode 100644 integrations/pinecone/tests/test_count.py create mode 100644 integrations/pinecone/tests/test_delete.py create mode 100644 integrations/pinecone/tests/test_document_store.py delete mode 100644 integrations/pinecone/tests/test_pinecone_document_store.py delete mode 100644 integrations/pinecone/tests/test_retriever.py create mode 100644 integrations/pinecone/tests/test_write.py diff --git a/.github/workflows/pinecone.yml b/.github/workflows/pinecone.yml index 8f91c4a71..d42330849 100644 --- a/.github/workflows/pinecone.yml +++ b/.github/workflows/pinecone.yml @@ -25,8 +25,9 @@ jobs: strategy: fail-fast: false matrix: + # Pinecone tests are time expensive, so the matrix is limited to Python 3.9 and 3.10 os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10"] steps: - uses: actions/checkout@v4 diff --git a/integrations/pinecone/src/pinecone_haystack/__init__.py b/integrations/pinecone/src/pinecone_haystack/__init__.py index dbd6664ea..dbfb60832 100644 --- a/integrations/pinecone/src/pinecone_haystack/__init__.py +++ b/integrations/pinecone/src/pinecone_haystack/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present John Doe +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 from pinecone_haystack.document_store import PineconeDocumentStore diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index ae812e7ec..af295e0c0 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -181,7 +181,7 @@ def _embedding_retrieval( self, query_embedding: List[float], *, - filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 + filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: """ diff --git a/integrations/pinecone/tests/__init__.py b/integrations/pinecone/tests/__init__.py index 7eda7517e..e873bc332 100644 --- a/integrations/pinecone/tests/__init__.py +++ 
b/integrations/pinecone/tests/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present John Doe +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py new file mode 100644 index 000000000..63ec94819 --- /dev/null +++ b/integrations/pinecone/tests/conftest.py @@ -0,0 +1,46 @@ +import time +from random import randint + +import pytest + +from pinecone_haystack.document_store import PineconeDocumentStore + +# This is the approximate time it takes for the documents to be available +SLEEP_TIME = 17 + + +@pytest.fixture() +def sleep_time(): + return SLEEP_TIME + + +@pytest.fixture +def document_store(request): + """ + This is the most basic requirement for the child class: provide + an instance of this document store so the base class can use it. + """ + environment = "gcp-starter" + index = "default" + # Use a different namespace for each test so we can run them in parallel + namespace = f"{request.node.name}-{randint(0, 1000)}" # noqa: S311 Ruff complains about using random numbers for cryptographic purposes + dimension = 10 + + store = PineconeDocumentStore( + environment=environment, + index=index, + namespace=namespace, + dimension=dimension, + ) + + # Override the count_documents method to wait for the documents to be available + original_count_documents = store.count_documents + + def count_documents_sleep(): + time.sleep(SLEEP_TIME) + return original_count_documents() + + store.count_documents = count_documents_sleep + + yield store + store._index.delete(delete_all=True, namespace=namespace) diff --git a/integrations/pinecone/tests/pinecone_mock.py b/integrations/pinecone/tests/pinecone_mock.py deleted file mode 100644 index 215b0e428..000000000 --- a/integrations/pinecone/tests/pinecone_mock.py +++ /dev/null @@ -1,329 +0,0 @@ -import logging -from typing import Any, Dict, List, Optional, Union - -logger = logging.getLogger(__name__) - - -# Mock Pinecone instance -CONFIG: dict = {"api_key": None, "environment": None, "indexes": {}} - - -# Mock Pinecone Index instance -class IndexObject: - def __init__( - self, - index: str, - api_key: Optional[str] = None, - environment: Optional[str] = None, - dimension: Optional[int] = None, - metric: Optional[str] = None, - replicas: Optional[int] = None, - shards: Optional[int] = None, - metadata_config: Optional[dict] = None, - ): - self.index = index - self.api_key = api_key - self.environment = environment - self.dimension = dimension - self.metric = metric - self.replicas = replicas - self.shards = shards - self.metadata_config = metadata_config - self.namespaces: dict = {} - - -# Mock the Pinecone Index class -class Index: - def __init__(self, index: str): - self.index = index - self.index_config = CONFIG["indexes"][index] - - def upsert(self, vectors: List[tuple], namespace: str = ""): - if namespace not in self.index_config.namespaces: - self.index_config.namespaces[namespace] = {} - upsert_count = 0 - for record in vectors: - # Extract info from tuple - _id = record[0] - vector = record[1] - metadata = record[2] - # Checks - assert type(_id) is str - assert type(vector) is list - assert len(vector) == self.index_config.dimension - assert type(metadata) is dict - # Create record (eg document) - new_record: dict = {"id": _id, "values": vector, "metadata": metadata} - self.index_config.namespaces[namespace][_id] = new_record - upsert_count += 1 - return {"upserted_count": upsert_count} - - def 
update(self, namespace: str, id: str, set_metadata: dict): - # Get existing item metadata - meta = self.index_config.namespaces[namespace][id]["metadata"] - # Add new metadata to existing item metadata - self.index_config.namespaces[namespace][id]["metadata"] = {**meta, **set_metadata} - - def describe_index_stats(self, filter=None): - namespaces = {} - for namespace in self.index_config.namespaces.items(): - records = self.index_config.namespaces[namespace[0]] - if filter: - filtered_records = [] - for record in records.values(): - if self._filter(metadata=record["metadata"], filters=filter, top_level=True): - filtered_records.append(record) - records = filtered_records - namespaces[namespace[0]] = {"vector_count": len(records)} - return {"dimension": self.index_config.dimension, "index_fullness": 0.0, "namespaces": namespaces} - - def query( - self, - vector: List[float], - top_k: int, - namespace: str = "", - include_values: bool = False, - include_metadata: bool = False, - filter: Optional[dict] = None, - ): - return self.query_filter( - vector=vector, - top_k=top_k, - namespace=namespace, - include_values=include_values, - include_metadata=include_metadata, - filter=filter, - ) - - def query_filter( - self, - vector: List[float], - top_k: int, - namespace: str = "", - include_values: bool = False, - include_metadata: bool = False, - filter: Optional[dict] = None, - ): - assert len(vector) == self.index_config.dimension - response: dict = {"matches": []} - if namespace not in self.index_config.namespaces: - return response - else: - records = self.index_config.namespaces[namespace] - namespace_ids = list(records.keys())[:top_k] - - for _id in namespace_ids: - match = {"id": _id} - if include_values: - match["values"] = records[_id]["values"].copy() - if include_metadata: - match["metadata"] = records[_id]["metadata"].copy() - match["score"] = 0.0 - - if filter is None or ( - filter is not None and self._filter(records[_id]["metadata"], filter, top_level=True) - ): - # filter if needed - response["matches"].append(match) - return response - - def fetch(self, ids: List[str], namespace: str = ""): - response: dict = {"namespace": namespace, "vectors": {}} - if namespace not in self.index_config.namespaces: - # If we query an empty/non-existent namespace, Pinecone will just return an empty response - logger.warning("No namespace called '%s'", namespace) - return response - records = self.index_config.namespaces[namespace] - namespace_ids = records.keys() - for _id in namespace_ids: - if _id in ids.copy(): - response["vectors"][_id] = { - "id": _id, - "metadata": records[_id]["metadata"].copy(), - "values": records[_id]["values"].copy(), - } - return response - - def _filter( - self, - metadata: dict, - filters: Dict[str, Any] = None, - mode: Optional[str] = "$and", - top_level=False, - ) -> dict: - """ - Mock filtering function - """ - # This function has a very high McCabe cyclomatic complexity score of 38 - # (recommended is 10) and contains 55 branches (recommended is 12). 
- bools = [] - if type(filters) is list: - list_bools = [] - for _filter in filters: - res = self._filter(metadata, _filter, mode=mode) - for key, value in res.items(): - if key == "$and": - list_bools.append(all(value)) - else: - list_bools.append(any(value)) - if mode == "$and": - bools.append(all(list_bools)) - elif mode == "$or": - bools.append(any(list_bools)) - else: - for field, potential_value in filters.items(): - if field in ["$and", "$or"]: - bools.append(self._filter(metadata, potential_value, mode=field)) - mode = field - cond = field - else: - if type(potential_value) is dict: - sub_bool = [] - for cond, value in potential_value.items(): - if len(potential_value.keys()) > 1: - sub_filter = {field: {cond: value}} - bools.append(self._filter(metadata, sub_filter)) - if len(sub_bool) > 1: - if field == "$or": - bools.append(any(sub_bool)) - else: - bools.append(all(sub_bool)) - elif type(potential_value) is list: - cond = "$in" - value = potential_value - else: - cond = "$eq" - value = potential_value - # main chunk of condition checks - if cond == "$eq": - if field in metadata and metadata[field] == value: - bools.append(True) - else: - bools.append(False) - elif cond == "$ne": - if field in metadata and metadata[field] != value: - bools.append(True) - else: - bools.append(False) - elif cond == "$in": - if field in metadata and metadata[field] in value: - bools.append(True) - else: - bools.append(False) - elif cond == "$nin": - if field in metadata and metadata[field] not in value: - bools.append(True) - else: - bools.append(False) - elif cond == "$gt": - if field in metadata and metadata[field] > value: - bools.append(True) - else: - bools.append(False) - elif cond == "$lt": - if field in metadata and metadata[field] < value: - bools.append(True) - else: - bools.append(False) - elif cond == "$gte": - if field in metadata and metadata[field] >= value: - bools.append(True) - else: - bools.append(False) - elif cond == "$lte": - if field in metadata and metadata[field] <= value: - bools.append(True) - else: - bools.append(False) - if top_level: - final = [] - for item in bools: - if type(item) is dict: - for key, value in item.items(): - if key == "$and": - final.append(all(value)) - else: - final.append(any(value)) - else: - final.append(item) - if mode == "$and": - bools = all(final) - else: - bools = any(final) - else: - if mode == "$and": - return {"$and": bools} - else: - return {"$or": bools} - return bools - - def delete( - self, - ids: Optional[List[str]] = None, - namespace: str = "", - filters: Dict[str, Any] = None, - delete_all: bool = False, - ): - if filters: - # Get a filtered list of IDs - matches = self.query(filters=filters, namespace=namespace, include_values=False, include_metadata=False)[ - "vectors" - ] - filter_ids: List[str] = matches.keys() # .keys() returns an object that supports set operators already - elif delete_all: - self.index_config.namespaces[namespace] = {} - - if namespace not in self.index_config.namespaces: - pass - elif ids is not None: - id_list: List[str] = ids - if filters: - # We find the intersect between the IDs and filtered IDs - id_list = set(id_list).intersection(filter_ids) - records = self.index_config.namespaces[namespace] - for _id in list(records.keys()): # list() is needed to be able to del below - if _id in id_list: - del records[_id] - else: - # Delete all - self.index_config.namespaces[namespace] = {} - return {} - - def _get_config(self): - return self.index_config - - -# Mock core Pinecone client functions -def 
init(api_key: Optional[str] = None, environment: Optional[str] = None): - CONFIG["api_key"] = api_key - CONFIG["environment"] = environment - CONFIG["indexes"] = {} - - -def list_indexes(): - return list(CONFIG["indexes"].keys()) - - -def create_index( - name: str, - dimension: int, - metric: str = "cosine", - replicas: int = 1, - shards: int = 1, - metadata_config: Optional[dict] = None, -): - index_object = IndexObject( - api_key=CONFIG["api_key"], - environment=CONFIG["environment"], - index=name, - dimension=dimension, - metric=metric, - replicas=replicas, - shards=shards, - metadata_config=metadata_config, - ) - CONFIG["indexes"][name] = index_object - - -def delete_index(index: str): - del CONFIG["indexes"][index] diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py new file mode 100644 index 000000000..02462d422 --- /dev/null +++ b/integrations/pinecone/tests/test_count.py @@ -0,0 +1,7 @@ +from haystack.testing.document_store import ( + CountDocumentsTest, +) + + +class TestCountDocuments(CountDocumentsTest): + ... diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py new file mode 100644 index 000000000..88b145704 --- /dev/null +++ b/integrations/pinecone/tests/test_delete.py @@ -0,0 +1,7 @@ +from haystack.testing.document_store import ( + DeleteDocumentsTest, +) + + +class TestDeleteDocuments(DeleteDocumentsTest): + ... diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py new file mode 100644 index 000000000..74315aad2 --- /dev/null +++ b/integrations/pinecone/tests/test_document_store.py @@ -0,0 +1,20 @@ +import time + +from haystack import Document + +from pinecone_haystack.document_store import PineconeDocumentStore + + +class TestDocumentStore: + def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time): + docs = [ + Document(content="Most similar document", embedding=[1.0] * 10), + Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 0.8, 0.5, 0.8, 0.8, 0.8, 0.8, 0.5]), + Document(content="Not very similar document", embedding=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]), + ] + document_store.write_documents(docs) + time.sleep(sleep_time) + results = document_store._embedding_retrieval(query_embedding=[0.1] * 10, top_k=2, filters={}) + assert len(results) == 2 + assert results[0].content == "Most similar document" + assert results[1].content == "2nd best document" diff --git a/integrations/pinecone/tests/test_pinecone_document_store.py b/integrations/pinecone/tests/test_pinecone_document_store.py deleted file mode 100644 index 8d0f1b097..000000000 --- a/integrations/pinecone/tests/test_pinecone_document_store.py +++ /dev/null @@ -1,335 +0,0 @@ -import os -from inspect import getmembers, isclass, isfunction -from typing import Any, Dict, List, Union -from unittest.mock import MagicMock - -import numpy as np -import pinecone -import pytest -from haystack.preview.dataclasses import Document -from haystack.preview.testing.document_store import DocumentStoreBaseTests - -from pinecone_haystack.document_store import PineconeDocumentStore -from pinecone_haystack.errors import ( - PineconeDocumentStoreError, - PineconeDocumentStoreFilterError, -) -from tests import pinecone_mock - - -class TestPineconeDocumentStore(DocumentStoreBaseTests): - @pytest.fixture - def ds(self, monkeypatch, request) -> PineconeDocumentStore: - """ - This fixture provides an empty document store and takes 
care of cleaning up after each test - """ - - for fname, function in getmembers(pinecone_mock, isfunction): - monkeypatch.setattr(f"pinecone.{fname}", function, raising=False) - for cname, class_ in getmembers(pinecone_mock, isclass): - monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False) - - return PineconeDocumentStore( - api_key=os.environ.get("PINECONE_API_KEY") or "pinecone-test-key", - embedding_dim=768, - embedding_field="embedding", - index="haystack_tests", - similarity="cosine", - recreate_index=True, - ) - - @pytest.fixture - def doc_store_with_docs(self, ds: PineconeDocumentStore) -> PineconeDocumentStore: - """ - This fixture provides a pre-populated document store and takes care of cleaning up after each test - """ - documents = [ - Document( - content="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", - meta={ - "target": "Lloyds", - "sentiment_score": -0.532, - "format": "headline", - }, - ), - Document( - content="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", - meta={ - "target": "Glencore", - "sentiment_score": 0.037, - "format": "headline", - }, - ), - Document( - content="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", - meta={ - "target": "Shell", - "sentiment_score": -0.345, - "format": "headline", - }, - ), - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - meta={ - "target": "TSLA", - "sentiment_score": 0.318, - "format": "post", - }, - ), - Document( - content="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", - meta={ - "target": "HSBC", - "sentiment_score": 0.154, - "format": "post", - }, - ), - # Without meta - Document( - content="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." - ), - Document( - content="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." - ), - Document( - content="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." 
- ), - ] - ds.write_documents(documents) - return ds - - @pytest.fixture - def mocked_ds(self): - class DSMock(PineconeDocumentStore): - pass - - pinecone.init = MagicMock() - DSMock._create_index = MagicMock() - mocked_ds = DSMock(api_key="MOCK") - - return mocked_ds - - def docs_all_formats(self) -> List[Union[Document, Dict[str, Any]]]: - return [ - # Document object - Document( - content="Lloyds to cut 945 jobs as part of 3-year restructuring plan, Last month we added to our $GILD position and started a new one in $BWLD We see slow, steady, unspectacular growth going forward near term. Lloyds Banking Group's share price lifts amid reports bank is poised to axe hundreds of UK jobs", - meta={ - "target": "Lloyds", - "sentiment_score": -0.532, - "format": "headline", - }, - ), - Document( - content="FTSE 100 drops 2.5 pct on Glencore, metals price fears. Glencore sees Tripoli-based NOC as sole legal seller of Libyan oil. Glencore Studies Possible IPO for Agricultural Trading Business. Glencore chief blames rivals' overproduction for share price fall.", - meta={ - "target": "Glencore", - "sentiment_score": 0.037, - "format": "headline", - }, - ), - Document( - content="Shell's $70 Billion BG Deal Meets Shareholder Skepticism. Shell and BG Shareholders to Vote on Deal at End of January. EU drops Shell, BP, Statoil from ethanol benchmark investigation. Shell challenges Exxon dominance with 47 billion-pound bid for BG", - meta={ - "target": "Shell", - "sentiment_score": -0.345, - "format": "headline", - }, - ), - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - meta={ - "target": "TSLA", - "sentiment_score": 0.318, - "format": "post", - }, - ), - Document( - content="HSBC appoints business leaders to board. HSBC Says Unit to Book $585 Million Charge on Settlement. HSBC Hit by Fresh Details of Tax Evasion Claims. HSBC Hit by Fresh Details of Tax Evasion Claims. Goldman Sachs, Barclays, HSBC downplay Brexit threat.", - meta={ - "target": "HSBC", - "sentiment_score": 0.154, - "format": "post", - }, - ), - # Without meta - Document( - content="Aspen to Buy Anaesthetics From AstraZeneca for $520 Million. AstraZeneca wins FDA approval for key new lung cancer pill. AstraZeneca boosts respiratory unit with $575 mln Takeda deal. AstraZeneca Acquires ZS Pharma in $2.7 Billion Deal." - ), - Document( - content="Anheuser-Busch InBev Increases Offer for Rival SABMiller. Australia clears AB Inbev's $100 billion SABMiller buyout plan.Australia clears AB Inbev's $100 billion SABMiller buyout plan." - ), - Document( - content="The Coca-Cola Company and Coca-Cola FEMSA to Acquire AdeS Soy-Based Beverage Business From Unilever." 
- ), - ] - - @pytest.mark.integration - def test_ne_filters(self, ds, documents): - ds.write_documents(documents) - - result = ds.get_filter_documents(filters={"format": {"$ne": "headline"}}) - assert len(result) == 2 - - @pytest.mark.integration - def test_filter_documents_with_extended_filter_eq(self, doc_store_with_docs: PineconeDocumentStore): - eq_docs = doc_store_with_docs.filter_documents(filters={"type": {"$eq": "article"}}) - normal_docs = doc_store_with_docs.filter_documents(filters={"type": "article"}) - assert eq_docs == normal_docs - - @pytest.mark.integration - def test_filter_documents_ids_extended_filter_ne(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"target": {"$ne": "Glencore"}}) - assert all(d.meta.get("metadata", None) != "Glencore" for d in retrieved_docs) - - @pytest.mark.integration - def test_filter_documents_extended_filter_nin(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"format": {"$nin": ["target", "post"]}}) - assert {"target", "post"}.isdisjoint({d.meta.get("metadata", None) for d in retrieved_docs}) - - @pytest.mark.integration - def test_filter_documents_extended_filter_gt(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gt": 3.0}}) - assert all(d.meta["sentiment_score"] > 3.0 for d in retrieved_docs) - - @pytest.mark.integration - def test_filter_documents_extended_filter_gte(self, doc_store_with_docs: PineconeDocumentStore): - retrieved_docs = doc_store_with_docs.filter_documents(filters={"sentiment_score": {"$gte": 3.0}}) - assert all(d.meta["sentiment_score"] >= 3.0 for d in retrieved_docs) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_and_other_field_simplified( - self, doc_store_with_docs: PineconeDocumentStore - ): - filters_simplified = { - "sentiment_score": {"$lte": 0.2, "$gte": 0.4}, - "target": ["Shell", "Glencore", "HSBC", "Lloyds", "TSLA"], - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters_simplified) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_and_or_explicit( - self, doc_store_with_docs: PineconeDocumentStore - ): - filters = { - "$and": { - "sentiment_score": {"$lte": 0.2, "$gte": 0.3}, - "target": { - "name": {"$in": ["HSBC", "Lloyds"]}, - "sentiment_score": {"$lte": 5.0}, - }, - } - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - @pytest.mark.integration - def test_filter_documents_extended_filter_and_or_simplified(self, doc_store_with_docs: PineconeDocumentStore): - filters_simplified = { - "sentiment_score": {"$lte": 0.2, "$gte": 0.3}, - "$or": {"format": ["headline", "post"], "sentiment_score": {"0.318"}}, - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters_simplified) - - @pytest.mark.integration - def test_filter_documents_extended_filter_and_or_and_not_explicit(self, doc_store_with_docs: PineconeDocumentStore): - filters = { - "$and": { - "sentiment_score": {"$gte": 0.037}, - 
"$or": { - "target": {"$in": ["LLyods", "Glencore", "HSBC", "TSLA", "Shell"]}, - "$and": {"format": {"$in": ["headline", "post"]}}, - }, - } - } - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - @pytest.mark.integration - def test_filter_documents_extended_filter_and_or_and_not_simplified( - self, doc_store_with_docs: PineconeDocumentStore - ): - filters_simplified = { - "sentiment_score": {"$lte": "0.037"}, - "$or": { - "target": ["LLyods", "Glencore"], - "$and": {"format": {"$lte": "headline"}, "$not": {"format": "post"}}, - }, - } - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters_simplified) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_nested_not(self, doc_store_with_docs: PineconeDocumentStore): - # Test nested logical operations within "$not". - filters = { - "$not": { - "$or": { - "$and": {"target": {"Lloyds"}}, - "$not": {"format": {"healdine"}}, - } - } - } - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]t' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - @pytest.mark.integration - def test_filter_documents_extended_filter_compound_same_level_not(self, doc_store_with_docs: PineconeDocumentStore): - # Test same logical operator twice on the same level. - filters = { - "$or": [ - { - "$and": { - "target": ["LLyods", "Glencore", "TSLA", "Shell"], - "format": {"$in": ["post"]}, - } - }, - { - "$and": { - "target": ["LLyods", "Glencore", "HSBC", "TSLA", "Shell"], - "format": {"$in": ["headline"]}, - } - }, - ] - } - - with pytest.raises( - PineconeDocumentStoreFilterError, - match=r"Comparison value for '\$[l|g]te' operation must be a float or int.", - ): - doc_store_with_docs.filter_documents(filters=filters) - - def test_get_embedding_count(self, doc_store_with_docs: PineconeDocumentStore): - """ - We expect 1 doc with an embeddings because all documents in already written in doc_store_with_docs contain no - embeddings. 
- """ - doc = Document( - content="Doc with embedding", - embedding=np.random.rand(768).astype(np.float32), - ) - doc_store_with_docs.write_documents([doc]) - assert doc_store_with_docs.get_embedding_count() == 1 diff --git a/integrations/pinecone/tests/test_retriever.py b/integrations/pinecone/tests/test_retriever.py deleted file mode 100644 index 4f21d357a..000000000 --- a/integrations/pinecone/tests/test_retriever.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -from inspect import getmembers, isclass, isfunction -from typing import Any, Dict, List, Union -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pinecone -import pytest -from haystack.preview import ( - DeserializationError, - Document, - component, - default_from_dict, - default_to_dict, -) -from haystack.preview.dataclasses import Document - -from pinecone_haystack.document_store import PineconeDocumentStore -from pinecone_haystack.retriever import PineconeRetriever -from tests import pinecone_mock - - -class TestPineconeRetriever: - @pytest.mark.unit - def test_init(self): - mock_store = Mock(spec=PineconeDocumentStore) - retriever = PineconeRetriever(document_store=mock_store) - assert retriever.document_store == mock_store - assert retriever.filters == None - assert retriever.top_k == 10 - assert retriever.scale_score == True - assert retriever.return_embedding == False - - @pytest.mark.unit - def test_run(self): - mock_store = Mock(spec=PineconeDocumentStore) - mock_store.query_by_embedding.return_value = [ - Document( - content="$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in.", - meta={ - "target": "TSLA", - "sentiment_score": 0.318, - "format": "post", - }, - ) - ] - - retriever = PineconeRetriever(document_store=mock_store) - results = retriever.run(["How many cars is TSLA recalling?"]) - - assert len(results["documents"]) == 1 - assert ( - results["documents"][0].content - == "$TSLA lots of green on the 5 min, watch the hourly $259.33 possible resistance currently @ $257.00.Tesla is recalling 2,700 Model X cars.Hard to find new buyers of $TSLA at 250. Shorts continue to pile in." - ) - - @pytest.mark.integration - def test_to_dict(self): - document_store = PineconeDocumentStore("pinecone-test-key") - retriever = PineconeRetriever(document_store=document_store) - doc_dict = retriever.to_dict() - assert doc_dict == { - "init_parameters": { - "document_store": "test_document_store", - "filters": None, - "top_k": 10, - "scale_score": "True", - "return_embedding": False, - } - } - - @pytest.mark.integration - def test_from_dict(self): - """ - Test deserialization of this component from a dictionary, using default initialization parameters. 
- """ - retriever_component_dict = { - "type": "PineconeRetriever", - "init_parameters": { - "document_store": "test_document_store", - "filters": None, - "top_k": 10, - "scale_score": True, - "return_embedding": False, - }, - } - retriever = PineconeRetriever.from_dict(retriever_component_dict) - - assert retriever.document_store == "test_document_store" - assert retriever.filters is None - assert retriever.top_k == 10 - assert retriever.scale_score is True - assert retriever.return_embedding is False diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py new file mode 100644 index 000000000..25641f7a4 --- /dev/null +++ b/integrations/pinecone/tests/test_write.py @@ -0,0 +1,40 @@ +import time + +import pytest +from haystack import Document +from haystack.document_stores import DuplicatePolicy +from haystack.testing.document_store import ( + WriteDocumentsTest, +) + +from pinecone_haystack.document_store import PineconeDocumentStore + + +class TestWriteDocuments(WriteDocumentsTest): + def test_write_documents(self, document_store: PineconeDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + + # overriden to wait for Pinecone to be updated + def test_write_documents_duplicate_overwrite(self, document_store: PineconeDocumentStore, sleep_time): + """ + Test write_documents() overwrites stored Document when trying to write one with same id + using DuplicatePolicy.OVERWRITE. + """ + doc1 = Document(id="1", content="test doc 1") + doc2 = Document(id="1", content="test doc 2") + + assert document_store.write_documents([doc2], policy=DuplicatePolicy.OVERWRITE) == 1 + time.sleep(sleep_time) + self.assert_documents_are_equal(document_store.filter_documents(), [doc2]) + assert document_store.write_documents(documents=[doc1], policy=DuplicatePolicy.OVERWRITE) == 1 + time.sleep(sleep_time) + self.assert_documents_are_equal(document_store.filter_documents(), [doc1]) + + @pytest.mark.skip(reason="Qdrant only supports UPSERT operations") + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): + ... + + @pytest.mark.skip(reason="Qdrant only supports UPSERT operations") + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): + ... 
From 89ca25c0819f700ac2b31d18fb8f065ccc634bbf Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 19 Dec 2023 17:16:22 +0100 Subject: [PATCH 12/38] make workflow read the api key --- .github/workflows/pinecone.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pinecone.yml b/.github/workflows/pinecone.yml index d42330849..fe1b1d456 100644 --- a/.github/workflows/pinecone.yml +++ b/.github/workflows/pinecone.yml @@ -17,6 +17,7 @@ concurrency: env: PYTHONUNBUFFERED: "1" FORCE_COLOR: "1" + PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }} jobs: run: From 542ec80b52f1a05da1a5288ffde2edb77816cbd5 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 19 Dec 2023 17:40:35 +0100 Subject: [PATCH 13/38] rm score when filtering docs --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index af295e0c0..e9a063f18 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -158,6 +158,8 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc if not filters: # in this case, we try to return all documents but Pinecone has some limits documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT) + for doc in documents: + doc.score = None total_docs_number = self.count_documents() if total_docs_number > TOP_K_LIMIT: From abf985a9121af787622e8c8bb5eca91388c54c5e Mon Sep 17 00:00:00 2001 From: anakin87 Date: Tue, 19 Dec 2023 18:04:48 +0100 Subject: [PATCH 14/38] increase wait time --- integrations/pinecone/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py index 63ec94819..12dd808a6 100644 --- a/integrations/pinecone/tests/conftest.py +++ b/integrations/pinecone/tests/conftest.py @@ -6,7 +6,7 @@ from pinecone_haystack.document_store import PineconeDocumentStore # This is the approximate time it takes for the documents to be available -SLEEP_TIME = 17 +SLEEP_TIME = 20 @pytest.fixture() From f17b6ecab7a5b9ffc43fd5f3402933b48f45aa3a Mon Sep 17 00:00:00 2001 From: anakin87 Date: Wed, 20 Dec 2023 09:49:14 +0100 Subject: [PATCH 15/38] improve api key reading; more tests --- .../src/pinecone_haystack/document_store.py | 16 ++--- integrations/pinecone/tests/conftest.py | 4 +- .../pinecone/tests/test_document_store.py | 70 +++++++++++++++++++ 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index e9a063f18..d213235df 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -53,15 +53,13 @@ def __init__( [API reference](https://docs.pinecone.io/reference/create_index-1). """ - if api_key is None: - try: - api_key = os.environ["PINECONE_API_KEY"] - except KeyError as e: - msg = ( - "PineconeDocumentStore expects a Pinecone API key. " - "Set the PINECONE_API_KEY environment variable (recommended) or pass it explicitly." - ) - raise ValueError(msg) from e + api_key = api_key or os.environ.get("PINECONE_API_KEY", None) + if not api_key: + msg = ( + "PineconeDocumentStore expects a Pinecone API key. 
" + "Set the PINECONE_API_KEY environment variable (recommended) or pass it explicitly." + ) + raise ValueError(msg) pinecone.init(api_key=api_key, environment=environment) diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py index 12dd808a6..6296c4f1e 100644 --- a/integrations/pinecone/tests/conftest.py +++ b/integrations/pinecone/tests/conftest.py @@ -36,11 +36,11 @@ def document_store(request): # Override the count_documents method to wait for the documents to be available original_count_documents = store.count_documents - def count_documents_sleep(): + def wait_and_count_documents(): time.sleep(SLEEP_TIME) return original_count_documents() - store.count_documents = count_documents_sleep + store.count_documents = wait_and_count_documents yield store store._index.delete(delete_all=True, namespace=namespace) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 74315aad2..2d709449c 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -1,4 +1,5 @@ import time +from unittest.mock import patch from haystack import Document @@ -6,6 +7,75 @@ class TestDocumentStore: + @patch("pinecone_haystack.document_store.pinecone") + def test_init(self, mock_pinecone): + mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} + + document_store = PineconeDocumentStore( + api_key="fake-api-key", + environment="gcp-starter", + index="my_index", + namespace="test", + batch_size=50, + dimension=30, + metric="euclidean", + ) + + assert document_store.environment == "gcp-starter" + assert document_store.index == "my_index" + assert document_store.namespace == "test" + assert document_store.batch_size == 50 + assert document_store.dimension == 30 + assert document_store.index_creation_kwargs == {"metric": "euclidean"} + + @patch("pinecone_haystack.document_store.pinecone") + def test_to_dict(self, mock_pinecone): + mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} + document_store = PineconeDocumentStore( + api_key="fake-api-key", + environment="gcp-starter", + index="my_index", + namespace="test", + batch_size=50, + dimension=30, + metric="euclidean", + ) + assert document_store.to_dict() == { + "type": "pinecone_haystack.document_store.PineconeDocumentStore", + "init_parameters": { + "environment": "gcp-starter", + "index": "my_index", + "dimension": 30, + "namespace": "test", + "batch_size": 50, + "metric": "euclidean", + }, + } + + @patch("pinecone_haystack.document_store.pinecone") + def test_from_dict(self, mock_pinecone): + mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} + + data = { + "type": "pinecone_haystack.document_store.PineconeDocumentStore", + "init_parameters": { + "environment": "gcp-starter", + "index": "my_index", + "dimension": 30, + "namespace": "test", + "batch_size": 50, + "metric": "euclidean", + }, + } + + document_store = PineconeDocumentStore.from_dict(data) + assert document_store.environment == "gcp-starter" + assert document_store.index == "my_index" + assert document_store.namespace == "test" + assert document_store.batch_size == 50 + assert document_store.dimension == 30 + assert document_store.index_creation_kwargs == {"metric": "euclidean"} + def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time): docs = [ Document(content="Most similar document", embedding=[1.0] * 
10), From c63eac2f0150f25f531bf9666a4da47ed575ada9 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Dec 2023 11:45:16 +0100 Subject: [PATCH 16/38] improvements from PR review --- integrations/pinecone/pyproject.toml | 2 +- .../src/pinecone_haystack/document_store.py | 35 ++++++++----- integrations/pinecone/tests/conftest.py | 3 +- .../pinecone/tests/test_document_store.py | 50 ++++++++++--------- integrations/pinecone/tests/test_write.py | 4 +- 5 files changed, 53 insertions(+), 41 deletions(-) diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index 1f8b1f328..506795e7f 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -11,7 +11,7 @@ requires-python = ">=3.8" license = "Apache-2.0" keywords = [] authors = [ - { name = "John Doe", email = "jd@example.com" }, + { name = "deepset GmbH", email = "info@deepset.ai" }, ] classifiers = [ "Development Status :: 4 - Beta", diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index d213235df..c940334c6 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -8,7 +8,7 @@ import pandas as pd import pinecone -from haystack import default_from_dict, default_to_dict +from haystack import default_to_dict from haystack.dataclasses import Document from haystack.document_stores import DuplicatePolicy @@ -53,7 +53,7 @@ def __init__( [API reference](https://docs.pinecone.io/reference/create_index-1). """ - api_key = api_key or os.environ.get("PINECONE_API_KEY", None) + api_key = api_key or os.environ.get("PINECONE_API_KEY") if not api_key: msg = ( "PineconeDocumentStore expects a Pinecone API key. " @@ -64,10 +64,22 @@ def __init__( pinecone.init(api_key=api_key, environment=environment) if index not in pinecone.list_indexes(): + logger.info(f"Index {index} does not exist. Creating a new index.") pinecone.create_index(name=index, dimension=dimension, **index_creation_kwargs) + else: + logger.info(f"Index {index} already exists. Connecting to it.") self._index = pinecone.Index(index_name=index) - self.dimension = self._index.describe_index_stats()["dimension"] + + actual_dimension = self._index.describe_index_stats().get("dimension") + if actual_dimension and actual_dimension != dimension: + logger.warning( + f"Dimension of index {index} is {actual_dimension}, but {dimension} was specified. " + "The specified dimension will be ignored. " + "If you need an index with a different dimension, please create a new one." + ) + self.dimension = actual_dimension or dimension + self._dummy_vector = [0.0] * self.dimension self.environment = environment self.index = index @@ -86,10 +98,6 @@ def to_dict(self) -> Dict[str, Any]: **self.index_creation_kwargs, ) - @classmethod - def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore": - return default_from_dict(cls, data) - def count_documents(self) -> int: """ Returns how many documents are present in the document store. @@ -110,10 +118,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D :return: The number of documents written to the document store.
""" - if len(documents) > 0: + try: if not isinstance(documents[0], Document): - msg = "param 'documents' must contain a list of objects of type Document" - raise ValueError(msg) + raise TypeError() + except (TypeError, KeyError) as e: + msg = "param 'documents' must contain a list of objects of type Document" + raise TypeError(msg) from e if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]: logger.warning( @@ -139,8 +149,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json() if document.blob is not None: logger.warning( - f"Document {document.id} has a blob. Currently, storing blob in Pinecone is not supported. " - "The blob will be ignored." + f"Document {document.id} has the `blob` field set, but storing `ByteStream` " + "objects in Pinecone is not supported. " + "The content of the `blob` field will be ignored." ) documents_for_pinecone.append(doc_for_pinecone) diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py index 6296c4f1e..4f5c677e7 100644 --- a/integrations/pinecone/tests/conftest.py +++ b/integrations/pinecone/tests/conftest.py @@ -1,5 +1,4 @@ import time -from random import randint import pytest @@ -23,7 +22,7 @@ def document_store(request): environment = "gcp-starter" index = "default" # Use a different namespace for each test so we can run them in parallel - namespace = f"{request.node.name}-{randint(0, 1000)}" # noqa: S311 Ruff complains about using random numbers for cryptographic purposes + namespace = f"{request.node.name}-{int(time.time())}" dimension = 10 store = PineconeDocumentStore( diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 2d709449c..b68aec49d 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -1,6 +1,7 @@ import time from unittest.mock import patch +import pytest from haystack import Document from pinecone_haystack.document_store import PineconeDocumentStore @@ -28,6 +29,31 @@ def test_init(self, mock_pinecone): assert document_store.dimension == 30 assert document_store.index_creation_kwargs == {"metric": "euclidean"} + @patch("pinecone_haystack.document_store.pinecone") + def test_init_api_key_in_environment_variable(self, monkeypatch): + monkeypatch.setenv("PINECONE_API_KEY", "fake-api-key") + + PineconeDocumentStore( + environment="gcp-starter", + index="my_index", + namespace="test", + batch_size=50, + dimension=30, + metric="euclidean", + ) + + assert True + + def test_init_fails_wo_api_key(self, monkeypatch): + api_key = None + monkeypatch.delenv("PINECONE_API_KEY", raising=False) + with pytest.raises(ValueError): + PineconeDocumentStore( + api_key=api_key, + environment="gcp-starter", + index="my_index", + ) + @patch("pinecone_haystack.document_store.pinecone") def test_to_dict(self, mock_pinecone): mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} @@ -52,30 +78,6 @@ def test_to_dict(self, mock_pinecone): }, } - @patch("pinecone_haystack.document_store.pinecone") - def test_from_dict(self, mock_pinecone): - mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} - - data = { - "type": "pinecone_haystack.document_store.PineconeDocumentStore", - "init_parameters": { - "environment": "gcp-starter", - "index": "my_index", - "dimension": 30, - "namespace": "test", - "batch_size": 50, 
- "metric": "euclidean", - }, - } - - document_store = PineconeDocumentStore.from_dict(data) - assert document_store.environment == "gcp-starter" - assert document_store.index == "my_index" - assert document_store.namespace == "test" - assert document_store.batch_size == 50 - assert document_store.dimension == 30 - assert document_store.index_creation_kwargs == {"metric": "euclidean"} - def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time): docs = [ Document(content="Most similar document", embedding=[1.0] * 10), diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py index 25641f7a4..3aa2e1eda 100644 --- a/integrations/pinecone/tests/test_write.py +++ b/integrations/pinecone/tests/test_write.py @@ -31,10 +31,10 @@ def test_write_documents_duplicate_overwrite(self, document_store: PineconeDocum time.sleep(sleep_time) self.assert_documents_are_equal(document_store.filter_documents(), [doc1]) - @pytest.mark.skip(reason="Qdrant only supports UPSERT operations") + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): ... - @pytest.mark.skip(reason="Qdrant only supports UPSERT operations") + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): ... From f7d048d552288a9a71a5f12baf8bbc76ba7a9dc4 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Dec 2023 12:11:10 +0100 Subject: [PATCH 17/38] test simplification --- .../src/pinecone_haystack/document_store.py | 8 +++++++- integrations/pinecone/tests/conftest.py | 19 ++++++++++++------ .../pinecone/tests/test_document_store.py | 20 ++++++++++++------- integrations/pinecone/tests/test_write.py | 18 ----------------- 4 files changed, 33 insertions(+), 32 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index c940334c6..49b956a24 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -234,12 +234,18 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li if dataframe_string: dataframe = pd.read_json(io.StringIO(dataframe_string)) + # we always store vectors during writing + # but we don't want to return them if they are dummy vectors + embedding = None + if pinecone_doc["values"] != self._dummy_vector: + embedding = pinecone_doc["values"] + doc = Document( id=pinecone_doc["id"], content=content, dataframe=dataframe, meta=pinecone_doc["metadata"], - embedding=pinecone_doc["values"], + embedding=embedding, score=pinecone_doc["score"], ) documents.append(doc) diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py index 4f5c677e7..c13b7b989 100644 --- a/integrations/pinecone/tests/conftest.py +++ b/integrations/pinecone/tests/conftest.py @@ -1,6 +1,7 @@ import time import pytest +from haystack.document_stores import DuplicatePolicy from pinecone_haystack.document_store import PineconeDocumentStore @@ -23,7 +24,7 @@ def document_store(request): index = "default" # Use a different namespace for each test so we can run them in parallel namespace = f"{request.node.name}-{int(time.time())}" - dimension = 10 + dimension = 768 store = PineconeDocumentStore( environment=environment, @@ -32,14 +33,20 @@ def document_store(request): 
dimension=dimension, ) - # Override the count_documents method to wait for the documents to be available - original_count_documents = store.count_documents + # Override some methods to wait for the documents to be available + original_write_documents = store.write_documents + def write_documents_and_wait(documents, policy=DuplicatePolicy.NONE): + written_docs = original_write_documents(documents, policy) + time.sleep(SLEEP_TIME) + return written_docs - def wait_and_count_documents(): + original_delete_documents = store.delete_documents + def delete_documents_and_wait(filters): + original_delete_documents(filters) time.sleep(SLEEP_TIME) - return original_count_documents() - store.count_documents = wait_and_count_documents + store.write_documents = write_documents_and_wait + store.delete_documents = delete_documents_and_wait yield store store._index.delete(delete_all=True, namespace=namespace) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index b68aec49d..fb5c9ed29 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -1,7 +1,7 @@ -import time from unittest.mock import patch import pytest +import numpy as np from haystack import Document from pinecone_haystack.document_store import PineconeDocumentStore @@ -78,15 +78,21 @@ def test_to_dict(self, mock_pinecone): }, } - def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time): + def test_embedding_retrieval(self, document_store: PineconeDocumentStore): + query_embedding=[0.1] * 768 + most_similar_embedding=[0.8] * 768 + second_best_embedding=[0.8] * 700 + [0.1] * 3 + [0.2]* 65 + another_embedding=np.random.rand(768).tolist() + docs = [ - Document(content="Most similar document", embedding=[1.0] * 10), - Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 0.8, 0.5, 0.8, 0.8, 0.8, 0.8, 0.5]), - Document(content="Not very similar document", embedding=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]), + Document(content="Most similar document", embedding=most_similar_embedding), + Document(content="2nd best document", embedding=second_best_embedding), + Document(content="Not very similar document", embedding=another_embedding), ] + document_store.write_documents(docs) - time.sleep(sleep_time) - results = document_store._embedding_retrieval(query_embedding=[0.1] * 10, top_k=2, filters={}) + + results = document_store._embedding_retrieval(query_embedding=query_embedding, top_k=2, filters={}) assert len(results) == 2 assert results[0].content == "Most similar document" assert results[1].content == "2nd best document" diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py index 3aa2e1eda..e22bc8aec 100644 --- a/integrations/pinecone/tests/test_write.py +++ b/integrations/pinecone/tests/test_write.py @@ -1,8 +1,5 @@ -import time - import pytest from haystack import Document -from haystack.document_stores import DuplicatePolicy from haystack.testing.document_store import ( WriteDocumentsTest, ) @@ -15,21 +12,6 @@ def test_write_documents(self, document_store: PineconeDocumentStore): docs = [Document(id="1")] assert document_store.write_documents(docs) == 1 - # overriden to wait for Pinecone to be updated - def test_write_documents_duplicate_overwrite(self, document_store: PineconeDocumentStore, sleep_time): - """ - Test write_documents() overwrites stored Document when trying to write one with same id - using 
DuplicatePolicy.OVERWRITE. - """ - doc1 = Document(id="1", content="test doc 1") - doc2 = Document(id="1", content="test doc 2") - - assert document_store.write_documents([doc2], policy=DuplicatePolicy.OVERWRITE) == 1 - time.sleep(sleep_time) - self.assert_documents_are_equal(document_store.filter_documents(), [doc2]) - assert document_store.write_documents(documents=[doc1], policy=DuplicatePolicy.OVERWRITE) == 1 - time.sleep(sleep_time) - self.assert_documents_are_equal(document_store.filter_documents(), [doc1]) @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): From c5e9174a89a6c29320ac6370e8698869f0732386 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Dec 2023 12:11:52 +0100 Subject: [PATCH 18/38] test simplification 2 --- integrations/pinecone/tests/conftest.py | 2 ++ integrations/pinecone/tests/test_document_store.py | 10 +++++----- integrations/pinecone/tests/test_write.py | 1 - 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py index c13b7b989..ea0fc0167 100644 --- a/integrations/pinecone/tests/conftest.py +++ b/integrations/pinecone/tests/conftest.py @@ -35,12 +35,14 @@ def document_store(request): # Override some methods to wait for the documents to be available original_write_documents = store.write_documents + def write_documents_and_wait(documents, policy=DuplicatePolicy.NONE): written_docs = original_write_documents(documents, policy) time.sleep(SLEEP_TIME) return written_docs original_delete_documents = store.delete_documents + def delete_documents_and_wait(filters): original_delete_documents(filters) time.sleep(SLEEP_TIME) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index fb5c9ed29..a0a523991 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -1,7 +1,7 @@ from unittest.mock import patch -import pytest import numpy as np +import pytest from haystack import Document from pinecone_haystack.document_store import PineconeDocumentStore @@ -79,10 +79,10 @@ def test_to_dict(self, mock_pinecone): } def test_embedding_retrieval(self, document_store: PineconeDocumentStore): - query_embedding=[0.1] * 768 - most_similar_embedding=[0.8] * 768 - second_best_embedding=[0.8] * 700 + [0.1] * 3 + [0.2]* 65 - another_embedding=np.random.rand(768).tolist() + query_embedding = [0.1] * 768 + most_similar_embedding = [0.8] * 768 + second_best_embedding = [0.8] * 700 + [0.1] * 3 + [0.2] * 65 + another_embedding = np.random.rand(768).tolist() docs = [ Document(content="Most similar document", embedding=most_similar_embedding), diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py index e22bc8aec..7c04a93be 100644 --- a/integrations/pinecone/tests/test_write.py +++ b/integrations/pinecone/tests/test_write.py @@ -12,7 +12,6 @@ def test_write_documents(self, document_store: PineconeDocumentStore): docs = [Document(id="1")] assert document_store.write_documents(docs) == 1 - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): ... 
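Patches 17 and 18 move the suite to realistic 768-dimensional vectors and push the eventual-consistency waits into the conftest wrappers, so the tests themselves no longer sleep. The flow the reworked retrieval test exercises boils down to the following sketch (the store configuration is illustrative; `_embedding_retrieval` is the store's private method, with the retriever component as the public entry point):

```python
import numpy as np
from haystack import Document

from pinecone_haystack.document_store import PineconeDocumentStore

# Illustrative configuration; assumes the PINECONE_API_KEY environment variable is set.
store = PineconeDocumentStore(environment="gcp-starter", index="default", namespace="demo", dimension=768)

docs = [
    Document(content="close to the query", embedding=[0.8] * 768),  # exactly parallel to the query vector
    Document(content="far from the query", embedding=np.random.rand(768).tolist()),
]
store.write_documents(docs)  # in the test suite, the conftest wrapper sleeps after this call

# With Pinecone's default cosine metric, the vector parallel to the query ranks first.
results = store._embedding_retrieval(query_embedding=[0.1] * 768, top_k=1, filters={})
assert results[0].content == "close to the query"
```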
From 42da9abf0929fd8fb87b44ed61f1b9bd5324d5fc Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Dec 2023 12:51:23 +0100 Subject: [PATCH 19/38] fix --- .../pinecone/src/pinecone_haystack/document_store.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 49b956a24..34b398fc3 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -4,6 +4,7 @@ import io import logging import os +from copy import deepcopy from typing import Any, Dict, List, Optional import pandas as pd @@ -118,12 +119,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D :return: The number of documents written to the document store. """ - try: - if not isinstance(documents[0], Document): - raise TypeError() - except (TypeError, KeyError) as e: + if len(documents) > 0 and not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" - raise TypeError(msg) from e + raise TypeError(msg) if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]: logger.warning( @@ -132,7 +130,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D ) documents_for_pinecone = [] - for document in documents: + for document in deepcopy(documents): if document.embedding is None: logger.warning( f"Document {document.id} has no embedding. Pinecone is a purely vector database. " From a12a31ca846dccd8c3323b7657983856fe4aae1c Mon Sep 17 00:00:00 2001 From: anakin87 Date: Thu, 21 Dec 2023 12:58:01 +0100 Subject: [PATCH 20/38] std ds tests want valueerror --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 34b398fc3..2c089f9b3 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -121,7 +121,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D """ if len(documents) > 0 and not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" - raise TypeError(msg) + raise ValueError(msg) if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]: logger.warning( From 72570ed48474f5393bc684a89a4bce74566128ce Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 09:51:24 +0100 Subject: [PATCH 21/38] put tests together --- .../src/pinecone_haystack/document_store.py | 5 +++-- integrations/pinecone/tests/test_count.py | 7 ------- integrations/pinecone/tests/test_delete.py | 7 ------- .../pinecone/tests/test_document_store.py | 15 ++++++++++++- integrations/pinecone/tests/test_write.py | 21 ------------------- 5 files changed, 17 insertions(+), 38 deletions(-) delete mode 100644 integrations/pinecone/tests/test_count.py delete mode 100644 integrations/pinecone/tests/test_delete.py delete mode 100644 integrations/pinecone/tests/test_write.py diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 2c089f9b3..0c8d65e6c 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ 
b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -164,7 +164,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: if not filters: # in this case, we try to return all documents but Pinecone has some limits - documents = self._embedding_retrieval(query_embedding=self._dummy_vector, top_k=TOP_K_LIMIT) + documents = self._embedding_retrieval(query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT) for doc in documents: doc.score = None @@ -190,6 +190,7 @@ def _embedding_retrieval( self, query_embedding: List[float], *, + namespace: Optional[str]=None, filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: @@ -214,7 +215,7 @@ def _embedding_retrieval( result = self._index.query( vector=query_embedding, top_k=top_k, - namespace=self.namespace, + namespace=namespace, include_values=True, include_metadata=True, ) diff --git a/integrations/pinecone/tests/test_count.py b/integrations/pinecone/tests/test_count.py deleted file mode 100644 index 02462d422..000000000 --- a/integrations/pinecone/tests/test_count.py +++ /dev/null @@ -1,7 +0,0 @@ -from haystack.testing.document_store import ( - CountDocumentsTest, -) - - -class TestCountDocuments(CountDocumentsTest): - ... diff --git a/integrations/pinecone/tests/test_delete.py b/integrations/pinecone/tests/test_delete.py deleted file mode 100644 index 88b145704..000000000 --- a/integrations/pinecone/tests/test_delete.py +++ /dev/null @@ -1,7 +0,0 @@ -from haystack.testing.document_store import ( - DeleteDocumentsTest, -) - - -class TestDeleteDocuments(DeleteDocumentsTest): - ... diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index a0a523991..01017d67f 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -3,11 +3,24 @@ import numpy as np import pytest from haystack import Document +from haystack.testing.document_store import CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest from pinecone_haystack.document_store import PineconeDocumentStore -class TestDocumentStore: +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, WriteDocumentsTest): + def test_write_documents(self, document_store: PineconeDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): + ... + + @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") + def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): + ... 
+ @patch("pinecone_haystack.document_store.pinecone") def test_init(self, mock_pinecone): mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30} diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py deleted file mode 100644 index 7c04a93be..000000000 --- a/integrations/pinecone/tests/test_write.py +++ /dev/null @@ -1,21 +0,0 @@ -import pytest -from haystack import Document -from haystack.testing.document_store import ( - WriteDocumentsTest, -) - -from pinecone_haystack.document_store import PineconeDocumentStore - - -class TestWriteDocuments(WriteDocumentsTest): - def test_write_documents(self, document_store: PineconeDocumentStore): - docs = [Document(id="1")] - assert document_store.write_documents(docs) == 1 - - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore): - ... - - @pytest.mark.skip(reason="Pinecone only supports UPSERT operations") - def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore): - ... From 2e690e4a9d8d220f3705373987acc52902a50c42 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 09:51:41 +0100 Subject: [PATCH 22/38] format --- .../pinecone/src/pinecone_haystack/document_store.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 0c8d65e6c..b7133e757 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -164,7 +164,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: if not filters: # in this case, we try to return all documents but Pinecone has some limits - documents = self._embedding_retrieval(query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT) + documents = self._embedding_retrieval( + query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT + ) for doc in documents: doc.score = None @@ -190,7 +192,7 @@ def _embedding_retrieval( self, query_embedding: List[float], *, - namespace: Optional[str]=None, + namespace: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, # noqa: ARG002 (filters to be implemented) top_k: int = 10, ) -> List[Document]: From 9437c02e69004f9e6b78d670a83e7137feedee89 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 09:59:01 +0100 Subject: [PATCH 23/38] add fallback for namespace in _embedding_retrieval --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index b7133e757..c81efd58c 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -217,7 +217,7 @@ def _embedding_retrieval( result = self._index.query( vector=query_embedding, top_k=top_k, - namespace=namespace, + namespace=namespace or self.namespace, include_values=True, include_metadata=True, ) From fdfd3e7641cdf386878815a8b9ee5037b86da0df Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 10:21:14 +0100 Subject: [PATCH 24/38] try 
to parallelize tests --- integrations/pinecone/pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index 506795e7f..9de2561fe 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -48,8 +48,9 @@ dependencies = [ "pytest-xdist", ] [tool.hatch.envs.default.scripts] -test = "pytest {args:tests}" -test-cov = "coverage run -m pytest {args:tests}" +# Pinecone tests are slow (require HTTP requests), so we run them in parallel +test = "pytest -n auto --dist worksteal {args:tests}" +test-cov = "coverage run -m pytest -n auto --dist worksteal {args:tests}" cov-report = [ "- coverage combine", "coverage report", From 8e6f0e63487d1b8e0ee9fd2037de82636fa9b6f9 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 10:31:28 +0100 Subject: [PATCH 25/38] better try --- integrations/pinecone/pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml index 9de2561fe..069dba1be 100644 --- a/integrations/pinecone/pyproject.toml +++ b/integrations/pinecone/pyproject.toml @@ -49,8 +49,9 @@ dependencies = [ ] [tool.hatch.envs.default.scripts] # Pinecone tests are slow (require HTTP requests), so we run them in parallel -test = "pytest -n auto --dist worksteal {args:tests}" -test-cov = "coverage run -m pytest -n auto --dist worksteal {args:tests}" +# with pytest-xdist (https://pytest-xdist.readthedocs.io/en/stable/distribution.html) +test = "pytest -n auto --maxprocesses=3 {args:tests}" +test-cov = "coverage run -m pytest -n auto --maxprocesses=3 {args:tests}" cov-report = [ "- coverage combine", "coverage report", From c759d103e656e3a1a815c32ab965e287a3128328 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 11:04:44 +0100 Subject: [PATCH 26/38] labeler --- .github/labeler.yml | 5 +++ .../src/pinecone_haystack/document_store.py | 39 ++++++++++++------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index f2dcedad2..1a41c2caf 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -44,6 +44,11 @@ integration:qdrant: - any-glob-to-any-file: "integrations/qdrant/**/*" - any-glob-to-any-file: ".github/workflows/qdrant.yml" +integration:pinecone: + - changed-files: + - any-glob-to-any-file: "integrations/pinecone/**/*" + - any-glob-to-any-file: ".github/workflows/pinecone.yml" + integration:unstructured-fileconverter: - changed-files: - any-glob-to-any-file: "integrations/unstructured/fileconverter/**/*" diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index c81efd58c..dfa5bd564 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -162,23 +162,31 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D return written_docs def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: - if not filters: - # in this case, we try to return all documents but Pinecone has some limits - documents = self._embedding_retrieval( - query_embedding=self._dummy_vector, namespace=self.namespace, top_k=TOP_K_LIMIT - ) - for doc in documents: - doc.score = None + """ + Returns the documents that match the filters provided. 
- total_docs_number = self.count_documents() - if total_docs_number > TOP_K_LIMIT: - logger.warning( - f"PineconeDocumentStore can only return {TOP_K_LIMIT} documents. " - f"However, there are {total_docs_number} documents in the namespace. " - ) - return documents + For a detailed specification of the filters, + refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) - return [] + :param filters: The filters to apply to the document list. + :return: A list of Documents that match the given filters. + """ + + # Pinecone only performs vector similarity search + # here we are querying with a dummy vector and the max compatible top_k + documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT) + + # when simply filtering, we don't want to return any scores + # furthermore, we are querying with a dummy vector, so the scores are meaningless + for doc in documents: + doc.score = None + + if len(documents) == TOP_K_LIMIT: + logger.warning( + f"PineconeDocumentStore can return at most {TOP_K_LIMIT} documents and the query has hit this limit. " + f"It is likely that there are more matching documents in the document store. " + ) + return documents def delete_documents(self, document_ids: List[str]) -> None: """ @@ -204,6 +212,7 @@ def _embedding_retrieval( `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it. :param query_embedding: Embedding of the query. + :param namespace: Pinecone namespace to query. Defaults the namespace of the document store. :param filters: Filters applied to the retrieved Documents. Defaults to None. :param top_k: Maximum number of Documents to return, defaults to 10 From 017cd750b81d3600cba6884986f68867f4099ff3 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 11:13:20 +0100 Subject: [PATCH 27/38] format fix --- .../pinecone/src/pinecone_haystack/document_store.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index dfa5bd564..2e4fe5f14 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -165,17 +165,17 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc """ Returns the documents that match the filters provided. - For a detailed specification of the filters, + For a detailed specification of the filters, refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering) :param filters: The filters to apply to the document list. :return: A list of Documents that match the given filters. 
""" - + # Pinecone only performs vector similarity search # here we are querying with a dummy vector and the max compatible top_k documents = self._embedding_retrieval(query_embedding=self._dummy_vector, filters=filters, top_k=TOP_K_LIMIT) - + # when simply filtering, we don't want to return any scores # furthermore, we are querying with a dummy vector, so the scores are meaningless for doc in documents: From f42c54080d12d34b5f304c66d0c32aa726a073af Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci Date: Fri, 22 Dec 2023 11:40:41 +0100 Subject: [PATCH 28/38] Apply suggestions from code review Co-authored-by: Massimiliano Pippi --- integrations/pinecone/tests/test_document_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 01017d67f..9f0db402b 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -41,6 +41,7 @@ def test_init(self, mock_pinecone): assert document_store.batch_size == 50 assert document_store.dimension == 30 assert document_store.index_creation_kwargs == {"metric": "euclidean"} + assert document_store.api_key == "fake-api-key" @patch("pinecone_haystack.document_store.pinecone") def test_init_api_key_in_environment_variable(self, monkeypatch): @@ -55,7 +56,7 @@ def test_init_api_key_in_environment_variable(self, monkeypatch): metric="euclidean", ) - assert True + assert document_store.api_key == "fake-api-key" def test_init_fails_wo_api_key(self, monkeypatch): api_key = None From d918414a4d3b5bdbeb9c601fc49b1d0eddd90ebf Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 11:45:03 +0100 Subject: [PATCH 29/38] Revert "Apply suggestions from code review" This reverts commit f42c54080d12d34b5f304c66d0c32aa726a073af. 
--- integrations/pinecone/tests/test_document_store.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 9f0db402b..01017d67f 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -41,7 +41,6 @@ def test_init(self, mock_pinecone): assert document_store.batch_size == 50 assert document_store.dimension == 30 assert document_store.index_creation_kwargs == {"metric": "euclidean"} - assert document_store.api_key == "fake-api-key" @patch("pinecone_haystack.document_store.pinecone") def test_init_api_key_in_environment_variable(self, monkeypatch): @@ -56,7 +55,7 @@ def test_init_api_key_in_environment_variable(self, monkeypatch): metric="euclidean", ) - assert document_store.api_key == "fake-api-key" + assert True def test_init_fails_wo_api_key(self, monkeypatch): api_key = None From 4d90b8cef1cb1792f91edcec036c614b4959c600 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 13:57:09 +0100 Subject: [PATCH 30/38] improve document conversion --- .../src/pinecone_haystack/document_store.py | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 2e4fe5f14..f841b23d9 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -129,30 +129,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D f"but got {policy}. Overwriting duplicates is enabled by default." ) - documents_for_pinecone = [] - for document in deepcopy(documents): - if document.embedding is None: - logger.warning( - f"Document {document.id} has no embedding. Pinecone is a purely vector database. " - "A dummy embedding will be used, but this can affect the search results. " - ) - document.embedding = self._dummy_vector - doc_for_pinecone = {"id": document.id, "values": document.embedding, "metadata": document.meta} - - # we save content/dataframe as metadata - # currently, storing blob in Pinecone is not supported - if document.content is not None: - doc_for_pinecone["metadata"]["content"] = document.content - if document.dataframe is not None: - doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json() - if document.blob is not None: - logger.warning( - f"Document {document.id} has the `blob` field set, but storing `ByteStream` " - "objects in Pinecone is not supported. " - "The content of the `blob` field will be ignored." - ) - - documents_for_pinecone.append(doc_for_pinecone) + documents_for_pinecone = self._convert_documents_to_pinecone_format(documents) result = self._index.upsert( vectors=documents_for_pinecone, namespace=self.namespace, batch_size=self.batch_size @@ -261,3 +238,30 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li documents.append(doc) return documents + + def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]: + documents_for_pinecone = [] + for document in documents: + embedding = document.embedding + if embedding is None: + logger.warning( + f"Document {document.id} has no embedding. Pinecone is a purely vector database. " + "A dummy embedding will be used, but this can affect the search results. 
" + ) + embedding = self._dummy_vector + doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": document.meta} + + # we save content/dataframe as metadata + # currently, storing blob in Pinecone is not supported + if document.content is not None: + doc_for_pinecone["metadata"]["content"] = document.content + if document.dataframe is not None: + doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json() + if document.blob is not None: + logger.warning( + f"Document {document.id} has the `blob` field set, but storing `ByteStream` " + "objects in Pinecone is not supported. " + "The content of the `blob` field will be ignored." + ) + + documents_for_pinecone.append(doc_for_pinecone) From 3f07182e050fb25e1103ef5baa389e7588284001 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 13:58:06 +0100 Subject: [PATCH 31/38] rm deepcopy --- integrations/pinecone/src/pinecone_haystack/document_store.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index f841b23d9..9c25f7f83 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -4,7 +4,6 @@ import io import logging import os -from copy import deepcopy from typing import Any, Dict, List, Optional import pandas as pd From 7668a4a0392c437df4ccaca2d3f64e94d1762cdb Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 14:01:51 +0100 Subject: [PATCH 32/38] missing return --- integrations/pinecone/src/pinecone_haystack/document_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 9c25f7f83..9175012a8 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -264,3 +264,4 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li ) documents_for_pinecone.append(doc_for_pinecone) + return documents_for_pinecone \ No newline at end of file From c4f5079f7dab5dfcf620981e28b53d34d38880b6 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 14:03:26 +0100 Subject: [PATCH 33/38] fix fmt --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 9175012a8..5b4afb39c 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -264,4 +264,4 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li ) documents_for_pinecone.append(doc_for_pinecone) - return documents_for_pinecone \ No newline at end of file + return documents_for_pinecone From 54646e889d2bf3dcce15a41f8d05435bb1a38de0 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 14:26:37 +0100 Subject: [PATCH 34/38] copy metadata --- .../pinecone/src/pinecone_haystack/document_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 5b4afb39c..96415b306 100644 --- 
a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -5,6 +5,7 @@ import logging import os from typing import Any, Dict, List, Optional +from copy import copy import pandas as pd import pinecone @@ -241,14 +242,14 @@ def _convert_query_result_to_documents(self, query_result: Dict[str, Any]) -> Li def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> List[Dict[str, Any]]: documents_for_pinecone = [] for document in documents: - embedding = document.embedding + embedding = copy(document.embedding) if embedding is None: logger.warning( f"Document {document.id} has no embedding. Pinecone is a purely vector database. " "A dummy embedding will be used, but this can affect the search results. " ) embedding = self._dummy_vector - doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": document.meta} + doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)} # we save content/dataframe as metadata # currently, storing blob in Pinecone is not supported From 9aa1ae86800d6422b2c3a1fb2c61ca967ca9396e Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 14:28:41 +0100 Subject: [PATCH 35/38] fmt --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 96415b306..f1d2d41c4 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -4,8 +4,8 @@ import io import logging import os -from typing import Any, Dict, List, Optional from copy import copy +from typing import Any, Dict, List, Optional import pandas as pd import pinecone From 2ff5adf5f0de2b417fbf54725f0f72380d7d512b Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 14:42:27 +0100 Subject: [PATCH 36/38] mv comment --- integrations/pinecone/src/pinecone_haystack/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index f1d2d41c4..3b10dd168 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -252,11 +252,11 @@ def _convert_documents_to_pinecone_format(self, documents: List[Document]) -> Li doc_for_pinecone = {"id": document.id, "values": embedding, "metadata": dict(document.meta)} # we save content/dataframe as metadata - # currently, storing blob in Pinecone is not supported if document.content is not None: doc_for_pinecone["metadata"]["content"] = document.content if document.dataframe is not None: doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json() + # currently, storing blob in Pinecone is not supported if document.blob is not None: logger.warning( f"Document {document.id} has the `blob` field set, but storing `ByteStream` " From ffe3c73211b91c159ffbba233e8fe24c17e30b03 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 15:10:29 +0100 Subject: [PATCH 37/38] improve tests --- .../pinecone/src/pinecone_haystack/document_store.py | 2 +- integrations/pinecone/tests/test_document_store.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git 
a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py index 3b10dd168..576993de6 100644 --- a/integrations/pinecone/src/pinecone_haystack/document_store.py +++ b/integrations/pinecone/src/pinecone_haystack/document_store.py @@ -186,7 +186,7 @@ def _embedding_retrieval( This method is not meant to be part of the public interface of `PineconeDocumentStore` nor called directly. - `PineconeEmbeddingRetriever` uses this method directly and is the public interface for it. + `PineconeDenseRetriever` uses this method directly and is the public interface for it. :param query_embedding: Embedding of the query. :param namespace: Pinecone namespace to query. Defaults the namespace of the document store. diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py index 01017d67f..5c9b32698 100644 --- a/integrations/pinecone/tests/test_document_store.py +++ b/integrations/pinecone/tests/test_document_store.py @@ -35,6 +35,8 @@ def test_init(self, mock_pinecone): metric="euclidean", ) + mock_pinecone.init.assert_called_with(api_key="fake-api-key", environment="gcp-starter") + assert document_store.environment == "gcp-starter" assert document_store.index == "my_index" assert document_store.namespace == "test" @@ -43,7 +45,7 @@ def test_init(self, mock_pinecone): assert document_store.index_creation_kwargs == {"metric": "euclidean"} @patch("pinecone_haystack.document_store.pinecone") - def test_init_api_key_in_environment_variable(self, monkeypatch): + def test_init_api_key_in_environment_variable(self, mock_pinecone, monkeypatch): monkeypatch.setenv("PINECONE_API_KEY", "fake-api-key") PineconeDocumentStore( @@ -55,7 +57,7 @@ def test_init_api_key_in_environment_variable(self, monkeypatch): metric="euclidean", ) - assert True + mock_pinecone.init.assert_called_with(api_key="fake-api-key", environment="gcp-starter") def test_init_fails_wo_api_key(self, monkeypatch): api_key = None From 091b82ae6c7ea362b9747477d4425e2370c0fdd7 Mon Sep 17 00:00:00 2001 From: anakin87 Date: Fri, 22 Dec 2023 15:27:42 +0100 Subject: [PATCH 38/38] readmes --- README.md | 3 ++- integrations/pinecone/README.md | 22 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2dc547050..352533579 100644 --- a/README.md +++ b/README.md @@ -71,4 +71,5 @@ deepset-haystack | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | | [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured /
fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) | -| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | +| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | +| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | diff --git a/integrations/pinecone/README.md b/integrations/pinecone/README.md index 3fa4de623..bf48e1e66 100644 --- a/integrations/pinecone/README.md +++ b/integrations/pinecone/README.md @@ -1,17 +1,23 @@ -[![test](https://github.com/deepset-ai/document-store/actions/workflows/test.yml/badge.svg)](https://github.com/deepset-ai/document-store/actions/workflows/test.yml) +[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) +[![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/pinecone-haystack.svg)](https://pypi.org/project/pinecone-haystack) # Pinecone Document Store -This Github repository is a template that can be used to create custom document stores to extend -the new [Haystack](https://github.com/deepset-ai/haystack/) API available under the `preview` -package starting from version 1.15. +Document Store for Haystack 2.x with support for Pinecone. -While the new API is still under active development, the new "Store" architecture is quite stable -and we are encouraging early adopters to contribute their custom document stores. +## Installation -## Examples +```console +pip install pinecone-haystack +``` + +## Testing + +```console +hatch run test +``` ## License