From c63eac2f0150f25f531bf9666a4da47ed575ada9 Mon Sep 17 00:00:00 2001
From: anakin87
Date: Thu, 21 Dec 2023 11:45:16 +0100
Subject: [PATCH] improvements from PR review

---
 integrations/pinecone/pyproject.toml          |  2 +-
 .../src/pinecone_haystack/document_store.py   | 35 ++++++++-----
 integrations/pinecone/tests/conftest.py       |  3 +-
 .../pinecone/tests/test_document_store.py     | 50 ++++++++++---------
 integrations/pinecone/tests/test_write.py     |  4 +-
 5 files changed, 53 insertions(+), 41 deletions(-)

diff --git a/integrations/pinecone/pyproject.toml b/integrations/pinecone/pyproject.toml
index 1f8b1f328..506795e7f 100644
--- a/integrations/pinecone/pyproject.toml
+++ b/integrations/pinecone/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">=3.8"
 license = "Apache-2.0"
 keywords = []
 authors = [
-  { name = "John Doe", email = "jd@example.com" },
+  { name = "deepset GmbH", email = "info@deepset.ai" },
 ]
 classifiers = [
   "Development Status :: 4 - Beta",
diff --git a/integrations/pinecone/src/pinecone_haystack/document_store.py b/integrations/pinecone/src/pinecone_haystack/document_store.py
index d213235df..c940334c6 100644
--- a/integrations/pinecone/src/pinecone_haystack/document_store.py
+++ b/integrations/pinecone/src/pinecone_haystack/document_store.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import pinecone

-from haystack import default_from_dict, default_to_dict
+from haystack import default_to_dict
 from haystack.dataclasses import Document
 from haystack.document_stores import DuplicatePolicy

@@ -53,7 +53,7 @@ def __init__(
         [API reference](https://docs.pinecone.io/reference/create_index-1).
         """

-        api_key = api_key or os.environ.get("PINECONE_API_KEY", None)
+        api_key = api_key or os.environ.get("PINECONE_API_KEY")
         if not api_key:
             msg = (
                 "PineconeDocumentStore expects a Pinecone API key. "
@@ -64,10 +64,22 @@ def __init__(
         pinecone.init(api_key=api_key, environment=environment)

         if index not in pinecone.list_indexes():
+            logger.info(f"Index {index} does not exist. Creating a new index.")
             pinecone.create_index(name=index, dimension=dimension, **index_creation_kwargs)
+        else:
+            logger.info(f"Index {index} already exists. Connecting to it.")

         self._index = pinecone.Index(index_name=index)
-        self.dimension = self._index.describe_index_stats()["dimension"]
+
+        actual_dimension = self._index.describe_index_stats().get("dimension")
+        if actual_dimension and actual_dimension != dimension:
+            logger.warning(
+                f"Dimension of index {index} is {actual_dimension}, but {dimension} was specified. "
+                "The specified dimension will be ignored."
+                "If you need an index with a different dimension, please create a new one."
+            )
+        self.dimension = actual_dimension or dimension
+
         self._dummy_vector = [0.0] * self.dimension
         self.environment = environment
         self.index = index
@@ -86,10 +98,6 @@ def to_dict(self) -> Dict[str, Any]:
             **self.index_creation_kwargs,
         )

-    @classmethod
-    def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore":
-        return default_from_dict(cls, data)
-
     def count_documents(self) -> int:
         """
         Returns how many documents are present in the document store.
@@ -110,10 +118,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D

         :return: The number of documents written to the document store.
         """
-        if len(documents) > 0:
+        try:
             if not isinstance(documents[0], Document):
-                msg = "param 'documents' must contain a list of objects of type Document"
-                raise ValueError(msg)
+                raise TypeError()
+        except (TypeError, KeyError) as e:
+            msg = "param 'documents' must contain a list of objects of type Document"
+            raise TypeError(msg) from e

         if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]:
             logger.warning(
@@ -139,8 +149,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
                 doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
             if document.blob is not None:
                 logger.warning(
-                    f"Document {document.id} has a blob. Currently, storing blob in Pinecone is not supported. "
-                    "The blob will be ignored."
+                    f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
+                    "objects in Pinecone is not supported. "
+                    "The content of the `blob` field will be ignored."
                 )

             documents_for_pinecone.append(doc_for_pinecone)
diff --git a/integrations/pinecone/tests/conftest.py b/integrations/pinecone/tests/conftest.py
index 6296c4f1e..4f5c677e7 100644
--- a/integrations/pinecone/tests/conftest.py
+++ b/integrations/pinecone/tests/conftest.py
@@ -1,5 +1,4 @@
 import time
-from random import randint

 import pytest

@@ -23,7 +22,7 @@ def document_store(request):
     environment = "gcp-starter"
     index = "default"
     # Use a different namespace for each test so we can run them in parallel
-    namespace = f"{request.node.name}-{randint(0, 1000)}"  # noqa: S311 Ruff complains about using random numbers for cryptographic purposes
+    namespace = f"{request.node.name}-{int(time.time())}"
     dimension = 10

     store = PineconeDocumentStore(
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
index 2d709449c..b68aec49d 100644
--- a/integrations/pinecone/tests/test_document_store.py
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -1,6 +1,7 @@
 import time
 from unittest.mock import patch

+import pytest
 from haystack import Document

 from pinecone_haystack.document_store import PineconeDocumentStore
@@ -28,6 +29,31 @@ def test_init(self, mock_pinecone):
         assert document_store.dimension == 30
         assert document_store.index_creation_kwargs == {"metric": "euclidean"}

+    @patch("pinecone_haystack.document_store.pinecone")
+    def test_init_api_key_in_environment_variable(self, monkeypatch):
+        monkeypatch.setenv("PINECONE_API_KEY", "fake-api-key")
+
+        PineconeDocumentStore(
+            environment="gcp-starter",
+            index="my_index",
+            namespace="test",
+            batch_size=50,
+            dimension=30,
+            metric="euclidean",
+        )
+
+        assert True
+
+    def test_init_fails_wo_api_key(self, monkeypatch):
+        api_key = None
+        monkeypatch.delenv("PINECONE_API_KEY", raising=False)
+        with pytest.raises(ValueError):
+            PineconeDocumentStore(
+                api_key=api_key,
+                environment="gcp-starter",
+                index="my_index",
+            )
+
     @patch("pinecone_haystack.document_store.pinecone")
     def test_to_dict(self, mock_pinecone):
         mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
@@ -52,30 +78,6 @@ def test_to_dict(self, mock_pinecone):
             },
         }

-    @patch("pinecone_haystack.document_store.pinecone")
-    def test_from_dict(self, mock_pinecone):
-        mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
-
-        data = {
-            "type": "pinecone_haystack.document_store.PineconeDocumentStore",
-            "init_parameters": {
-                "environment": "gcp-starter",
-                "index": "my_index",
-                "dimension": 30,
-                "namespace": "test",
-                "batch_size": 50,
-                "metric": "euclidean",
-            },
-        }
-
-        document_store = PineconeDocumentStore.from_dict(data)
-        assert document_store.environment == "gcp-starter"
-        assert document_store.index == "my_index"
-        assert document_store.namespace == "test"
-        assert document_store.batch_size == 50
-        assert document_store.dimension == 30
-        assert document_store.index_creation_kwargs == {"metric": "euclidean"}
-
     def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time):
         docs = [
             Document(content="Most similar document", embedding=[1.0] * 10),
diff --git a/integrations/pinecone/tests/test_write.py b/integrations/pinecone/tests/test_write.py
index 25641f7a4..3aa2e1eda 100644
--- a/integrations/pinecone/tests/test_write.py
+++ b/integrations/pinecone/tests/test_write.py
@@ -31,10 +31,10 @@ def test_write_documents_duplicate_overwrite(self, document_store: PineconeDocum
         time.sleep(sleep_time)
         self.assert_documents_are_equal(document_store.filter_documents(), [doc1])

-    @pytest.mark.skip(reason="Qdrant only supports UPSERT operations")
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
     def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
         ...

-    @pytest.mark.skip(reason="Qdrant only supports UPSERT operations")
+    @pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
     def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
         ...