Skip to content

Commit

Permalink
improvements from PR review
Browse files Browse the repository at this point in the history
  • Loading branch information
anakin87 committed Dec 21, 2023
1 parent f17b6ec commit c63eac2
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 41 deletions.
2 changes: 1 addition & 1 deletion integrations/pinecone/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ requires-python = ">=3.8"
license = "Apache-2.0"
keywords = []
authors = [
{ name = "John Doe", email = "[email protected]" },
{ name = "deepset GmbH", email = "[email protected]" },
]
classifiers = [
"Development Status :: 4 - Beta",
Expand Down
35 changes: 23 additions & 12 deletions integrations/pinecone/src/pinecone_haystack/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas as pd
import pinecone
from haystack import default_from_dict, default_to_dict
from haystack import default_to_dict
from haystack.dataclasses import Document
from haystack.document_stores import DuplicatePolicy

Expand Down Expand Up @@ -53,7 +53,7 @@ def __init__(
[API reference](https://docs.pinecone.io/reference/create_index-1).
"""
api_key = api_key or os.environ.get("PINECONE_API_KEY", None)
api_key = api_key or os.environ.get("PINECONE_API_KEY")
if not api_key:
msg = (
"PineconeDocumentStore expects a Pinecone API key. "
Expand All @@ -64,10 +64,22 @@ def __init__(
pinecone.init(api_key=api_key, environment=environment)

if index not in pinecone.list_indexes():
logger.info(f"Index {index} does not exist. Creating a new index.")
pinecone.create_index(name=index, dimension=dimension, **index_creation_kwargs)
else:
logger.info(f"Index {index} already exists. Connecting to it.")

self._index = pinecone.Index(index_name=index)
self.dimension = self._index.describe_index_stats()["dimension"]

actual_dimension = self._index.describe_index_stats().get("dimension")
if actual_dimension and actual_dimension != dimension:
logger.warning(
f"Dimension of index {index} is {actual_dimension}, but {dimension} was specified. "
"The specified dimension will be ignored."
"If you need an index with a different dimension, please create a new one."
)
self.dimension = actual_dimension or dimension

self._dummy_vector = [0.0] * self.dimension
self.environment = environment
self.index = index
Expand All @@ -86,10 +98,6 @@ def to_dict(self) -> Dict[str, Any]:
**self.index_creation_kwargs,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore":
    # Reconstruct a PineconeDocumentStore from its to_dict() serialization.
    # Delegates entirely to Haystack's default_from_dict helper, which reads
    # data["init_parameters"] and calls cls(**init_parameters).
    # NOTE(review): this diff removes the method (and the default_from_dict
    # import) — presumably the base class default now suffices; confirm.
    return default_from_dict(cls, data)

def count_documents(self) -> int:
"""
Returns how many documents are present in the document store.
Expand All @@ -110,10 +118,12 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
:return: The number of documents written to the document store.
"""
if len(documents) > 0:
try:
if not isinstance(documents[0], Document):
msg = "param 'documents' must contain a list of objects of type Document"
raise ValueError(msg)
raise TypeError()
except (TypeError, KeyError) as e:
msg = "param 'documents' must contain a list of objects of type Document"
raise TypeError(msg) from e

if policy not in [DuplicatePolicy.NONE, DuplicatePolicy.OVERWRITE]:
logger.warning(
Expand All @@ -139,8 +149,9 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
doc_for_pinecone["metadata"]["dataframe"] = document.dataframe.to_json()
if document.blob is not None:
logger.warning(
f"Document {document.id} has a blob. Currently, storing blob in Pinecone is not supported. "
"The blob will be ignored."
f"Document {document.id} has the `blob` field set, but storing `ByteStream` "
"objects in Pinecone is not supported. "
"The content of the `blob` field will be ignored."
)

documents_for_pinecone.append(doc_for_pinecone)
Expand Down
3 changes: 1 addition & 2 deletions integrations/pinecone/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import time
from random import randint

import pytest

Expand All @@ -23,7 +22,7 @@ def document_store(request):
environment = "gcp-starter"
index = "default"
# Use a different namespace for each test so we can run them in parallel
namespace = f"{request.node.name}-{randint(0, 1000)}" # noqa: S311 Ruff complains about using random numbers for cryptographic purposes
namespace = f"{request.node.name}-{int(time.time())}"
dimension = 10

store = PineconeDocumentStore(
Expand Down
50 changes: 26 additions & 24 deletions integrations/pinecone/tests/test_document_store.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import time
from unittest.mock import patch

import pytest
from haystack import Document

from pinecone_haystack.document_store import PineconeDocumentStore
Expand Down Expand Up @@ -28,6 +29,31 @@ def test_init(self, mock_pinecone):
assert document_store.dimension == 30
assert document_store.index_creation_kwargs == {"metric": "euclidean"}

@patch("pinecone_haystack.document_store.pinecone")
def test_init_api_key_in_environment_variable(self, mock_pinecone, monkeypatch):
    """The store picks up the API key from the PINECONE_API_KEY env var when none is passed.

    Bug fix: ``@patch`` appends the created mock as the first positional
    argument after ``self``, so the original signature ``(self, monkeypatch)``
    bound the MagicMock to ``monkeypatch``. ``setenv`` on the mock was a
    silent no-op, the real environment variable was never set, and the
    constructor would raise ValueError. Declaring ``mock_pinecone`` before
    the ``monkeypatch`` fixture restores the intended wiring.
    """
    monkeypatch.setenv("PINECONE_API_KEY", "fake-api-key")

    # Must not raise: the key is found in the environment even though no
    # api_key argument is passed explicitly.
    PineconeDocumentStore(
        environment="gcp-starter",
        index="my_index",
        namespace="test",
        batch_size=50,
        dimension=30,
        metric="euclidean",
    )

def test_init_fails_wo_api_key(self, monkeypatch):
    """Constructing the store without any API key must raise ValueError.

    The PINECONE_API_KEY environment variable is removed first so the
    store cannot fall back on it.
    """
    monkeypatch.delenv("PINECONE_API_KEY", raising=False)

    with pytest.raises(ValueError):
        PineconeDocumentStore(
            api_key=None,
            environment="gcp-starter",
            index="my_index",
        )

@patch("pinecone_haystack.document_store.pinecone")
def test_to_dict(self, mock_pinecone):
mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}
Expand All @@ -52,30 +78,6 @@ def test_to_dict(self, mock_pinecone):
},
}

@patch("pinecone_haystack.document_store.pinecone")
def test_from_dict(self, mock_pinecone):
mock_pinecone.Index.return_value.describe_index_stats.return_value = {"dimension": 30}

data = {
"type": "pinecone_haystack.document_store.PineconeDocumentStore",
"init_parameters": {
"environment": "gcp-starter",
"index": "my_index",
"dimension": 30,
"namespace": "test",
"batch_size": 50,
"metric": "euclidean",
},
}

document_store = PineconeDocumentStore.from_dict(data)
assert document_store.environment == "gcp-starter"
assert document_store.index == "my_index"
assert document_store.namespace == "test"
assert document_store.batch_size == 50
assert document_store.dimension == 30
assert document_store.index_creation_kwargs == {"metric": "euclidean"}

def test_embedding_retrieval(self, document_store: PineconeDocumentStore, sleep_time):
docs = [
Document(content="Most similar document", embedding=[1.0] * 10),
Expand Down
4 changes: 2 additions & 2 deletions integrations/pinecone/tests/test_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ def test_write_documents_duplicate_overwrite(self, document_store: PineconeDocum
time.sleep(sleep_time)
self.assert_documents_are_equal(document_store.filter_documents(), [doc1])

@pytest.mark.skip(reason="Qdrant only supports UPSERT operations")
@pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
def test_write_documents_duplicate_fail(self, document_store: PineconeDocumentStore):
...

@pytest.mark.skip(reason="Qdrant only supports UPSERT operations")
@pytest.mark.skip(reason="Pinecone only supports UPSERT operations")
def test_write_documents_duplicate_skip(self, document_store: PineconeDocumentStore):
...

0 comments on commit c63eac2

Please sign in to comment.