Skip to content
This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Commit

Permalink
Add dimension support for OpenAI embeddings (#273)
Browse files Browse the repository at this point in the history
* Add dimension support for OpenAI embeddings

* Fix Jina tests
  • Loading branch information
izellevy authored Feb 4, 2024
1 parent 95c7b24 commit 2ca606f
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 32 deletions.
3 changes: 1 addition & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,8 @@ chat_engine:
type: OpenAIRecordEncoder # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder]
params:
model_name: # The name of the model to use for encoding
text-embedding-ada-002
text-embedding-3-small
batch_size: 400 # The number of document chunks to encode in each call to the encoding model

create_index_params:
# -------------------------------------------------------------------------------------------
# Initialization parameters to be passed to create a canopy index. These parameters will
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ types-pyyaml = "^6.0.12.12"
jsonschema = "^4.2.0"
types-jsonschema = "^4.2.0"
prompt-toolkit = "^3.0.39"
pinecone-text = [{version = "^0.7.2"},
{version = "^0.7.2", extras = ["dense"], optional = true}]
pinecone-text = [{version = "^0.8.0"},
{version = "^0.8.0", extras = ["dense"], optional = true}]

tokenizers = "^0.15.0"
transformers = {version = "^4.35.2", optional = true}
Expand Down
33 changes: 14 additions & 19 deletions src/canopy/knowledge_base/knowledge_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ def verify_index_connection(self) -> None:

def create_canopy_index(self,
spec: Union[Dict, ServerlessSpec, PodSpec] = None,
dimension: Optional[int] = None,
metric: Optional[str] = "cosine"
):
"""
Expand All @@ -283,9 +282,6 @@ def create_canopy_index(self,
spec: A dictionary containing configurations describing how the index should be deployed. For serverless indexes,
specify region and cloud. For pod indexes, specify replicas, shards, pods, pod_type, metadata_config,
and source_collection.
dimension: The dimension of the vectors to index.
If `dimension` isn't explicitly provided,
Canopy would try to infer the embedding's dimension based on the configured `Encoder`
metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The
default is 'cosine'.
Expand All @@ -297,22 +293,21 @@ def create_canopy_index(self,
region="us-west-2"
)

if dimension is None:
try:
encoder_dimension = self._encoder.dimension
if encoder_dimension is None:
raise RuntimeError(
f"The selected encoder {self._encoder.__class__.__name__} does "
f"not support inferring the vectors' dimensionality."
)
dimension = encoder_dimension
except Exception as e:
try:
encoder_dimension = self._encoder.dimension
if encoder_dimension is None:
raise RuntimeError(
f"Canopy has failed to infer vectors' dimensionality using the "
f"selected encoder: {self._encoder.__class__.__name__}. You can "
f"provide the dimension manually, try using a different encoder, or"
f" fix the underlying error:\n{e}"
) from e
f"The selected encoder {self._encoder.__class__.__name__} does "
f"not support inferring the vectors' dimensionality."
)
dimension = encoder_dimension
except Exception as e:
raise RuntimeError(
f"Canopy has failed to infer vectors' dimensionality using the "
f"selected encoder: {self._encoder.__class__.__name__}. You can "
f"provide the dimension manually, try using a different encoder, or"
f" fix the underlying error:\n{e}"
) from e

if self.index_name in list_canopy_indexes(self._pinecone_client):
raise RuntimeError(
Expand Down
5 changes: 1 addition & 4 deletions src/canopy/knowledge_base/record_encoder/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,11 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
def dimension(self) -> int:
"""
The dimension is the length of the vector generated by the `DenseRecordEncoder`
Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder
is working properly.
Returns:
dimension(int): the dimension of the encoder
""" # noqa: E501
dummy_doc = KBDocChunk(text="hello", id="dummy_doc", document_id="dummy_doc")
return len(self.encode_documents([dummy_doc])[0].values)
return self._dense_encoder.dimension

async def _aencode_documents_batch(self,
documents: List[KBDocChunk]
Expand Down
8 changes: 5 additions & 3 deletions src/canopy/knowledge_base/record_encoder/openai.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List
from typing import List, Optional

from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError
from pinecone_text.dense.openai_encoder import OpenAIEncoder
Expand All @@ -18,8 +18,9 @@ class OpenAIRecordEncoder(DenseRecordEncoder):
def __init__(
self,
*,
model_name: str = "text-embedding-ada-002",
model_name: str = "text-embedding-3-small",
batch_size: int = 400,
dimension: Optional[int] = None,
**kwargs
):
"""
Expand All @@ -29,10 +30,11 @@ def __init__(
model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings
batch_size: The number of documents or queries to encode at once.
Defaults to 400.
dimension: The dimension of the embeddings vector to generate.
**kwargs: Additional arguments to pass to the underlying `pinecone-text` `OpenAIEncoder`.
""" # noqa: E501
try:
encoder = OpenAIEncoder(model_name, **kwargs)
encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs)
except OpenAIError as e:
raise RuntimeError(
"Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY "
Expand Down
7 changes: 5 additions & 2 deletions tests/unit/record_encoder/test_jina_record_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,16 @@ def encoder():
def test_dimension(encoder):
with patch('pinecone_text.dense.JinaEncoder.encode_documents') \
as mock_encode_documents:
mock_encode_documents.return_value = [[0.1, 0.2, 0.3]]
mock_encode_documents.return_value = [0.1, 0.2, 0.3]
assert encoder.dimension == 3


def custom_encode(*args, **kwargs):
input_to_encode = args[0]
return [[0.1, 0.2, 0.3] for _ in input_to_encode]
if isinstance(input_to_encode, list):
return [[0.1, 0.2, 0.3] for _ in input_to_encode]
else:
return [0.1, 0.2, 0.3]


@pytest.mark.parametrize("items,function",
Expand Down

0 comments on commit 2ca606f

Please sign in to comment.