Add dimension support for OpenAI embeddings (#273)

* Add dimension support for OpenAI embeddings
* Fix Jina tests
pinecone-io · Feb 4, 2024 · 2ca606f · 2ca606f
1 parent 95c7b24
commit 2ca606f
Show file tree

Hide file tree

Showing 6 changed files with 28 additions and 32 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -112,9 +112,8 @@ chat_engine:
         type: OpenAIRecordEncoder       # Options: [OpenAIRecordEncoder, AzureOpenAIRecordEncoder]
         params:
           model_name:                   # The name of the model to use for encoding
-            text-embedding-ada-002
+            text-embedding-3-small
           batch_size: 400               # The number of document chunks to encode in each call to the encoding model
-
 create_index_params:
   # -------------------------------------------------------------------------------------------
   # Initialization parameters to be passed to create a canopy index. These parameters will

diff --git a/pyproject.toml b/pyproject.toml
@@ -28,8 +28,8 @@ types-pyyaml = "^6.0.12.12"
 jsonschema = "^4.2.0"
 types-jsonschema = "^4.2.0"
 prompt-toolkit = "^3.0.39"
-pinecone-text = [{version = "^0.7.2"},
-                 {version = "^0.7.2", extras = ["dense"], optional = true}]
+pinecone-text = [{version = "^0.8.0"},
+                 {version = "^0.8.0", extras = ["dense"], optional = true}]
 
 tokenizers = "^0.15.0"
 transformers = {version = "^4.35.2", optional = true}

diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py
@@ -260,7 +260,6 @@ def verify_index_connection(self) -> None:
 
     def create_canopy_index(self,
                             spec: Union[Dict, ServerlessSpec, PodSpec] = None,
-                            dimension: Optional[int] = None,
                             metric: Optional[str] = "cosine"
                             ):
         """
@@ -283,9 +282,6 @@ def create_canopy_index(self,
            spec: A dictionary containing configurations describing how the index should be deployed. For serverless indexes,
                  specify region and cloud. For pod indexes, specify replicas, shards, pods, pod_type, metadata_config,
                  and source_collection.
-           dimension: The dimension of the vectors to index.
-                       If `dimension` isn't explicitly provided,
-                       Canopy would try to infer the embedding's dimension based on the configured `Encoder`
            metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The
                    default is 'cosine'.
 
@@ -297,22 +293,21 @@ def create_canopy_index(self,
                 region="us-west-2"
             )
 
-        if dimension is None:
-            try:
-                encoder_dimension = self._encoder.dimension
-                if encoder_dimension is None:
-                    raise RuntimeError(
-                        f"The selected encoder {self._encoder.__class__.__name__} does "
-                        f"not support inferring the vectors' dimensionality."
-                    )
-                dimension = encoder_dimension
-            except Exception as e:
+        try:
+            encoder_dimension = self._encoder.dimension
+            if encoder_dimension is None:
                 raise RuntimeError(
-                    f"Canopy has failed to infer vectors' dimensionality using the "
-                    f"selected encoder: {self._encoder.__class__.__name__}. You can "
-                    f"provide the dimension manually, try using a different encoder, or"
-                    f" fix the underlying error:\n{e}"
-                ) from e
+                    f"The selected encoder {self._encoder.__class__.__name__} does "
+                    f"not support inferring the vectors' dimensionality."
+                )
+            dimension = encoder_dimension
+        except Exception as e:
+            raise RuntimeError(
+                f"Canopy has failed to infer vectors' dimensionality using the "
+                f"selected encoder: {self._encoder.__class__.__name__}. You can "
+                f"provide the dimension manually, try using a different encoder, or"
+                f" fix the underlying error:\n{e}"
+            ) from e
 
         if self.index_name in list_canopy_indexes(self._pinecone_client):
             raise RuntimeError(

diff --git a/src/canopy/knowledge_base/record_encoder/dense.py b/src/canopy/knowledge_base/record_encoder/dense.py
@@ -58,14 +58,11 @@ def _encode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
     def dimension(self) -> int:
         """
         The dimension is the length of the vector generated by the `DenseRecordEncoder`
-        Canopy will run a single word through the encoder to get the dimension, this will also validate that the encoder
-        is working properly.
 
         Returns:
             dimension(int): the dimension of the encoder
         """  # noqa: E501
-        dummy_doc = KBDocChunk(text="hello", id="dummy_doc", document_id="dummy_doc")
-        return len(self.encode_documents([dummy_doc])[0].values)
+        return self._dense_encoder.dimension
 
     async def _aencode_documents_batch(self,
                                        documents: List[KBDocChunk]

diff --git a/src/canopy/knowledge_base/record_encoder/openai.py b/src/canopy/knowledge_base/record_encoder/openai.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from openai import OpenAIError, RateLimitError, APIConnectionError, AuthenticationError
 from pinecone_text.dense.openai_encoder import OpenAIEncoder
@@ -18,8 +18,9 @@ class OpenAIRecordEncoder(DenseRecordEncoder):
     def __init__(
         self,
         *,
-        model_name: str = "text-embedding-ada-002",
+        model_name: str = "text-embedding-3-small",
         batch_size: int = 400,
+        dimension: Optional[int] = None,
         **kwargs
     ):
         """
@@ -29,10 +30,11 @@ def __init__(
             model_name: The name of the OpenAI embeddings model to use for encoding. See https://platform.openai.com/docs/models/embeddings
             batch_size: The number of documents or queries to encode at once.
                         Defaults to 400.
+            dimension: The dimension of the embeddings vector to generate.
             **kwargs: Additional arguments to pass to the underlying `pinecone-text. OpenAIEncoder`.
         """  # noqa: E501
         try:
-            encoder = OpenAIEncoder(model_name, **kwargs)
+            encoder = OpenAIEncoder(model_name, dimension=dimension, **kwargs)
         except OpenAIError as e:
             raise RuntimeError(
                 "Failed to connect to OpenAI, please make sure that the OPENAI_API_KEY "

diff --git a/tests/unit/record_encoder/test_jina_record_encoder.py b/tests/unit/record_encoder/test_jina_record_encoder.py
@@ -30,13 +30,16 @@ def encoder():
 def test_dimension(encoder):
     with patch('pinecone_text.dense.JinaEncoder.encode_documents') \
             as mock_encode_documents:
-        mock_encode_documents.return_value = [[0.1, 0.2, 0.3]]
+        mock_encode_documents.return_value = [0.1, 0.2, 0.3]
         assert encoder.dimension == 3
 
 
 def custom_encode(*args, **kwargs):
     input_to_encode = args[0]
-    return [[0.1, 0.2, 0.3] for _ in input_to_encode]
+    if isinstance(input_to_encode, list):
+        return [[0.1, 0.2, 0.3] for _ in input_to_encode]
+    else:
+        return [0.1, 0.2, 0.3]
 
 
 @pytest.mark.parametrize("items,function",