Cohere integration (#203)

* add initial implementation of dense Cohere encoder * added cohere encoder test * add cohere api key to action envs * switch to english as default * [pyproject] Added cohere optional dependency So that users could use Canopy with Cohere embedding * [README] Added Cohere API key * Update action.yml and add --all-extras flag * Fix pinecone-text depedency Should be `^`, not hard coded single version. --------- Co-authored-by: DosticJelena <[email protected]> Co-authored-by: ilai <[email protected]> Co-authored-by: miararoy <[email protected]> Co-authored-by: igiloh-pinecone <[email protected]>
pinecone-io · Dec 12, 2023 · 5815be7 · 5815be7
1 parent 230034b
commit 5815be7
Show file tree

Hide file tree

Showing 7 changed files with 116 additions and 2 deletions.
diff --git a/.github/actions/install-deps-and-canopy/action.yml b/.github/actions/install-deps-and-canopy/action.yml
@@ -37,7 +37,7 @@ runs:
   - name: Install dependencies
     shell: bash
     if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
-    run: poetry install --no-interaction --no-root --with dev
+    run: poetry install --no-interaction --no-root --all-extras --with dev
   - name: Install project
     if: ${{ inputs.install-canopy == 'true' }}
     shell: bash

diff --git a/.github/workflows/PR.yml b/.github/workflows/PR.yml
@@ -73,6 +73,7 @@ jobs:
         PINECONE_API_KEY: ${{ matrix.pinecone-plan == 'paid' && secrets.PINECONE_API_KEY_3 || secrets.PINECONE_API_KEY_4 }}
         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         ANYSCALE_API_KEY: ${{ secrets.ANYSCALE_API_KEY }}
+        CO_API_KEY: ${{ secrets.CO_API_KEY }}
       run: poetry run pytest -n 3 --dist loadscope --html=report_system.html --self-contained-html tests/system
     - name: Run e2e tests
       if: github.event_name == 'merge_group'
@@ -81,6 +82,7 @@ jobs:
         PINECONE_ENVIRONMENT: ${{ matrix.pinecone-plan == 'paid' && secrets.PINECONE_ENVIRONMENT_3 || secrets.PINECONE_ENVIRONMENT_4 }}
         PINECONE_API_KEY: ${{ matrix.pinecone-plan == 'paid' && secrets.PINECONE_API_KEY_3 || secrets.PINECONE_API_KEY_4 }}
         OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        CO_API_KEY: ${{ secrets.CO_API_KEY }}
         CE_LOG_FILENAME: e2e.log
       run: poetry run pytest -n 3 --dist loadscope --html=report_e2e.html --self-contained-html tests/e2e
     - name: upload pytest report.html

diff --git a/README.md b/README.md
@@ -79,6 +79,7 @@ export INDEX_NAME="<INDEX_NAME>"
 | `PINECONE_ENVIRONMENT`| Determines the Pinecone service cloud environment of your index e.g `west1-gcp`, `us-east-1-aws`, etc                       | You can find the Pinecone environment next to the API key in [console](https://app.pinecone.io/)                                                                             |
 | `OPENAI_API_KEY`      | API key for OpenAI. Used to authenticate to OpenAI's services for embedding and chat API                                    | You can find your OpenAI API key [here](https://platform.openai.com/account/api-keys). You might need to login or register to OpenAI services                                |
 | `ANYSCALE_API_KEY`    | API key for Anyscale. Used to authenticate to Anyscale Endpoints for open source LLMs                                    | You can register Anyscale Endpoints and find your API key [here](https://app.endpoints.anyscale.com/)
+| `CO_API_KEY`          | API key for Cohere. Used to authenticate to Cohere services for embedding                                                   | You can find more information on registering with Cohere [here](https://cohere.com/pricing) |
 | `INDEX_NAME`          | Name of the Pinecone index Canopy will underlying work with                                                                  | You can choose any name as long as it follows Pinecone's [restrictions](https://support.pinecone.io/hc/en-us/articles/11729246212637-Are-there-restrictions-on-index-names-#:~:text=There%20are%20two%20main%20restrictions,and%20emojis%20are%20not%20supported.)                                                                                       |
 | `CANOPY_CONFIG_FILE` | The path of a configuration yaml file to be used by the Canopy server. | Optional - if not provided, default configuration would be used |
 </details>

diff --git a/pyproject.toml b/pyproject.toml
@@ -29,10 +29,14 @@ types-pyyaml = "^6.0.12.12"
 jsonschema = "^4.2.0"
 types-jsonschema = "^4.2.0"
 prompt-toolkit = "^3.0.39"
-pinecone-text = "^0.7.0"
+pinecone-text = "^0.7.1"
 tokenizers = "^0.15.0"
 transformers = "^4.35.2"
 sentencepiece = "^0.1.99"
+cohere = { version = ">=4.37", optional = true }
+
+[tool.poetry.extras]
+cohere = ["cohere"]
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"

diff --git a/src/canopy/knowledge_base/record_encoder/__init__.py b/src/canopy/knowledge_base/record_encoder/__init__.py
@@ -1,3 +1,4 @@
 from .base import RecordEncoder
+from .cohere import CohereEncoder
 from .dense import DenseRecordEncoder
 from .openai import OpenAIRecordEncoder
diff --git a/src/canopy/knowledge_base/record_encoder/cohere.py b/src/canopy/knowledge_base/record_encoder/cohere.py
@@ -0,0 +1,53 @@
+from typing import List
+from pinecone_text.dense.cohere_encoder import CohereEncoder
+from canopy.knowledge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery
+from canopy.knowledge_base.record_encoder.dense import DenseRecordEncoder
+from canopy.models.data_models import Query
+
+
+class CohereRecordEncoder(DenseRecordEncoder):
+    """
+    CohereRecordEncoder is a type of DenseRecordEncoder that uses the Cohere `embed` API.
+    The implementation uses the `CohereEncoder` class from the `pinecone-text` library.
+    For more information see: https://github.com/pinecone-io/pinecone-text
+
+    Only synchronous encoding is implemented; the async batch hooks raise
+    NotImplementedError.
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        *,
+        model_name: str = "embed-english-v3.0",
+        batch_size: int = 100,
+        **kwargs,
+    ):
+        """
+        Initialize the CohereRecordEncoder
+
+        Args:
+            model_name: The name of the Cohere embeddings model to use for encoding. See https://docs.cohere.com/reference/embed
+                        Defaults to "embed-english-v3.0".
+            batch_size: The number of documents or queries to encode at once.
+                        Defaults to 100.
+            **kwargs: Additional arguments to pass to the underlying `pinecone-text` `CohereEncoder`.
+        """  # noqa: E501
+        encoder = CohereEncoder(model_name, **kwargs)
+        super().__init__(dense_encoder=encoder, batch_size=batch_size)
+
+    def encode_documents(self, documents: List[KBDocChunk]) -> List[KBEncodedDocChunk]:
+        """
+        Encode a list of documents, takes a list of KBDocChunk and returns a list of KBEncodedDocChunk.
+
+        Args:
+            documents: A list of KBDocChunk to encode.
+
+        Returns:
+            encoded chunks: A list of KBEncodedDocChunk, with the `values` field populated by the generated embeddings vector.
+        """  # noqa: E501
+        # Delegates unchanged to the parent implementation.
+        return super().encode_documents(documents)
+
+    async def _aencode_documents_batch(
+        self, documents: List[KBDocChunk]
+    ) -> List[KBEncodedDocChunk]:
+        """Async document encoding is not implemented for this encoder."""
+        raise NotImplementedError
+
+    async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
+        """Async query encoding is not implemented for this encoder."""
+        raise NotImplementedError
diff --git a/tests/system/record_encoder/test_cohere_record_encoder.py b/tests/system/record_encoder/test_cohere_record_encoder.py
@@ -0,0 +1,53 @@
+import pytest
+
+from canopy.knowledge_base.models import KBDocChunk
+from canopy.knowledge_base.record_encoder.cohere import CohereRecordEncoder
+from canopy.models.data_models import Query
+
+
+documents = [KBDocChunk(
+            id=f"doc_1_{i}",
+            text=f"Sample document {i}",
+            document_id=f"doc_{i}",
+            metadata={"test": i},
+            source="doc_1",
+        )
+        for i in range(4)
+    ]
+
+queries = [Query(text="Sample query 1"),
+           Query(text="Sample query 2"),
+           Query(text="Sample query 3"),
+           Query(text="Sample query 4")]
+
+
+@pytest.fixture
+def encoder():
+    return CohereRecordEncoder(batch_size=2)
+
+
+def test_dimension(encoder):
+    assert encoder.dimension == 1024
+
+
+@pytest.mark.parametrize("items,function",
+                         [(documents, "encode_documents"),
+                          (queries, "encode_queries"),
+                          ([], "encode_documents"),
+                          ([], "encode_queries")])
+def test_encode_documents(encoder, items, function):
+
+    encoded_documents = getattr(encoder, function)(items)
+
+    assert len(encoded_documents) == len(items)
+    assert all(len(encoded.values) == encoder.dimension
+               for encoded in encoded_documents)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("items,function",
+                         [("aencode_documents", documents),
+                          ("aencode_queries", queries)])
+async def test_aencode_not_implemented(encoder, function, items):
+    with pytest.raises(NotImplementedError):
+        await encoder.aencode_queries(items)