diff --git a/README.md b/README.md index 4f5e01bb..a299ae1a 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Canopy

- + Supported Python versions - + Package version

diff --git a/config/config.yaml b/config/config.yaml index 0d3576cd..e4ab611b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -93,10 +93,6 @@ chat_engine: knowledge_base: params: default_top_k: 5 # The default number of document chunks to retrieve for each query -# index_params: # Optional - index creation parameters for `create_canopy_index()` or `canopy new` -# metric: cosine -# pod_type: p1 - # -------------------------------------------------------------------------- # Configuration for the Chunker subcomponent of the knowledge base. diff --git a/src/canopy/knowledge_base/knowledge_base.py b/src/canopy/knowledge_base/knowledge_base.py index 314a93c5..2cb1d45e 100644 --- a/src/canopy/knowledge_base/knowledge_base.py +++ b/src/canopy/knowledge_base/knowledge_base.py @@ -28,7 +28,6 @@ RESERVED_METADATA_KEYS = {"document_id", "text", "source"} DELETE_STARTER_BATCH_SIZE = 30 - DELETE_STARTER_CHUNKS_PER_DOC = 32 @@ -190,14 +189,13 @@ def __init__(self, else: self._pinecone_client = _get_global_client() - # Normally, index creation params are passed directly to the `.create_canopy_index()` method. # noqa: E501 - # However, when KnowledgeBase is initialized from a config file, these params - # would be set by the `KnowledgeBase.from_config()` constructor. - self._index_params: Dict[str, Any] = {} - # The index object is initialized lazily, when the user calls `connect()` or # `create_canopy_index()` self._index: Optional[Index] = None + self._default_spec = ServerlessSpec( + cloud='aws', + region='us-west-2' + ) def _connect_index(self) -> None: if self.index_name not in list_canopy_indexes(self._pinecone_client): @@ -262,12 +260,9 @@ def verify_index_connection(self) -> None: ) from e def create_canopy_index(self, - spec: Union[Dict, ServerlessSpec, PodSpec] = ServerlessSpec( - cloud='aws', - region='us-west-2' - ), + spec: Union[Dict, ServerlessSpec, PodSpec] = None, dimension: Optional[int] = None, - index_params: Optional[dict] = None + metric: Optional[str] = "cosine" ): """ Creates the underlying Pinecone index that will be used by the KnowledgeBase. @@ -292,12 +287,14 @@ def create_canopy_index(self, dimension: The dimension of the vectors to index. If `dimension` isn't explicitly provided, Canopy would try to infer the embedding's dimension based on the configured `Encoder` - index_params: A dictionary of parameters to pass to the index creation API. - For example, you can set the index's number of replicas by passing {"replicas": 2}. - see https://docs.pinecone.io/docs/python-client#create_index + metric: The distance metric to be used for similarity search: 'euclidean', 'cosine', or 'dotproduct'. The + default is 'cosine'. """ # noqa: E501 + if spec is None: + spec = self._default_spec + if dimension is None: try: encoder_dimension = self._encoder.dimension @@ -318,15 +315,13 @@ def create_canopy_index(self, "If you wish to delete it, use `delete_index()`. " ) - # create index - index_params = index_params or self._index_params try: self._pinecone_client.create_index( name=self.index_name, dimension=dimension, spec=spec, timeout=TIMEOUT_INDEX_CREATE, - **index_params) + metric=metric) except (Exception, PineconeApiException) as e: raise RuntimeError( f"Failed to create index {self.index_name} due to error: " @@ -633,11 +628,7 @@ def from_config(cls, ) config['params']['index_name'] = index_name - # If the config includes an 'index_params' key, they need to be saved until - # the index is created, and then passed to the index creation method. - index_params = config['params'].pop('index_params', {}) kb = cls._from_config(config) - kb._index_params = index_params return kb @staticmethod diff --git a/tests/e2e/test_app.py b/tests/e2e/test_app.py index bf6ecf9d..254b12f3 100644 --- a/tests/e2e/test_app.py +++ b/tests/e2e/test_app.py @@ -46,7 +46,7 @@ def assert_vector_ids_not_exist(vector_ids: List[str], @retry(reraise=True, stop=stop_after_attempt(5), wait=wait_random(min=10, max=20)) def try_create_canopy_index(kb: KnowledgeBase): - kb.create_canopy_index(index_params={"metric": "dotproduct"}) + kb.create_canopy_index(metric="dotproduct") @pytest.fixture(scope="module") diff --git a/tests/system/knowledge_base/test_knowledge_base.py b/tests/system/knowledge_base/test_knowledge_base.py index 56ef4df0..7cc8c378 100644 --- a/tests/system/knowledge_base/test_knowledge_base.py +++ b/tests/system/knowledge_base/test_knowledge_base.py @@ -62,7 +62,7 @@ def encoder(): @retry(reraise=True, stop=stop_after_attempt(5), wait=wait_random(min=10, max=20)) def try_create_canopy_index(kb: KnowledgeBase): - kb.create_canopy_index(index_params={"metric": "dotproduct"}) + kb.create_canopy_index(metric="dotproduct") @pytest.fixture(scope="module", autouse=True)