diff --git a/docs/docs/how_to/indexing.ipynb b/docs/docs/how_to/indexing.ipynb index e3e6ec8aef6d7..cdce7015614ce 100644 --- a/docs/docs/how_to/indexing.ipynb +++ b/docs/docs/how_to/indexing.ipynb @@ -60,7 +60,7 @@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `AzureCosmosDBNoSqlVectorSearch`, `AzureCosmosDBVectorSearch`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MongoDBAtlasVectorSearch`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SingleStoreDB`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `Yellowbrick`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n", + "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `AzureCosmosDBNoSqlVectorSearch`, `AzureCosmosDBVectorSearch`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `CrateDBVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MongoDBAtlasVectorSearch`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SingleStoreDB`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `Yellowbrick`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n", " \n", "## Caution\n", "\n", diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index d331fb66e85dd..fa6a8dade0916 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -77,6 +77,7 @@ 
requests-toolbelt>=1.0.0,<2 rspace_client>=2.5.0,<3 scikit-learn>=1.2.2,<2 simsimd>=5.0.0,<6 +sqlalchemy-cratedb>=0.40.1,<1 sqlite-vss>=0.1.2,<0.2 sqlite-vec>=0.1.0,<0.2 sseclient-py>=1.8.0,<2 diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index c38beea0ed6d2..76ca824bd4e3e 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -92,6 +92,9 @@ from langchain_community.vectorstores.couchbase import ( CouchbaseVectorStore, ) + from langchain_community.vectorstores.cratedb import ( + CrateDBVectorStore, + ) from langchain_community.vectorstores.dashvector import ( DashVector, ) @@ -334,6 +337,7 @@ "Clickhouse", "ClickhouseSettings", "CouchbaseVectorStore", + "CrateDBVectorStore", "DashVector", "DatabricksVectorSearch", "DeepLake", @@ -438,6 +442,7 @@ "Clickhouse": "langchain_community.vectorstores.clickhouse", "ClickhouseSettings": "langchain_community.vectorstores.clickhouse", "CouchbaseVectorStore": "langchain_community.vectorstores.couchbase", + "CrateDBVectorStore": "langchain_community.vectorstores.cratedb", "DashVector": "langchain_community.vectorstores.dashvector", "DatabricksVectorSearch": "langchain_community.vectorstores.databricks_vector_search", # noqa: E501 "DeepLake": "langchain_community.vectorstores.deeplake", diff --git a/libs/community/langchain_community/vectorstores/cratedb/__init__.py b/libs/community/langchain_community/vectorstores/cratedb/__init__.py new file mode 100644 index 0000000000000..de17f15f7b252 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/cratedb/__init__.py @@ -0,0 +1,7 @@ +from .base import CrateDBVectorStore +from .extended import CrateDBVectorStoreMultiCollection + +__all__ = [ + "CrateDBVectorStore", + "CrateDBVectorStoreMultiCollection", +] diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py 
b/libs/community/langchain_community/vectorstores/cratedb/base.py new file mode 100644 index 0000000000000..a9ac1a285bdd9 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -0,0 +1,475 @@ +from __future__ import annotations + +import enum +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, +) + +import sqlalchemy +from langchain.docstore.document import Document +from langchain.schema.embeddings import Embeddings +from langchain.utils import get_from_dict_or_env +from langchain.vectorstores.pgvector import PGVector +from sqlalchemy.orm import sessionmaker + +from langchain_community.vectorstores.cratedb.model import ModelFactory + + +class DistanceStrategy(str, enum.Enum): + """ + Enumerator of the Distance strategies. + + Note that CrateDB and Lucene currently only implement + similarity based on the Euclidean distance. + + > Today, when creating a FLOAT_VECTOR, it uses the default + > EUCLIDEAN_HNSW (L2) similarity. + > + > -- https://github.com/crate/crate/issues/15768 + """ + + EUCLIDEAN = "euclidean" + COSINE = "cosine" + MAX_INNER_PRODUCT = "inner" + + +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN + + +_LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain" + + +def _results_to_docs(docs_and_scores: Any) -> List[Document]: + """Return docs from docs and scores.""" + return [doc for doc, _ in docs_and_scores] + + +class CrateDBVectorStore(PGVector): + """`CrateDB` vector store. + + To use it, please install the Python package `sqlalchemy-cratedb`. + + uv pip install --upgrade sqlalchemy-cratedb + + Args: + connection_string: Database connection string. + embedding_function: Any embedding function implementing + `langchain.embeddings.base.Embeddings` interface. + collection_name: The name of the collection to use. (default: langchain) + NOTE: This is not the name of the table, but the name of the collection. 
def __post_init__(
    self,
) -> None:
    """
    Wire up the engine and session factory, and reset the model slots.
    """

    engine = self._bind
    self._engine = engine
    self.Session = sessionmaker(bind=engine)  # type: ignore[call-overload]

    # Patch dialect to invoke `REFRESH TABLE` after each DML operation.
    from sqlalchemy_cratedb.support import refresh_after_dml

    refresh_after_dml(engine)

    # The SQLAlchemy models are built later, because the dimension size
    # of the embedding vectors is only known at runtime.
    self.BaseModel = None
    self.CollectionStore = None  # type: ignore[assignment]
    self.EmbeddingStore = None  # type: ignore[assignment]
def add_embeddings(
    self,
    texts: Iterable[str],
    embeddings: List[List[float]],
    metadatas: Optional[List[dict]] = None,
    ids: Optional[List[str]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add embeddings to the vectorstore.

    Args:
        texts: Iterable of strings to add to the vectorstore.
        embeddings: List of list of embedding vectors.
        metadatas: List of metadatas associated with the texts.
        ids: Optional list of ids, forwarded unchanged to the parent class.
        kwargs: vectorstore specific parameters

    Returns:
        The ids returned by the parent implementation, or an empty list
        when `embeddings` is empty (nothing is created in that case).

    Raises:
        sqlalchemy.exc.ProgrammingError: When pre-deleting the collection
            fails for a reason other than the table not existing yet.
    """
    from sqlalchemy_cratedb.support import refresh_table

    if not embeddings:
        return []
    # The first vector determines the dimensionality of the storage models.
    self._init_models(embeddings[0])

    # When the user requested to delete the collection before running subsequent
    # operations on it, run the deletion gracefully if the table does not exist
    # yet.
    if self.pre_delete_collection:
        try:
            self.delete_collection()
        except sqlalchemy.exc.ProgrammingError as ex:
            if "RelationUnknown" not in str(ex):
                raise

    # Tables need to be created at runtime, because the `EmbeddingStore.embedding`
    # field, a `FloatVector`, needs to be initialized with a dimensionality
    # parameter, which is only obtained at runtime.
    self.create_tables_if_not_exists()
    self.create_collection()

    # After setting up the table/collection at runtime, add embeddings.
    embedding_ids = super().add_embeddings(
        texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs
    )

    # Use the session as a context manager so it is closed after the refresh,
    # the same way `delete` does it, instead of leaking a throwaway session.
    with self.Session() as session:
        refresh_table(session, self.EmbeddingStore)
    return embedding_ids
def _query_collection_multi(
    self,
    collections: List[Any],
    embedding: List[float],
    k: int = 4,
    filter: Optional[Dict[str, str]] = None,
) -> List[Any]:
    """Query one or more collections for the nearest neighbours of `embedding`.

    Args:
        collections: Collection records; their `name` and `uuid` attributes
            are read.
        embedding: The query vector.
        k: Passed to `knn_match` and also used as the result limit.
        filter: Optional metadata filter. A plain value is compared as
            `cmetadata[key] == str(value)`; a dict value with an "in" key
            (matched case-insensitively) becomes an `IN` clause.

    Returns:
        The SQLAlchemy query yielding `(EmbeddingStore, similarity)` rows,
        ordered by descending similarity.
    """
    self._init_models(embedding)

    collection_names = [coll.name for coll in collections]
    collection_uuids = [coll.uuid for coll in collections]
    self.logger.info(f"Querying collections: {collection_names}")

    # Operator key recognized inside dict-valued filters.
    IN = "in"

    with self.Session() as session:
        filter_by = self.EmbeddingStore.collection_id.in_(collection_uuids)

        if filter is not None:
            filter_clauses = []
            for key, value in filter.items():
                if isinstance(value, dict) and IN in map(str.lower, value):
                    # Normalize operator keys so "IN" / "In" / "in" all match.
                    value_case_insensitive = {
                        op.lower(): operand for op, operand in value.items()
                    }
                    filter_by_metadata = self.EmbeddingStore.cmetadata[key].in_(
                        value_case_insensitive[IN]
                    )
                else:
                    filter_by_metadata = self.EmbeddingStore.cmetadata[key] == str(
                        value
                    )  # type: ignore[assignment]
                filter_clauses.append(filter_by_metadata)

            filter_by = sqlalchemy.and_(filter_by, *filter_clauses)  # type: ignore[assignment]

        results: List[Any] = (
            session.query(  # type: ignore[attr-defined]
                self.EmbeddingStore,
                # TODO: Original pgvector code uses `self.distance_strategy`.
                #       CrateDB currently only supports EUCLIDEAN.
                #       self.distance_strategy(embedding).label("distance")
                sqlalchemy.func.vector_similarity(
                    self.EmbeddingStore.embedding,
                    # TODO: Just reference the `embedding` symbol here, don't
                    #       serialize its value prematurely.
                    #       https://github.com/crate/crate/issues/16912
                    #
                    # Until that got fixed, marshal the arguments to
                    # `vector_similarity()` manually, in order to work around
                    # this edge case bug. We don't need to use JSON marshalling,
                    # because Python's string representation of a list is just
                    # right.
                    sqlalchemy.text(str(embedding)),
                ).label("similarity"),
            )
            .filter(filter_by)
            # CrateDB applies `KNN_MATCH` within the `WHERE` clause.
            .filter(
                sqlalchemy.func.knn_match(
                    self.EmbeddingStore.embedding, embedding, k
                )
            )
            .order_by(sqlalchemy.desc("similarity"))
            .join(
                self.CollectionStore,
                self.EmbeddingStore.collection_id == self.CollectionStore.uuid,
            )
            .limit(k)
        )
    return results
@classmethod
def connection_string_from_db_params(
    cls,
    driver: str,
    host: str,
    port: int,
    database: str,
    user: str,
    password: str,
) -> str:
    """Return connection string from database parameters.

    Args:
        driver: SQLAlchemy driver name, used as the URL's `drivername`.
        host: Database host.
        port: Database port.
        database: Passed as the `schema` query parameter of the URL.
        user: User name.
        password: Password; included verbatim in the returned string.

    Returns:
        The rendered SQLAlchemy URL, including the real password.
    """
    url = sqlalchemy.URL.create(
        drivername=driver,
        host=host,
        port=port,
        username=user,
        password=password,
        query={"schema": database},
    )
    # `str(url)` masks the password as `***` on SQLAlchemy 2.x, which would make
    # the result unusable for connecting. Render it explicitly instead.
    return url.render_as_string(hide_password=False)
+ """ + if self.override_relevance_score_fn is not None: + return self.override_relevance_score_fn + + # Default strategy is to rely on distance strategy provided + # in vectorstore constructor + if self._distance_strategy == DistanceStrategy.COSINE: + return self._cosine_relevance_score_fn + elif self._distance_strategy == DistanceStrategy.EUCLIDEAN: + return self._euclidean_relevance_score_fn + elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + else: + raise ValueError( + "No supported normalization function for distance_strategy of " + f"{self._distance_strategy}. Consider providing relevance_score_fn to " + "CrateDBVectorStore constructor." + ) + + @staticmethod + def _euclidean_relevance_score_fn(similarity: float) -> float: + """Return a similarity score on a scale [0, 1].""" + # The 'correct' relevance function + # may differ depending on a few things, including: + # - the distance / similarity metric used by the VectorStore + # - the scale of your embeddings (OpenAI's are unit normed. Many + # others are not!) + # - embedding dimensionality + # - etc. + # This function converts the Euclidean norm of normalized embeddings + # (0 is most similar, sqrt(2) most dissimilar) + # to a similarity function (0 to 1) + + # CrateDB uses the `vector_similarity()` SQL function in this context, + # which already returns a normalized value. 
class CrateDBVectorStoreMultiCollection(CrateDBVectorStore):
    """
    Provide functionality for searching multiple collections.
    It can not be used for indexing documents.

    To use it, you should have the ``sqlalchemy-cratedb`` Python package installed.

    Args:
        connection_string: Database connection string.
        embedding_function: Embedding function used for queries.
        collection_names: Names of the collections to search. When omitted,
            only the default collection is searched.
        distance_strategy: The distance strategy to use.
        logger: Optional logger; defaults to this module's logger.
        relevance_score_fn: Optional override for the relevance score function.
        connection: Optional existing connection to use instead of creating
            an engine.
        engine_args: Optional engine arguments, stored as `self.engine_args`.

    Synopsis::

        from langchain_community.vectorstores.cratedb import CrateDBVectorStoreMultiCollection

        multisearch = CrateDBVectorStoreMultiCollection(
            collection_names=["collection_foo", "collection_bar"],
            embedding_function=embeddings,
            connection_string=CONNECTION_STRING,
        )
        docs_with_score = multisearch.similarity_search_with_score(query)
    """  # noqa: E501

    def __init__(
        self,
        connection_string: str,
        embedding_function: Embeddings,
        collection_names: Optional[List[str]] = None,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,  # type: ignore[arg-type]
        logger: Optional[logging.Logger] = None,
        relevance_score_fn: Optional[Callable[[float], float]] = None,
        *,
        connection: Optional[sqlalchemy.engine.Connection] = None,
        engine_args: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.connection_string = connection_string
        self.embedding_function = embedding_function
        # Build the default per instance: a list literal in the signature would
        # be one shared object, so mutating it on one store would leak into
        # every other store created with the default.
        if collection_names is None:
            collection_names = [_LANGCHAIN_DEFAULT_COLLECTION_NAME]
        self.collection_names = collection_names
        self._distance_strategy = distance_strategy  # type: ignore[assignment]
        self.logger = logger or logging.getLogger(__name__)
        self.override_relevance_score_fn = relevance_score_fn
        self.engine_args = engine_args or {}

        # Create a connection if not provided, otherwise use the provided connection
        self._bind = connection if connection else self._create_engine()

        self.__post_init__()

    @classmethod
    def _from(cls, *args: List, **kwargs: Dict):  # type: ignore[no-untyped-def,override]
        """Always raise: this adapter is for searching only."""
        raise NotImplementedError("This adapter can not be used for indexing documents")

    def get_collections(self, session: sqlalchemy.orm.Session) -> Any:
        """Look up the collection records matching `self.collection_names`.

        Raises:
            RuntimeError: When the storage models have not been initialized yet.
        """
        if self.CollectionStore is None:
            raise RuntimeError(
                "Collection can't be accessed without specifying "
                "dimension size of embedding vectors"
            )
        return self.CollectionStore.get_by_names(session, self.collection_names)

    def _query_collection(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
    ) -> List[Any]:
        """Query multiple collections.

        Raises:
            ValueError: When none of the configured collections exist.
        """
        self._init_models(embedding)
        with self.Session() as session:
            collections = self.get_collections(session)
            if not collections:
                raise ValueError("No collections found")
            return self._query_collection_multi(
                collections=collections, embedding=embedding, k=k, filter=filter
            )
+ # Base = declarative_base(metadata=MetaData(schema="langchain")) # type: Any + + class BaseModel(Base): + """Base model for the SQL stores.""" + + __abstract__ = True + uuid = sqlalchemy.Column( + sqlalchemy.String, primary_key=True, default=generate_uuid + ) + + class CollectionStore(BaseModel): + """Collection store.""" + + __tablename__ = "collection" + __table_args__ = {"keep_existing": True} + + name = sqlalchemy.Column(sqlalchemy.String) + cmetadata: sqlalchemy.Column = sqlalchemy.Column(ObjectType) + + embeddings = relationship( + "EmbeddingStore", + back_populates="collection", + cascade="all, delete-orphan", + passive_deletes=False, + ) + + @classmethod + def get_by_name( + cls, session: Session, name: str + ) -> Optional["CollectionStore"]: + return session.query(cls).filter(cls.name == name).first() # type: ignore[attr-defined] + + @classmethod + def get_by_names( + cls, session: Session, names: List[str] + ) -> List["CollectionStore"]: + return session.query(cls).filter(cls.name.in_(names)).all() # type: ignore[attr-defined] + + @classmethod + def get_or_create( + cls, + session: Session, + name: str, + cmetadata: Optional[dict] = None, + ) -> Tuple["CollectionStore", bool]: + """ + Get or create a collection. + Returns [Collection, bool] where the bool is True + if the collection was created. 
+ """ + created = False + collection = cls.get_by_name(session, name) + if collection: + return collection, created + + collection = cls(name=name, cmetadata=cmetadata) + session.add(collection) + session.commit() + created = True + return collection, created + + class EmbeddingStore(BaseModel): + """Embedding store.""" + + __tablename__ = "embedding" + __table_args__ = {"keep_existing": True} + + collection_id = sqlalchemy.Column( + sqlalchemy.String, + sqlalchemy.ForeignKey( + f"{CollectionStore.__tablename__}.uuid", + ondelete="CASCADE", + ), + ) + collection = relationship("CollectionStore", back_populates="embeddings") + + embedding: sqlalchemy.Column = sqlalchemy.Column( + FloatVector(self.dimensions) + ) + document: sqlalchemy.Column = sqlalchemy.Column( + sqlalchemy.String, nullable=True + ) + cmetadata: sqlalchemy.Column = sqlalchemy.Column(ObjectType, nullable=True) + + # custom_id : any user defined id + custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) + + self.Base = Base + self.BaseModel = BaseModel + self.CollectionStore = CollectionStore + self.EmbeddingStore = EmbeddingStore diff --git a/libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml b/libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml new file mode 100644 index 0000000000000..b547b2c766f20 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml @@ -0,0 +1,20 @@ +version: "3" + +services: + postgresql: + image: crate/crate:nightly + environment: + - CRATE_HEAP_SIZE=4g + ports: + - "4200:4200" + - "5432:5432" + command: | + crate -Cdiscovery.type=single-node + healthcheck: + test: + [ + "CMD-SHELL", + "curl --silent --fail http://localhost:4200/ || exit 1", + ] + interval: 5s + retries: 60 diff --git a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py new file mode 100644 index 
@pytest.fixture
def session(engine: sa.Engine) -> Generator[sa.orm.Session, None, None]:
    """
    Yield an ORM session bound to a connection of the test engine.
    """
    with engine.connect() as connection, Session(connection) as db_session:
        yield db_session
class FakeEmbeddingsWithAdaDimension(FakeEmbeddings):
    """Fake embeddings with `ADA_TOKEN_COUNT` dimensions, for testing."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one vector per text: all ones, with the text's index last."""
        leading_ones = [float(1.0)] * (ADA_TOKEN_COUNT - 1)
        vectors = []
        for position, _text in enumerate(texts):
            vectors.append(leading_ones + [float(position)])
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Return the query vector: all ones, with 0.0 as the last component."""
        vector = [float(1.0)] * (ADA_TOKEN_COUNT - 1)
        vector.append(float(0.0))
        return vector
def test_cratedb_texts() -> None:
    """Index three texts and check that searching "foo" returns "foo"."""
    store = CrateDBVectorStore.from_texts(
        texts=["foo", "bar", "baz"],
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
"""Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorStore.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] + + +def test_cratedb_with_metadatas_with_scores() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorStore.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] + + +def test_cratedb_with_filter_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorStore.from_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + # TODO: Original: + # assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] # noqa: E501 + output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"}) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] + + +def test_cratedb_with_filter_distant_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in 
def test_cratedb_collection_delete() -> None:
    """
    Test end to end collection construction and deletion.
    Uses two different collections of embeddings.

    Both collections are created with a single text each. After deleting
    the "foo" collection, only the "bar" collection and its embedding
    must be left.
    """

    store_foo = CrateDBVectorStore.from_texts(
        texts=["foo"],
        collection_name="test_collection_foo",
        collection_metadata={"category": "foo"},
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=[{"document": "foo"}],
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    store_bar = CrateDBVectorStore.from_texts(
        texts=["bar"],
        collection_name="test_collection_bar",
        collection_metadata={"category": "bar"},
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=[{"document": "bar"}],
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    # Use the session as a context manager, so it is closed again even
    # when one of the assertions below fails.
    with store_foo.Session() as session:
        # Verify data in database.
        collection_foo = store_foo.get_collection(session)
        collection_bar = store_bar.get_collection(session)
        assert (
            collection_foo is not None
        ), "Expected CollectionStore objects but received None"
        assert (
            collection_bar is not None
        ), "Expected CollectionStore objects but received None"
        assert collection_foo.embeddings[0].cmetadata == {"document": "foo"}
        assert collection_bar.embeddings[0].cmetadata == {"document": "bar"}

        # Delete first collection.
        store_foo.delete_collection()

        # Verify that the "foo" collection has been deleted.
        collection_foo = store_foo.get_collection(session)
        collection_bar = store_bar.get_collection(session)
        assert (
            collection_bar is not None
        ), "Expected CollectionStore object but received None"
        assert collection_foo is None
        assert collection_bar.embeddings[0].cmetadata == {"document": "bar"}

        # Verify that associated embeddings also have been deleted:
        # a single embedding record is expected to be left.
        embeddings_count = session.query(store_foo.EmbeddingStore).count()
        assert embeddings_count == 1
def test_cratedb_collection_read_only(session: Session) -> None:
    """
    Search a collection which never received any embeddings.

    This is the pure "retrieval" case: nothing gets indexed upfront, so
    the embedding dimensionality has to be figured out from the supplied
    `embedding_function`.
    """
    collection_name = "baz2"

    # Provision a fake collection item to search in.
    ensure_collection(session, collection_name)

    # The embedding must come _with_ dimensionality, otherwise the data
    # access layer is unable to figure it out at runtime.
    store = CrateDBVectorStore(
        collection_name=collection_name,
        connection_string=CONNECTION_STRING,
        embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(),
    )
    results = store.similarity_search("foo", k=1)

    # The collection is empty, because no documents/embeddings have been
    # loaded. Consequently, the search yields no results.
    assert results == []
def test_cratedb_retriever_search_threshold() -> None:
    """Verify that the retriever honours the configured score threshold."""
    documents = ["foo", "bar", "baz"]
    page_metadata = [{"page": str(index)} for index, _ in enumerate(documents)]
    store = CrateDBVectorStore.from_texts(
        texts=documents,
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        metadatas=page_metadata,
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )

    search_options = {"k": 3, "score_threshold": 0.35}  # Original value: 0.999
    retriever = store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs=search_options,
    )
    results = retriever.invoke("summer")

    # Three documents are requested, but only two are expected back.
    expected = [
        Document(page_content="foo", metadata={"page": "0"}),
        Document(page_content="bar", metadata={"page": "1"}),
    ]
    assert results == expected
def test_cratedb_max_marginal_relevance_search() -> None:
    """Verify that max marginal relevance search returns the best match."""
    store = CrateDBVectorStore.from_texts(
        texts=["foo", "bar", "baz"],
        collection_name="test_collection",
        embedding=FakeEmbeddingsWithAdaDimension(),
        connection_string=CONNECTION_STRING,
        pre_delete_collection=True,
    )
    # Fetch three candidates, but ask for a single result only.
    results = store.max_marginal_relevance_search("foo", k=1, fetch_k=3)
    expected = [Document(page_content="foo")]
    assert results == expected
def test_cratedb_multicollection_fail_indexing_not_permitted() -> None:
    """
    Indexing documents through `CrateDBVectorStoreMultiCollection` is
    expected to fail with `NotImplementedError`.
    """
    expected_message = "This adapter can not be used for indexing documents"
    with pytest.raises(NotImplementedError, match=expected_message):
        CrateDBVectorStoreMultiCollection.from_texts(
            texts=["foo"],
            collection_name="test_collection",
            embedding=FakeEmbeddingsWithAdaDimension(),
            connection_string=CONNECTION_STRING,
        )
def test_cratedb_multicollection_no_embedding_dimension() -> None:
    """
    Verify that addressing collections fails when not specifying dimensions.

    The store is created without an embedding function, and accessing the
    collection is expected to raise a `RuntimeError`.
    """
    store = CrateDBVectorStoreMultiCollection(
        embedding_function=None,  # type: ignore[arg-type]
        connection_string=CONNECTION_STRING,
    )
    # Use the session as a context manager, so it is closed again
    # regardless of the outcome of the check.
    with store.Session() as session:
        with pytest.raises(RuntimeError) as ex:
            store.get_collection(session)
    assert ex.match(
        "Collection can't be accessed without specifying "
        "dimension size of embedding vectors"
    )