From 41433e65394e4189b80a238bdede02d1c14d9421 Mon Sep 17 00:00:00 2001
From: anna-charlotte
Date: Mon, 24 Apr 2023 13:40:22 +0200
Subject: [PATCH 1/6] feat: add in-memory and hnswlib vectorstore

Signed-off-by: anna-charlotte
---
 langchain/vectorstores/hnsw_lib.py            | 261 +++++++++++++++++
 langchain/vectorstores/in_memory.py           | 230 +++++++++++++++
 poetry.lock                                   | 245 ++++++++++++------
 pyproject.toml                                |   7 +-
 .../vectorstores/test_hnsw_lib.py             |  54 ++++
 .../vectorstores/test_in_memory.py            |  48 ++++
 6 files changed, 758 insertions(+), 87 deletions(-)
 create mode 100644 langchain/vectorstores/hnsw_lib.py
 create mode 100644 langchain/vectorstores/in_memory.py
 create mode 100644 tests/integration_tests/vectorstores/test_hnsw_lib.py
 create mode 100644 tests/integration_tests/vectorstores/test_in_memory.py

diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py
new file mode 100644
index 0000000000000..6974133c9891b
--- /dev/null
+++ b/langchain/vectorstores/hnsw_lib.py
@@ -0,0 +1,261 @@
+"""Wrapper around HnswLib DocArray store."""
+from __future__ import annotations
+
+from operator import itemgetter
+from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema import Document
+from langchain.vectorstores import VectorStore
+from langchain.vectorstores.base import VST
+from langchain.vectorstores.utils import maximal_marginal_relevance
+
+from docarray import BaseDoc
+from docarray.typing import NdArray
+
+
+class HnswLib(VectorStore):
+    """Wrapper around HnswLib storage.
+
+    To use it, you should have the ``docarray`` package with version >=0.30.0 installed.
+    """
+    def __init__(
+        self,
+        work_dir: str,
+        n_dim: int,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]],
+        sim_metric: str = 'cosine',
+        kwargs: dict = None
+    ) -> None:
+        """Initialize HnswLib store.
+
+        Args:
+            work_dir: Directory handed to ``HnswDocumentIndex`` as ``work_dir``.
+            n_dim: Dimension of the embedding vectors.
+            texts: Texts to embed and index.
+            embedding: Embedding function used for texts and queries.
+            metadatas: Optional list of metadatas associated with the texts.
+            sim_metric: Similarity space of the index. Defaults to 'cosine'.
+            kwargs: Extra keyword arguments; currently unused.
+        """
+        try:
+            import docarray
+            da_version = docarray.__version__.split('.')
+            if int(da_version[0]) == 0 and int(da_version[1]) < 30:
+                raise ValueError(
+                    f'To use the HnswLib VectorStore the docarray version >=0.30.0 is expected, '
+                    f'received: {docarray.__version__}. '
+                    f'To upgrade, please run: `pip install -U docarray`.'
+                )
+            else:
+                from docarray import DocList
+                from docarray.index import HnswDocumentIndex
+        except ImportError:
+            raise ImportError(
+                "Could not import docarray python package. "
+                "Please install it with `pip install -U docarray`."
+            )
+        try:
+            import google.protobuf
+        except ImportError:
+            raise ImportError(
+                "Could not import protobuf python package. "
+                "Please install it with `pip install -U protobuf`."
+            )
+
+        if metadatas is None:
+            metadatas = [{} for _ in range(len(texts))]
+
+        self.embedding = embedding
+
+        self.doc_cls = self._get_doc_cls(n_dim, sim_metric)
+        self.doc_index = HnswDocumentIndex[self.doc_cls](work_dir=work_dir)
+        embeddings = self.embedding.embed_documents(texts)
+        docs = DocList[self.doc_cls](
+            [
+                self.doc_cls(
+                    text=t,
+                    embedding=e,
+                    metadata=m,
+                ) for t, m, e in zip(texts, metadatas, embeddings)
+            ]
+        )
+        self.doc_index.index(docs)
+
+    @staticmethod
+    def _get_doc_cls(n_dim: int, sim_metric: str):
+        """Build the document schema stored in the index."""
+        from pydantic import Field
+
+        class DocArrayDoc(BaseDoc):
+            text: Optional[str]
+            embedding: Optional[NdArray] = Field(dim=n_dim, space=sim_metric)
+            metadata: Optional[dict]
+
+        return DocArrayDoc
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        work_dir: str = None,
+        n_dim: int = None,
+        **kwargs: Any
+    ) -> HnswLib:
+        """Create an HnswLib store from raw texts.
+
+        Args:
+            texts: Texts to embed and index.
+            embedding: Embedding function used for texts and queries.
+            metadatas: Optional list of metadatas associated with the texts.
+            work_dir: Directory of the index. Required.
+            n_dim: Dimension of the embedding vectors. Required.
+
+        Returns:
+            HnswLib vectorstore containing the given texts.
+
+        Raises:
+            ValueError: If `work_dir` or `n_dim` is not set.
+        """
+        if work_dir is None:
+            raise ValueError('`work_dir` parameter has not been set.')
+        if n_dim is None:
+            raise ValueError('`n_dim` parameter has not been set.')
+
+        return cls(
+            work_dir=work_dir,
+            n_dim=n_dim,
+            texts=texts,
+            embedding=embedding,
+            metadatas=metadatas,
+            kwargs=kwargs
+        )
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        # Materialize once: `texts` may be a one-shot iterator.
+        texts = list(texts)
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        ids = []
+        embeddings = self.embedding.embed_documents(texts)
+        for t, m, e in zip(texts, metadatas, embeddings):
+            doc = self.doc_cls(
+                text=t,
+                embedding=e,
+                metadata=m
+            )
+            self.doc_index.index(doc)
+            ids.append(doc.id)  # TODO return index of self.docs ?
+
+        return ids
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query and score for each.
+        """
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k)
+
+        result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)]
+        return result
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        results = self.similarity_search_with_score(query, k)
+        return list(map(itemgetter(0), results))
+
+    def _similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        raise NotImplementedError
+
+    def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+
+        query_doc = self.doc_cls(embedding=embedding)
+        docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents
+
+        result = [Document(page_content=doc.text) for doc in docs]
+        return result
+
+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+
+        docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k)
+
+        embeddings = [doc.embedding for doc in docs]
+
+        mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
+        # MMR indices are positions within the fetched `docs`, not ids of the index.
+        results = [Document(page_content=docs[int(idx)].text) for idx in mmr_selected]
+        return results
+
diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory.py
new file mode 100644
index 0000000000000..a079b10da7887
--- /dev/null
+++ b/langchain/vectorstores/in_memory.py
@@ -0,0 +1,230 @@
+"""Wrapper around in-memory DocArray store."""
+from __future__ import annotations
+
+from operator import itemgetter
+from typing import List, Optional, Any, Tuple, Iterable, Type
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema import Document
+from langchain.vectorstores import VectorStore
+from langchain.vectorstores.base import VST
+from langchain.vectorstores.utils import maximal_marginal_relevance
+
+from docarray import BaseDoc
+from docarray.typing import NdArray
+
+
+class InMemory(VectorStore):
+    """Wrapper around in-memory storage.
+
+    To use it, you should have the ``docarray`` package with version >=0.30.0 installed.
+    """
+    def __init__(
+        self,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]]
+    ) -> None:
+        """Initialize in-memory store.
+
+        Args:
+            texts: Texts to embed and store.
+            embedding: Embedding function used for texts and queries.
+            metadatas: Optional list of metadatas associated with the texts.
+        """
+        try:
+            import docarray
+            da_version = docarray.__version__.split('.')
+            if int(da_version[0]) == 0 and int(da_version[1]) < 30:
+                raise ValueError(
+                    f'To use the InMemory VectorStore the docarray version >=0.30.0 is expected, '
+                    f'received: {docarray.__version__}. '
+                    f'To upgrade, please run: `pip install -U docarray`.'
+                )
+            else:
+                from docarray import DocList
+
+        except ImportError:
+            raise ImportError(
+                "Could not import docarray python package. "
+                "Please install it with `pip install -U docarray`."
+            )
+        if metadatas is None:
+            metadatas = [{} for _ in range(len(texts))]
+
+        self.embedding = embedding
+        self.doc_cls = self._get_doc_cls()
+        self.docs = DocList[self.doc_cls](
+            [
+                self.doc_cls(
+                    text=t,
+                    embedding=e,
+                    metadata=m,
+                ) for t, m, e in zip(texts, metadatas, self.embedding.embed_documents(texts))
+            ]
+        )
+
+    @staticmethod
+    def _get_doc_cls():
+        """Build the document schema held in the store."""
+        class DocArrayDoc(BaseDoc):
+            text: Optional[str]
+            embedding: Optional[NdArray]
+            metadata: Optional[dict]
+
+        # DocArrayDoc.update_forward_refs()
+        return DocArrayDoc
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any
+    ) -> InMemory:
+        """Create an InMemory store from raw texts.
+
+        Args:
+            texts: Texts to embed and store.
+            embedding: Embedding function used for texts and queries.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            InMemory vectorstore containing the given texts.
+        """
+        return cls(
+            texts=texts,
+            embedding=embedding,
+            metadatas=metadatas
+        )
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        # Materialize once: `texts` may be a one-shot iterator.
+        texts = list(texts)
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        ids = []
+        embeddings = self.embedding.embed_documents(texts)
+        for t, m, e in zip(texts, metadatas, embeddings):
+            doc = self.doc_cls(
+                text=t,
+                embedding=e,
+                metadata=m
+            )
+            self.docs.append(doc)
+            ids.append(doc.id)  # TODO return index of self.docs ?
+
+        return ids
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query and score for each.
+        """
+        from docarray.utils.find import find  # TODO move import
+
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding')
+
+        result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)]
+        return result
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        results = self.similarity_search_with_score(query, k)
+        return list(map(itemgetter(0), results))
+
+    def _similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        raise NotImplementedError
+
+    def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+        from docarray.utils.find import find
+
+        query_doc = self.doc_cls(embedding=embedding)
+        result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents
+
+        result = [Document(page_content=doc.text) for doc in result_docs]
+        return result
+
+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        from docarray.utils.find import find
+
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        # Fetch `fetch_k` candidates; MMR then picks `k` of them.
+        candidates = find(
+            index=self.docs, query=query_doc, limit=fetch_k, search_field='embedding'
+        ).documents
+
+        embeddings = [doc.embedding for doc in candidates]
+        mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
+        # MMR indices are positions within `candidates`, not within `self.docs`.
+        return [Document(page_content=candidates[int(idx)].text) for idx in mmr_selected]
+
diff --git a/poetry.lock b/poetry.lock
index 1138b9196a649..fc785b03aebfb 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
[[package]] name = "absl-py" @@ -1515,32 +1515,40 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] [[package]] name = "docarray" -version = "0.21.0" -description = "The data structure for unstructured data" +version = "0.30.0" +description = "The data structure for multimodal data" category = "main" optional = true -python-versions = "*" +python-versions = ">=3.7,<4.0" files = [ - {file = "docarray-0.21.0.tar.gz", hash = "sha256:3c9f605123800c1b0cdf8c458be3fb19c05e9a81f723e51200ef531b02e689ee"}, + {file = "docarray-0.30.0-py3-none-any.whl", hash = "sha256:739dbe06bfee6f1cbc030156036764ca1c75832dcc01a07c724640c6d464651b"}, + {file = "docarray-0.30.0.tar.gz", hash = "sha256:dd73e9ff20485a1d819ac906a59ee0cbc4382e78a5061286e77eb7d7f8b28a8e"}, ] [package.dependencies] -jina-hubble-sdk = ">=0.24.0" -numpy = "*" -rich = ">=12.0.0" - -[package.extras] -annlite = ["annlite"] -benchmark = ["h5py", "matplotlib", "pandas", "seaborn"] -common = ["Pillow", "fastapi", "lz4", "matplotlib", "protobuf (>=3.13.0)", "pydantic (>=1.9.0)", "requests", "uvicorn"] -elasticsearch = ["elasticsearch (>=8.2.0)"] -full = ["Pillow", "av", "fastapi", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "ipython", "lz4", "matplotlib", "protobuf (>=3.13.0)", "pydantic (>=1.9.0)", "requests", "scipy", "strawberry-graphql", "trimesh[easy]", "uvicorn"] -milvus = ["pymilvus (>=2.1.0,<2.2.0)"] -opensearch = ["opensearch-py (==2.0.1)"] -qdrant = ["qdrant-client (>=0.10.3,<0.11.0)"] -redis = ["redis (>=4.3.0)"] -test = ["annlite", "black (==22.3.0)", "datasets", "elasticsearch (>=8.2.0)", "jina", "jupyterlab", "mock", "onnx", "onnxruntime", "opensearch-py (==2.0.1)", "paddlepaddle", "protobuf (>=3.13.0,<=3.20.0)", "pymilvus (==2.1.3)", "pytest", "pytest-cov (==3.0.0)", "pytest-custom_exit_code", "pytest-mock", "pytest-mock", "pytest-repeat", "pytest-reraise", "pytest-timeout", "redis (>=4.3.0)", "tensorflow (==2.7.0)", "torch (==1.9.0)", "torchvision 
(==0.10.0)", "transformers (>=4.16.2)", "weaviate-client (>=3.9.0,<3.10.0)"] -weaviate = ["weaviate-client (>=3.9.0,<3.10.0)"] +numpy = ">=1.17.3" +orjson = ">=3.8.2" +pydantic = ">=1.10.2" +rich = ">=13.1.0" +types-requests = ">=2.28.11.6" +typing-inspect = ">=0.8.0" + +[package.extras] +audio = ["pydub (>=0.25.1,<0.26.0)"] +aws = ["smart-open[s3] (>=6.3.0)"] +elasticsearch = ["elastic-transport (>=8.4.0,<9.0.0)", "elasticsearch (>=7.10.1)"] +full = ["av (>=10.0.0)", "lz4 (>=1.0.0)", "pandas (>=1.1.0)", "pillow (>=9.3.0)", "protobuf (>=3.19.0)", "pydub (>=0.25.1,<0.26.0)", "trimesh[easy] (>=3.17.1)", "types-pillow (>=9.3.0.1)"] +hnswlib = ["hnswlib (>=0.6.2)"] +image = ["pillow (>=9.3.0)", "types-pillow (>=9.3.0.1)"] +jac = ["jina-hubble-sdk (>=0.34.0)"] +mesh = ["trimesh[easy] (>=3.17.1)"] +pandas = ["pandas (>=1.1.0)"] +proto = ["lz4 (>=1.0.0)", "protobuf (>=3.19.0)"] +qdrant = ["qdrant-client (>=1.1.4)"] +torch = ["torch (>=1.0.0)"] +video = ["av (>=10.0.0)"] +weaviate = ["weaviate-client (>=3.15)"] +web = ["fastapi (>=0.87.0)"] [[package]] name = "docker" @@ -1740,7 +1748,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2018,26 +2026,24 @@ files = [ [[package]] name = "google-api-core" -version = "2.11.0" +version = "2.8.2" description = "Google API client core library" category = "main" optional = true -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "google-api-core-2.11.0.tar.gz", hash = "sha256:4b9bb5d5a380a0befa0573b302651b8a9a89262c1730e37bf423cec511804c22"}, - {file = "google_api_core-2.11.0-py3-none-any.whl", hash = "sha256:ce222e27b0de0d7bc63eb043b956996d6dccab14cc3b690aaea91c9cc99dc16e"}, + {file = "google-api-core-2.8.2.tar.gz", hash = "sha256:06f7244c640322b508b125903bb5701bebabce8832f85aba9335ec00b3d02edc"}, + {file = "google_api_core-2.8.2-py3-none-any.whl", hash = 
"sha256:93c6a91ccac79079ac6bbf8b74ee75db970cc899278b97d53bc012f35908cf50"}, ] [package.dependencies] -google-auth = ">=2.14.1,<3.0dev" +google-auth = ">=1.25.0,<3.0dev" googleapis-common-protos = ">=1.56.2,<2.0dev" -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" +protobuf = ">=3.15.0,<5.0.0dev" requests = ">=2.18.0,<3.0.0dev" [package.extras] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)", "grpcio-status (>=1.49.1,<2.0dev)"] -grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] -grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)"] [[package]] name = "google-api-python-client" @@ -2151,21 +2157,21 @@ requests = "*" [[package]] name = "googleapis-common-protos" -version = "1.59.0" +version = "1.56.4" description = "Common protobufs used in Google APIs" category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "googleapis-common-protos-1.59.0.tar.gz", hash = "sha256:4168fcb568a826a52f23510412da405abd93f4d23ba544bb68d943b14ba3cb44"}, - {file = "googleapis_common_protos-1.59.0-py2.py3-none-any.whl", hash = "sha256:b287dc48449d1d41af0c69f4ea26242b5ae4c3d7249a38b0984c86a4caffff1f"}, + {file = "googleapis-common-protos-1.56.4.tar.gz", hash = "sha256:c25873c47279387cfdcbdafa36149887901d36202cb645a0e4f29686bf6e4417"}, + {file = "googleapis_common_protos-1.56.4-py2.py3-none-any.whl", hash = "sha256:8eb2cbc91b69feaf23e32452a7ae60e791e09967d81d4fcc7fc388182d1bd394"}, ] [package.dependencies] -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" +protobuf = ">=3.15.0,<5.0.0dev" [package.extras] -grpc = ["grpcio (>=1.44.0,<2.0.0dev)"] +grpc = ["grpcio (>=1.0.0,<2.0.0dev)"] [[package]] name = "gptcache" 
@@ -2483,7 +2489,7 @@ numpy = ">=1.14.5" name = "hnswlib" version = "0.7.0" description = "hnswlib" -category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -2763,7 +2769,7 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2955,20 +2961,20 @@ testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] [[package]] name = "jina" -version = "3.15.0" +version = "3.14.1" description = "Build multimodal AI services via cloud native technologies · Neural Search · Generative AI · MLOps" category = "main" optional = true python-versions = "*" files = [ - {file = "jina-3.15.0.tar.gz", hash = "sha256:18a3be8ddca14ed66a554d8480a277bcb7620ebc6ae11352a9835c91865f9d1e"}, + {file = "jina-3.14.1.tar.gz", hash = "sha256:00b1f5995b13c9a49a2287bd534bd32eb8c05706064752035d569e616a15b411"}, ] [package.dependencies] aiofiles = "*" aiohttp = "*" aiostream = "*" -docarray = ">=0.16.4,<0.30.0" +docarray = ">=0.16.4" docker = "*" fastapi = ">=0.76.0" filelock = "*" @@ -3002,14 +3008,14 @@ websockets = "*" aiofiles = ["aiofiles"] aiohttp = ["aiohttp"] aiostream = ["aiostream"] -all = ["Pillow", "aiofiles", "aiohttp", "aiostream", "black (==22.3.0)", "bs4", "coverage (==6.2)", "docarray (>=0.16.4,<0.30.0)", "docker", "fastapi (>=0.76.0)", "filelock", "flaky", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "jsonschema", "kubernetes (>=18.20.0)", "mock", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", 
"opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "opentelemetry-sdk (>=1.14.0)", "opentelemetry-test-utils (>=0.33b0)", "packaging (>=20.0)", "pathspec", "portforward (>=0.2.4,<0.4.3)", "prometheus-api-client (>=0.5.1)", "prometheus_client (>=0.12.0)", "protobuf (>=3.19.0)", "psutil", "pydantic", "pytest", "pytest-asyncio", "pytest-cov (==3.0.0)", "pytest-custom_exit_code", "pytest-kind (==22.11.1)", "pytest-lazy-fixture", "pytest-mock", "pytest-repeat", "pytest-reraise", "pytest-timeout", "python-multipart", "pyyaml (>=5.3.1)", "requests", "requests-mock", "scipy (>=1.6.1)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] +all = ["Pillow", "aiofiles", "aiohttp", "aiostream", "black (==22.3.0)", "bs4", "coverage (==6.2)", "docarray (>=0.16.4)", "docker", "fastapi (>=0.76.0)", "filelock", "flaky", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "jsonschema", "kubernetes (>=18.20.0)", "mock", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "opentelemetry-sdk (>=1.14.0)", "opentelemetry-test-utils (>=0.33b0)", "packaging (>=20.0)", "pathspec", "portforward (>=0.2.4)", "prometheus-api-client (>=0.5.1)", "prometheus_client (>=0.12.0)", "protobuf (>=3.19.0)", "psutil", "pydantic", "pytest", "pytest-asyncio", "pytest-cov (==3.0.0)", "pytest-custom_exit_code", "pytest-kind (==22.11.1)", "pytest-lazy-fixture", "pytest-mock", "pytest-repeat", "pytest-reraise", "pytest-timeout", "python-multipart", "pyyaml (>=5.3.1)", 
"requests", "requests-mock", "scipy (>=1.6.1)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] black = ["black (==22.3.0)"] bs4 = ["bs4"] -cicd = ["bs4", "jsonschema", "portforward (>=0.2.4,<0.4.3)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch"] -core = ["aiostream", "docarray (>=0.16.4,<0.30.0)", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "packaging (>=20.0)", "protobuf (>=3.19.0)", "pyyaml (>=5.3.1)"] +cicd = ["bs4", "jsonschema", "portforward (>=0.2.4)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch"] +core = ["docarray (>=0.16.4)", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "packaging (>=20.0)", "protobuf (>=3.19.0)", "pyyaml (>=5.3.1)"] coverage = ["coverage (==6.2)"] -devel = ["aiofiles", "aiohttp", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "sgqlc", "strawberry-graphql (>=0.96.0)", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] -docarray = ["docarray (>=0.16.4,<0.30.0)"] +devel = ["aiofiles", "aiohttp", "aiostream", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", 
"opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "sgqlc", "strawberry-graphql (>=0.96.0)", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] +docarray = ["docarray (>=0.16.4)"] docker = ["docker"] fastapi = ["fastapi (>=0.76.0)"] filelock = ["filelock"] @@ -3036,7 +3042,7 @@ packaging = ["packaging (>=20.0)"] pathspec = ["pathspec"] perf = ["opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "prometheus_client (>=0.12.0)", "uvloop"] pillow = ["Pillow"] -portforward = ["portforward (>=0.2.4,<0.4.3)"] +portforward = ["portforward (>=0.2.4)"] prometheus-api-client = ["prometheus-api-client (>=0.5.1)"] prometheus-client = ["prometheus_client (>=0.12.0)"] protobuf = ["protobuf (>=3.19.0)"] @@ -3058,7 +3064,7 @@ requests = ["requests"] requests-mock = ["requests-mock"] scipy = ["scipy (>=1.6.1)"] sgqlc = ["sgqlc"] -standard = ["aiofiles", "aiohttp", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "uvicorn[standard]", "uvloop", "websockets"] +standard = ["aiofiles", "aiohttp", "aiostream", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", 
"opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "uvicorn[standard]", "uvloop", "websockets"] standrad = ["opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)"] strawberry-graphql = ["strawberry-graphql (>=0.96.0)"] tensorflow = ["tensorflow (>=2.0)"] @@ -4991,6 +4997,72 @@ numpy = ">=1.7" docs = ["numpydoc", "sphinx (==1.2.3)", "sphinx-rtd-theme", "sphinxcontrib-napoleon"] tests = ["pytest", "pytest-cov", "pytest-pep8"] +[[package]] +name = "orjson" +version = "3.8.10" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +category = "main" +optional = true +python-versions = ">= 3.7" +files = [ + {file = "orjson-3.8.10-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:4dfe0651e26492d5d929bbf4322de9afbd1c51ac2e3947a7f78492b20359711d"}, + {file = "orjson-3.8.10-cp310-cp310-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:bc30de5c7b3a402eb59cc0656b8ee53ca36322fc52ab67739c92635174f88336"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c08b426fae7b9577b528f99af0f7e0ff3ce46858dd9a7d1bf86d30f18df89a4c"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bce970f293825e008dbf739268dfa41dfe583aa2a1b5ef4efe53a0e92e9671ea"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9b23fb0264bbdd7218aa685cb6fc71f0dcecf34182f0a8596a3a0dff010c06f9"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0826ad2dc1cea1547edff14ce580374f0061d853cbac088c71162dbfe2e52205"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a7bce6e61cea6426309259b04c6ee2295b3f823ea51a033749459fe2dd0423b2"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:0b470d31244a6f647e5402aac7d2abaf7bb4f52379acf67722a09d35a45c9417"}, + {file = "orjson-3.8.10-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:48824649019a25d3e52f6454435cf19fe1eb3d05ee697e65d257f58ae3aa94d9"}, + {file = "orjson-3.8.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:faee89e885796a9cc493c930013fa5cfcec9bfaee431ddf00f0fbfb57166a8b3"}, + {file = "orjson-3.8.10-cp310-none-win_amd64.whl", hash = "sha256:3cfe32b1227fe029a5ad989fbec0b453a34e5e6d9a977723f7c3046d062d3537"}, + {file = "orjson-3.8.10-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:2073b62822738d6740bd2492f6035af5c2fd34aa198322b803dc0e70559a17b7"}, + {file = "orjson-3.8.10-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:b2c4faf20b6bb5a2d7ac0c16f58eb1a3800abcef188c011296d1dc2bb2224d48"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c1825997232a324911d11c75d91e1e0338c7b723c149cf53a5fc24496c048a4"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f7e85d4682f3ed7321d36846cad0503e944ea9579ef435d4c162e1b73ead8ac9"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8cdaacecb92997916603ab232bb096d0fa9e56b418ca956b9754187d65ca06"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ddabc5e44702d13137949adee3c60b7091e73a664f6e07c7b428eebb2dea7bbf"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27bb26e171e9cfdbec39c7ca4739b6bef8bd06c293d56d92d5e3a3fc017df17d"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1810e5446fe68d61732e9743592da0ec807e63972eef076d09e02878c2f5958e"}, + {file = 
"orjson-3.8.10-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:61e2e51cefe7ef90c4fbbc9fd38ecc091575a3ea7751d56fad95cbebeae2a054"}, + {file = "orjson-3.8.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f3e9ac9483c2b4cd794e760316966b7bd1e6afb52b0218f068a4e80c9b2db4f6"}, + {file = "orjson-3.8.10-cp311-none-win_amd64.whl", hash = "sha256:26aee557cf8c93b2a971b5a4a8e3cca19780573531493ce6573aa1002f5c4378"}, + {file = "orjson-3.8.10-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:11ae68f995a50724032af297c92f20bcde31005e0bf3653b12bff9356394615b"}, + {file = "orjson-3.8.10-cp37-cp37m-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:35d879b46b8029e1e01e9f6067928b470a4efa1ca749b6d053232b873c2dcf66"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:345e41abd1d9e3ecfb554e1e75ff818cf42e268bd06ad25a96c34e00f73a327e"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:45a5afc9cda6b8aac066dd50d8194432fbc33e71f7164f95402999b725232d78"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad632dc330a7b39da42530c8d146f76f727d476c01b719dc6743c2b5701aaf6b"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bf2556ba99292c4dc550560384dd22e88b5cdbe6d98fb4e202e902b5775cf9f"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b88afd662190f19c3bb5036a903589f88b1d2c2608fbb97281ce000db6b08897"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:abce8d319aae800fd2d774db1106f926dee0e8a5ca85998fd76391fcb58ef94f"}, + {file = "orjson-3.8.10-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e999abca892accada083f7079612307d94dd14cc105a699588a324f843216509"}, + {file = "orjson-3.8.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:a3fdee68c4bb3c5d6f89ed4560f1384b5d6260e48fbf868bae1a245a3c693d4d"}, + {file = "orjson-3.8.10-cp37-none-win_amd64.whl", hash = "sha256:e5d7f82506212e047b184c06e4bcd48c1483e101969013623cebcf51cf12cad9"}, + {file = "orjson-3.8.10-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:d953e6c2087dcd990e794f8405011369ee11cf13e9aaae3172ee762ee63947f2"}, + {file = "orjson-3.8.10-cp38-cp38-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:81aa3f321d201bff0bd0f4014ea44e51d58a9a02d8f2b0eeab2cee22611be8e1"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d27b6182f75896dd8c10ea0f78b9265a3454be72d00632b97f84d7031900dd4"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1486600bc1dd1db26c588dd482689edba3d72d301accbe4301db4b2b28bd7aa4"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:344ea91c556a2ce6423dc13401b83ab0392aa697a97fa4142c2c63a6fd0bbfef"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:979f231e3bad1c835627eef1a30db12a8af58bfb475a6758868ea7e81897211f"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa3a26dcf0f5f2912a8ce8e87273e68b2a9526854d19fd09ea671b154418e88"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:b6e79d8864794635974b18821b49a7f27859d17b93413d4603efadf2e92da7a5"}, + {file = "orjson-3.8.10-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ce49999bcbbc14791c61844bc8a69af44f5205d219be540e074660038adae6bf"}, + {file = "orjson-3.8.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2ef690335b24f9272dbf6639353c1ffc3f196623a92b851063e28e9515cf7dd"}, + {file = "orjson-3.8.10-cp38-none-win_amd64.whl", hash = "sha256:5a0b1f4e4fa75e26f814161196e365fc0e1a16e3c07428154505b680a17df02f"}, + {file = 
"orjson-3.8.10-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:af7601a78b99f0515af2f8ab12c955c0072ffcc1e437fb2556f4465783a4d813"}, + {file = "orjson-3.8.10-cp39-cp39-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6bbd7b3a3e2030b03c68c4d4b19a2ef5b89081cbb43c05fe2010767ef5e408db"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4355c9aedfefe60904e8bd7901315ebbc8bb828f665e4c9bc94b1432e67cb6f7"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b7b0ba074375e25c1594e770e2215941e2017c3cd121889150737fa1123e8bfe"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34b6901c110c06ab9e8d7d0496db4bc9a0c162ca8d77f67539d22cb39e0a1ef4"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cb62ec16a1c26ad9487727b529103cb6a94a1d4969d5b32dd0eab5c3f4f5a6f2"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:595e1e7d04aaaa3d41113e4eb9f765ab642173c4001182684ae9ddc621bb11c8"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:64ffd92328473a2f9af059410bd10c703206a4bbc7b70abb1bedcd8761e39eb8"}, + {file = "orjson-3.8.10-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1f648ec89c6a426098868460c0ef8c86b457ce1378d7569ff4acb6c0c454048"}, + {file = "orjson-3.8.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6a286ad379972e4f46579e772f0477e6b505f1823aabcd64ef097dbb4549e1a4"}, + {file = "orjson-3.8.10-cp39-none-win_amd64.whl", hash = "sha256:d2874cee6856d7c386b596e50bc517d1973d73dc40b2bd6abec057b5e7c76b2f"}, + {file = "orjson-3.8.10.tar.gz", hash = "sha256:dcf6adb4471b69875034afab51a14b64f1026bc968175a2bb02c5f6b358bd413"}, +] + [[package]] name = "packaging" version = "23.1" @@ -5373,7 +5445,7 @@ typing-extensions = {version = "*", markers = "python_version <= \"3.8\""} name = 
"pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5546,37 +5618,36 @@ requests = "*" [[package]] name = "protobuf" -version = "3.19.6" +version = "3.19.0" description = "Protocol Buffers" category = "main" optional = true python-versions = ">=3.5" files = [ - {file = "protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1"}, - {file = "protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6"}, - {file = "protobuf-3.19.6-cp310-cp310-win32.whl", hash = "sha256:559670e006e3173308c9254d63facb2c03865818f22204037ab76f7a0ff70b5f"}, - {file = "protobuf-3.19.6-cp310-cp310-win_amd64.whl", hash = "sha256:347b393d4dd06fb93a77620781e11c058b3b0a5289262f094379ada2920a3730"}, - {file = "protobuf-3.19.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a8ce5ae0de28b51dff886fb922012dad885e66176663950cb2344c0439ecb473"}, - {file = "protobuf-3.19.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b0d02163c4e67279ddb6dc25e063db0130fc299aefabb5d481053509fae5c8"}, - {file = "protobuf-3.19.6-cp36-cp36m-win32.whl", hash = "sha256:30f5370d50295b246eaa0296533403961f7e64b03ea12265d6dfce3a391d8992"}, - {file = "protobuf-3.19.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0c0714b025ec057b5a7600cb66ce7c693815f897cfda6d6efb58201c472e3437"}, - {file = "protobuf-3.19.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5057c64052a1f1dd7d4450e9aac25af6bf36cfbfb3a1cd89d16393a036c49157"}, - {file = "protobuf-3.19.6-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:bb6776bd18f01ffe9920e78e03a8676530a5d6c5911934c6a1ac6eb78973ecb6"}, - {file = "protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:84a04134866861b11556a82dd91ea6daf1f4925746b992f277b84013a7cc1229"}, - {file = "protobuf-3.19.6-cp37-cp37m-win32.whl", hash = "sha256:4bc98de3cdccfb5cd769620d5785b92c662b6bfad03a202b83799b6ed3fa1fa7"}, - {file = "protobuf-3.19.6-cp37-cp37m-win_amd64.whl", hash = "sha256:aa3b82ca1f24ab5326dcf4ea00fcbda703e986b22f3d27541654f749564d778b"}, - {file = "protobuf-3.19.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2b2d2913bcda0e0ec9a784d194bc490f5dc3d9d71d322d070b11a0ade32ff6ba"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d0b635cefebd7a8a0f92020562dead912f81f401af7e71f16bf9506ff3bdbb38"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a552af4dc34793803f4e735aabe97ffc45962dfd3a237bdde242bff5a3de684"}, - {file = "protobuf-3.19.6-cp38-cp38-win32.whl", hash = "sha256:0469bc66160180165e4e29de7f445e57a34ab68f49357392c5b2f54c656ab25e"}, - {file = "protobuf-3.19.6-cp38-cp38-win_amd64.whl", hash = "sha256:91d5f1e139ff92c37e0ff07f391101df77e55ebb97f46bbc1535298d72019462"}, - {file = "protobuf-3.19.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c0ccd3f940fe7f3b35a261b1dd1b4fc850c8fde9f74207015431f174be5976b3"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0"}, - {file = "protobuf-3.19.6-cp39-cp39-win32.whl", hash = "sha256:5a0d7539a1b1fb7e76bf5faa0b44b30f812758e989e59c40f77a7dab320e79b9"}, - {file = "protobuf-3.19.6-cp39-cp39-win_amd64.whl", hash = "sha256:bbf5cea5048272e1c60d235c7bd12ce1b14b8a16e76917f371c718bd3005f045"}, - {file = "protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4"}, - {file = "protobuf-3.19.6.tar.gz", hash = 
"sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4"}, + {file = "protobuf-3.19.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:01a0645ef3acddfbc90237e1cdfae1086130fc7cb480b5874656193afd657083"}, + {file = "protobuf-3.19.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d3861c9721a90ba83ee0936a9cfcc4fa1c4b4144ac9658fb6f6343b38558e9b4"}, + {file = "protobuf-3.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b64be5d7270cf5e76375bac049846e8a9543a2d4368b69afe78ab725380a7487"}, + {file = "protobuf-3.19.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2f6046b9e2feee0dce994493186e8715b4392ed5f50f356280ad9c2f9f93080a"}, + {file = "protobuf-3.19.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac2f8ec942d414609aba0331952ae12bb823e8f424bbb6b8c422f1cef32dc842"}, + {file = "protobuf-3.19.0-cp36-cp36m-win32.whl", hash = "sha256:3fea09aa04ef2f8b01fcc9bb87f19509934f8a35d177c865b8f9ee5c32b60c1b"}, + {file = "protobuf-3.19.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d1f4277d321f60456845ca9b882c4845736f1f5c1c69eb778eba22a97977d8af"}, + {file = "protobuf-3.19.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8488c2276f14f294e890cc1260ab342a13e90cd20dcc03319d2eea258f1fd321"}, + {file = "protobuf-3.19.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:36bf292f44966c67080e535321501717f4f1eba30faef8f2cd4b0c745a027211"}, + {file = "protobuf-3.19.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99af73ae34c93e0e2ace57ea2e70243f34fc015c8c23fd39ee93652e726f7e7"}, + {file = "protobuf-3.19.0-cp37-cp37m-win32.whl", hash = "sha256:f7a031cf8e2fc14acc0ba694f6dff0a01e06b70d817eba6edc72ee6cc20517ac"}, + {file = "protobuf-3.19.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d4ca5f0c7bc8d2e6966ca3bbd85e9ebe7191b6e21f067896d4af6b28ecff29fe"}, + {file = "protobuf-3.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:9a8a880593015ef2c83f7af797fa4fbf583b2c98b4bd94e46c5b61fee319d84b"}, + {file = "protobuf-3.19.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:6f16925f5c977dd7787973a50c242e60c22b1d1182aba6bec7bd02862579c10f"}, + {file = "protobuf-3.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9097327d277b0aa4a3224e61cd6850aef3269172397715299bcffc9f90293c9"}, + {file = "protobuf-3.19.0-cp38-cp38-win32.whl", hash = "sha256:708d04394a63ee9bdc797938b6e15ed5bf24a1cb37743eb3886fd74a5a67a234"}, + {file = "protobuf-3.19.0-cp38-cp38-win_amd64.whl", hash = "sha256:ee4d07d596357f51316b6ecf1cc1927660e9d5e418385bb1c51fd2496cd9bee7"}, + {file = "protobuf-3.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:34a77b8fafdeb8f89fee2b7108ae60d8958d72e33478680cc1e05517892ecc46"}, + {file = "protobuf-3.19.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:4f93e0f6af796ddd1502225ff8ea25340ced186ca05b601c44d5c88b45ba80a0"}, + {file = "protobuf-3.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:942dd6bc8bd2a3c6a156d8ab0f80bd45313f22b78e1176283270054dcc8ca4c2"}, + {file = "protobuf-3.19.0-cp39-cp39-win32.whl", hash = "sha256:7b3867795708ac88fde8d6f34f0d9a50af56087e41f624bdb2e9ff808ea5dda7"}, + {file = "protobuf-3.19.0-cp39-cp39-win_amd64.whl", hash = "sha256:a74432e9d28a6072a2359a0f49f81eb14dd718e7dbbfb6c0789b456c49e1f130"}, + {file = "protobuf-3.19.0-py2.py3-none-any.whl", hash = "sha256:c96e94d3e523a82caa3e5f74b35dd1c4884199358d01c950d95c341255ff48bc"}, + {file = "protobuf-3.19.0.tar.gz", hash = "sha256:6a1dc6584d24ef86f5b104bcad64fa0fe06ed36e5687f426e0445d363a041d18"}, ] [[package]] @@ -6105,7 +6176,7 @@ Pillow = ">=8.0.0" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -7507,7 +7578,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version 
>= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -7759,18 +7830,18 @@ files = [ [[package]] name = "tensorflow-hub" -version = "0.13.0" +version = "0.12.0" description = "TensorFlow Hub is a library to foster the publication, discovery, and consumption of reusable parts of machine learning models." 
category = "main" optional = true python-versions = "*" files = [ - {file = "tensorflow_hub-0.13.0-py2.py3-none-any.whl", hash = "sha256:3544f4fd9fd99e4eeb6da1b5b5320e4a2dbdef7f9bb778f66f76d6790f32dd65"}, + {file = "tensorflow_hub-0.12.0-py2.py3-none-any.whl", hash = "sha256:822fe5f7338c95efcc3a534011c6689e4309ba2459def87194179c4de8a6e1fc"}, ] [package.dependencies] numpy = ">=1.12.0" -protobuf = ">=3.19.6" +protobuf = ">=3.8.0" [package.extras] make-image-classifier = ["keras-preprocessing[image]"] @@ -8132,7 +8203,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -8414,7 +8485,7 @@ types-pyOpenSSL = "*" name = "types-requests" version = "2.28.11.17" description = "Typing stubs for requests" -category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -8441,7 +8512,7 @@ files = [ name = "types-urllib3" version = "1.26.25.10" description = "Typing stubs for urllib3" -category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -9267,13 +9338,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", 
"jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] cohere = ["cohere"] -llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] +docarray = ["docarray", "protobuf"] +embeddings = ["sentence-transformers"] +llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "ab6ea1c53c7a6e792d5bdcf8865b87e5dcfe4c89080c18b356dc4ed8a17cc3a3" +content-hash = "81e7b09595d12739f056c5f5d34021ad7e3f855a8da711d3ccc23aab72cfbd83" diff --git a/pyproject.toml b/pyproject.toml index 0eec46451897e..61406f1db2e0c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,10 @@ pytesseract = {version = "^0.3.10", optional=true} html2text = {version="^2020.1.16", optional=true} numexpr = "^2.8.4" duckduckgo-search = {version="^2.8.6", optional=true} +docarray = {version="^0.30.0", optional=true} +protobuf = {version="3.19", optional=true} +hnswlib = {version="^0.7.0", optional=true} +pytest = "^7.3.1" [tool.poetry.group.docs.dependencies] @@ -145,8 +149,9 @@ llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifes qdrant = ["qdrant-client"] openai = ["openai"] cohere = ["cohere"] +docarray = ["docarray", "protobuf"] embeddings = ["sentence-transformers"] -all = 
["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] [tool.ruff] select = [ diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py new file mode 100644 index 0000000000000..7aa3481cf19e7 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -0,0 +1,54 @@ +import pytest + +from langchain.schema import Document +from langchain.vectorstores.hnsw_lib import HnswLib +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_docarray_hnswlib_vec_store_init(tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + texts = 
["foo", "bar", "baz"] + docsearch = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + sim_metric='cosine', + ) + assert isinstance(docsearch, HnswLib) + + +@pytest.fixture +def docarray_vec_store(tmp_path): + texts = ["foo", "bar", "baz"] + docsearch = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) + return docsearch + + +def test_sim_search(docarray_vec_store) -> None: + """Test end to end construction and simple similarity search.""" + + output = docarray_vec_store.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_sim_search_with_score(docarray_vec_store) -> None: + """Test end to end construction and similarity search with score.""" + + output = docarray_vec_store.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo"), 1.0)] + + +def test_sim_search_by_vector(docarray_vec_store): + """Test end to end construction and similarity search by vector.""" + embedding = [1.0] * 10 + output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + + assert output == [Document(page_content="bar")] + + diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory.py new file mode 100644 index 0000000000000..79458727310a8 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_in_memory.py @@ -0,0 +1,48 @@ +import pytest + +from langchain.schema import Document +from langchain.vectorstores.in_memory import InMemory +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_docarray_vec_store_init() -> None: + """Test end to end construction and simple similarity search.""" + texts = ["foo", "bar", "baz"] + docsearch = InMemory.from_texts( + texts, + FakeEmbeddings(), + ) + assert isinstance(docsearch, InMemory) + + +@pytest.fixture +def docarray_vec_store(): + texts = ["foo", "bar", "baz"] + docsearch = 
InMemory.from_texts( + texts, + FakeEmbeddings(), + ) + return docsearch + + +def test_sim_search(docarray_vec_store) -> None: + """Test end to end construction and simple similarity search.""" + + output = docarray_vec_store.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_sim_search_with_score(docarray_vec_store) -> None: + """Test end to end construction and similarity search with score.""" + + output = docarray_vec_store.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo"), 1.0)] + + +def test_sim_search_by_vector(docarray_vec_store): + """Test end to end construction and similarity search by vector.""" + embedding = [1.0] * 10 + output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + + assert output == [Document(page_content="bar")] + From b687fd487f596818f0fe9e7230712a0ca0da7ad5 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 12:22:02 +0200 Subject: [PATCH 2/6] refactor: use abtract VecStoreFromDocIndex for in memory and hnswlib implementation Signed-off-by: anna-charlotte --- langchain/vectorstores/hnsw_lib.py | 204 ++----------- langchain/vectorstores/in_memory.py | 273 +++++++++--------- .../vector_store_from_doc_index.py | 186 ++++++++++++ .../vectorstores/test_hnsw_lib.py | 73 +++-- .../vectorstores/test_in_memory.py | 63 +++- 5 files changed, 448 insertions(+), 351 deletions(-) create mode 100644 langchain/vectorstores/vector_store_from_doc_index.py diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 6974133c9891b..51c85423ac2b7 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -1,52 +1,44 @@ """Wrapper around in-memory DocArray store.""" from __future__ import annotations -from operator import itemgetter from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING +from docarray.typing import NdArray from 
langchain.embeddings.base import Embeddings -from langchain.schema import Document -from langchain.vectorstores import VectorStore from langchain.vectorstores.base import VST -from langchain.vectorstores.utils import maximal_marginal_relevance - -from docarray import BaseDoc -from docarray.typing import NdArray +from langchain.vectorstores.vector_store_from_doc_index import VecStoreFromDocIndex, _check_docarray_import -class HnswLib(VectorStore): +class HnswLib(VecStoreFromDocIndex): """Wrapper around HnswLib storage. - To use it, you should have the ``docarray`` package with version >=0.30.0 installed. + To use it, you should have the ``docarray`` package with version >=0.31.0 installed. """ def __init__( self, - work_dir: str, - n_dim: int, texts: List[str], embedding: Embeddings, + work_dir: str, + n_dim: int, metadatas: Optional[List[dict]], - sim_metric: str = 'cosine', - kwargs: dict = None + dist_metric: str = 'cosine', + **kwargs, ) -> None: - """Initialize HnswLib store.""" - try: - import docarray - da_version = docarray.__version__.split('.') - if int(da_version[0]) == 0 and int(da_version[1]) <= 21: - raise ValueError( - f'To use the HnswLib VectorStore the docarray version >=0.30.0 is expected, ' - f'received: {docarray.__version__}.' - f'To upgrade, please run: `pip install -U docarray`.' - ) - else: - from docarray import DocList - from docarray.index import HnswDocumentIndex - except ImportError: - raise ImportError( - "Could not import docarray python package. " - "Please install it with `pip install -U docarray`." - ) + """Initialize HnswLib store. + + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + work_dir (str): path to the location where all the data will be stored. + n_dim (int): dimension of an embedding. + dist_metric (str): Distance metric for HnswLib can be one of: 'cosine', + 'ip', and 'l2'. 
Defaults to 'cosine'. + """ + _check_docarray_import() + from docarray.index import HnswDocumentIndex + try: import google.protobuf except ImportError: @@ -55,27 +47,13 @@ def __init__( "Please install it with `pip install -U protobuf`." ) - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - self.embedding = embedding - - self.doc_cls = self._get_doc_cls(n_dim, sim_metric) - self.doc_index = HnswDocumentIndex[self.doc_cls](work_dir=work_dir) - embeddings = self.embedding.embed_documents(texts) - docs = DocList[self.doc_cls]( - [ - self.doc_cls( - text=t, - embedding=e, - metadata=m, - ) for t, m, e in zip(texts, metadatas, embeddings) - ] - ) - self.doc_index.index(docs) + doc_cls = self._get_doc_cls(n_dim, dist_metric) + doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) + super().__init__(doc_index, texts, embedding, metadatas) @staticmethod def _get_doc_cls(n_dim: int, sim_metric: str): + from docarray import BaseDoc from pydantic import Field class DocArrayDoc(BaseDoc): @@ -93,6 +71,7 @@ def from_texts( metadatas: Optional[List[dict]] = None, work_dir: str = None, n_dim: int = None, + dist_metric: str = 'cosine', **kwargs: Any ) -> HnswLib: @@ -107,129 +86,6 @@ def from_texts( texts=texts, embedding=embedding, metadatas=metadatas, - kwargs=kwargs + dist_metric=dist_metric, + kwargs=kwargs, ) - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - - Returns: - List of ids from adding the texts into the vectorstore. 
- """ - if metadatas is None: - metadatas = [{} for _ in range(len(list(texts)))] - - ids = [] - embeddings = self.embedding.embed_documents(texts) - for t, m, e in zip(texts, metadatas, embeddings): - doc = self.doc_cls( - text=t, - embedding=e, - metadata=m - ) - self.doc_index.index(doc) - ids.append(doc.id) # TODO return index of self.docs ? - - return ids - - def similarity_search_with_score( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query and score for each. - """ - query_embedding = self.embedding.embed_query(query) - query_embedding = [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.] - print(f"query_embedding = {query_embedding}") - query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k) - - result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] - return result - - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query. - """ - results = self.similarity_search_with_score(query, k) - return list(map(itemgetter(0), results)) - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - raise NotImplementedError - - def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: - """Return docs most similar to embedding vector. 
- - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query vector. - """ - - query_doc = self.doc_cls(embedding=embedding) - docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents - - result = [Document(page_content=doc.text) for doc in docs] - return result - - def max_marginal_relevance_search( - self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - - Returns: - List of Documents selected by maximal marginal relevance. - """ - query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) - - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k) - - embeddings = [emb for emb in docs.emb] - - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) - results = [Document(page_content=self.doc_index[idx].text) for idx in mmr_selected] - return results - diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory.py index a079b10da7887..7a5139d898401 100644 --- a/langchain/vectorstores/in_memory.py +++ b/langchain/vectorstores/in_memory.py @@ -1,71 +1,58 @@ """Wrapper around in-memory DocArray store.""" from __future__ import annotations -from operator import itemgetter -from typing import List, Optional, Any, Tuple, Iterable, Type +from typing import List, Optional, Any, Type + +from docarray.typing import NdArray from langchain.embeddings.base import Embeddings from langchain.schema import Document -from langchain.vectorstores import 
VectorStore from langchain.vectorstores.base import VST from langchain.vectorstores.utils import maximal_marginal_relevance - -from docarray import BaseDoc -from docarray.typing import NdArray +from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex -class InMemory(VectorStore): +class InMemory(VecStoreFromDocIndex): """Wrapper around in-memory storage. - To use it, you should have the ``docarray`` package with version >=0.30.0 installed. + To use it, you should have the ``docarray`` package with version >=0.31.0 installed. """ def __init__( self, texts: List[str], embedding: Embeddings, - metadatas: Optional[List[dict]] + metadatas: Optional[List[dict]] = None, + metric: str = 'cosine_sim', ) -> None: - """Initialize in-memory store.""" - try: - import docarray - da_version = docarray.__version__.split('.') - if int(da_version[0]) == 0 and int(da_version[1]) <= 21: - raise ValueError( - f'To use the InMemory VectorStore the docarray version >=0.30.0 is expected, ' - f'received: {docarray.__version__}.' - f'To upgrade, please run: `pip install -U docarray`.' - ) - else: - from docarray import DocList - - except ImportError: - raise ImportError( - "Could not import docarray python package. " - "Please install it with `pip install -U docarray`." - ) - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - self.embedding = embedding - self.doc_cls = self._get_doc_cls() - self.docs = DocList[self.doc_cls]( - [ - self.doc_cls( - text=t, - embedding=e, - metadata=m, - ) for t, m, e in zip(texts, metadatas, self.embedding.embed_documents(texts)) - ] - ) + """Initialize in-memory store. + + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + metric (str): metric for exact nearest-neighbor search. + Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'. 
+ Defaults to 'cosine_sim'. + + """ + _check_docarray_import() + from docarray.index import InMemoryDocIndex + + doc_cls = self._get_doc_cls(metric) + doc_index = InMemoryDocIndex[doc_cls]() + super().__init__(doc_index, texts, embedding, metadatas) @staticmethod - def _get_doc_cls(): + def _get_doc_cls(sim_metric: str): + from docarray import BaseDoc + from pydantic import Field + class DocArrayDoc(BaseDoc): text: Optional[str] - embedding: Optional[NdArray] + embedding: Optional[NdArray] = Field(space=sim_metric) metadata: Optional[dict] - # DocArrayDoc.update_forward_refs() return DocArrayDoc @classmethod @@ -74,110 +61,112 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, + metric: str = 'cosine_sim', **kwargs: Any ) -> InMemory: return cls( texts=texts, embedding=embedding, - metadatas=metadatas + metadatas=metadatas, + metric=metric, ) - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - - Returns: - List of ids from adding the texts into the vectorstore. - """ - if metadatas is None: - metadatas = [{} for _ in range(len(list(texts)))] - - ids = [] - embeddings = self.embedding.embed_documents(texts) - for t, m, e in zip(texts, metadatas, embeddings): - doc = self.doc_cls( - text=t, - embedding=e, - metadata=m - ) - self.docs.append(doc) - ids.append(doc.id) # TODO return index of self.docs ? - - return ids - - def similarity_search_with_score( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. 
- - Returns: - List of Documents most similar to the query and score for each. - """ - from docarray.utils.find import find # TODO move import - - query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') - - result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] - return result - - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query. - """ - results = self.similarity_search_with_score(query, k) - return list(map(itemgetter(0), results)) - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - raise NotImplementedError - - def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: - """Return docs most similar to embedding vector. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query vector. - """ - from docarray.utils.find import find - - query_doc = self.doc_cls(embedding=embedding) - result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents - - result = [Document(page_content=doc.text) for doc in result_docs] - return result + # + # def add_texts( + # self, + # texts: Iterable[str], + # metadatas: Optional[List[dict]] = None, + # **kwargs: Any + # ) -> List[str]: + # """Run more texts through the embeddings and add to the vectorstore. 
+ # + # Args: + # texts: Iterable of strings to add to the vectorstore. + # metadatas: Optional list of metadatas associated with the texts. + # + # Returns: + # List of ids from adding the texts into the vectorstore. + # """ + # if metadatas is None: + # metadatas = [{} for _ in range(len(list(texts)))] + # + # ids = [] + # embeddings = self.embedding.embed_documents(texts) + # for t, m, e in zip(texts, metadatas, embeddings): + # doc = self.doc_cls( + # text=t, + # embedding=e, + # metadata=m + # ) + # self.docs.append(doc) + # ids.append(doc.id) # TODO return index of self.docs ? + # + # return ids + # + # def similarity_search_with_score( + # self, query: str, k: int = 4, **kwargs: Any + # ) -> List[Tuple[Document, float]]: + # """Return docs most similar to query. + # + # Args: + # query: Text to look up documents similar to. + # k: Number of Documents to return. Defaults to 4. + # + # Returns: + # List of Documents most similar to the query and score for each. + # """ + # from docarray.utils.find import find # TODO move import + # + # query_embedding = self.embedding.embed_query(query) + # query_doc = self.doc_cls(embedding=query_embedding) + # docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') + # + # result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] + # return result + # + # def similarity_search( + # self, query: str, k: int = 4, **kwargs: Any + # ) -> List[Document]: + # """Return docs most similar to query. + # + # Args: + # query: Text to look up documents similar to. + # k: Number of Documents to return. Defaults to 4. + # + # Returns: + # List of Documents most similar to the query. 
+ # """ + # results = self.similarity_search_with_score(query, k) + # return list(map(itemgetter(0), results)) + # + # def _similarity_search_with_relevance_scores( + # self, + # query: str, + # k: int = 4, + # **kwargs: Any, + # ) -> List[Tuple[Document, float]]: + # """Return docs and relevance scores, normalized on a scale from 0 to 1. + # + # 0 is dissimilar, 1 is most similar. + # """ + # raise NotImplementedError + # + # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: + # """Return docs most similar to embedding vector. + # + # Args: + # embedding: Embedding to look up documents similar to. + # k: Number of Documents to return. Defaults to 4. + # + # Returns: + # List of Documents most similar to the query vector. + # """ + # from docarray.utils.find import find + # + # query_doc = self.doc_cls(embedding=embedding) + # result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents + # + # result = [Document(page_content=doc.text) for doc in result_docs] + # return result def max_marginal_relevance_search( self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/vector_store_from_doc_index.py new file mode 100644 index 0000000000000..a72c883b2e201 --- /dev/null +++ b/langchain/vectorstores/vector_store_from_doc_index.py @@ -0,0 +1,186 @@ +from typing import TYPE_CHECKING, TypeVar, List, Optional, Type, Iterable, Any, Tuple + +from docarray import DocList, BaseDoc +from operator import itemgetter + +from langchain.embeddings.base import Embeddings +from langchain.schema import Document +from langchain.vectorstores import VectorStore + +from docarray.index.abstract import BaseDocIndex + + +T_Doc = TypeVar('T_Doc', bound=BaseDocIndex) + + +def _check_docarray_import(): + try: + import docarray + da_version = docarray.__version__.split('.') + if int(da_version[0]) == 0 and 
int(da_version[1]) <= 21: + raise ValueError( + f'To use the HnswLib VectorStore the docarray version >=0.31.0 is expected, ' + f'received: {docarray.__version__}.' + f'To upgrade, please run: `pip install -U docarray`.' + ) + except ImportError: + raise ImportError( + "Could not import docarray python package. " + "Please install it with `pip install -U docarray`." + ) + + +class VecStoreFromDocIndex(VectorStore): + doc_index: BaseDocIndex = None + doc_cls: Type[BaseDoc] = None + embedding: Embeddings = None + + def __init__( + self, + doc_index: T_Doc, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]], + ): + self.doc_index = doc_index + self.doc_cls = doc_index._schema + self.embedding = embedding + + embeddings = self.embedding.embed_documents(texts) + if metadatas is None: + metadatas = [{} for _ in range(len(texts))] + + docs = DocList[self.doc_cls]( + [ + self.doc_cls( + text=t, + embedding=e, + metadata=m, + ) for t, m, e in zip(texts, metadatas, embeddings) + ] + ) + if len(docs) > 0: + self.doc_index.index(docs) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if metadatas is None: + metadatas = [{} for _ in range(len(list(texts)))] + + ids = [] + embeddings = self.embedding.embed_documents(texts) + for t, m, e in zip(texts, metadatas, embeddings): + doc = self.doc_cls( + text=t, + embedding=e, + metadata=m + ) + self.doc_index.index([doc]) + ids.append(doc.id) # TODO return index of self.docs ? + + return ids + + def similarity_search_with_score( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. 
+ + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query and score for each. + """ + query_embedding = self.embedding.embed_query(query) + query_doc = self.doc_cls(embedding=query_embedding) + docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k) + + result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] + return result + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query. + """ + results = self.similarity_search_with_score(query, k) + return list(map(itemgetter(0), results)) + + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores, normalized on a scale from 0 to 1. + + 0 is dissimilar, 1 is most similar. + """ + raise NotImplementedError + + def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query vector. + """ + + query_doc = self.doc_cls(embedding=embedding) + docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents + + result = [Document(page_content=doc.text) for doc in docs] + return result + + def max_marginal_relevance_search( + self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. 
+ + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + + Returns: + List of Documents selected by maximal marginal relevance. + """ + query_embedding = self.embedding.embed_query(query) + query_doc = self.doc_cls(embedding=query_embedding) + + docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k) + + embeddings = [emb for emb in docs.emb] + + mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) + results = [Document(page_content=self.doc_index[idx].text) for idx in mmr_selected] + return results + diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index 7aa3481cf19e7..58919d37e7094 100644 --- a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from langchain.schema import Document @@ -5,7 +6,7 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_docarray_hnswlib_vec_store_init(tmp_path) -> None: +def test_hnswlib_vec_store_from_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] docsearch = HnswLib.from_texts( @@ -16,39 +17,71 @@ def test_docarray_hnswlib_vec_store_init(tmp_path) -> None: sim_metric='cosine', ) assert isinstance(docsearch, HnswLib) + assert docsearch.doc_index.num_docs() == 3 -@pytest.fixture -def docarray_vec_store(tmp_path): - texts = ["foo", "bar", "baz"] - docsearch = HnswLib.from_texts( - texts, - FakeEmbeddings(), +def test_hnswlib_vec_store_add_texts(tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + docsearch = HnswLib( 
work_dir=str(tmp_path), n_dim=10, + texts=[], + embedding=FakeEmbeddings(), + metadatas=[{}], + sim_metric='cosine', ) - return docsearch + assert isinstance(docsearch, HnswLib) + assert docsearch.doc_index.num_docs() == 0 + texts = ["foo", "bar", "baz"] + docsearch.add_texts(texts=texts) + assert docsearch.doc_index.num_docs() == 3 -def test_sim_search(docarray_vec_store) -> None: - """Test end to end construction and simple similarity search.""" - output = docarray_vec_store.similarity_search("foo", k=1) +@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +def test_sim_search(metric, tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) + output = hnswlib_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] -def test_sim_search_with_score(docarray_vec_store) -> None: - """Test end to end construction and similarity search with score.""" - - output = docarray_vec_store.similarity_search_with_score("foo", k=1) - assert output == [(Document(page_content="foo"), 1.0)] - - -def test_sim_search_by_vector(docarray_vec_store): +@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +def test_sim_search_by_vector(metric, tmp_path): """Test end to end construction and similarity search by vector.""" + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) embedding = [1.0] * 10 - output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + output = hnswlib_vec_store.similarity_search_by_vector(embedding, k=1) assert output == [Document(page_content="bar")] +@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +def test_sim_search_with_score(metric, tmp_path) -> None: + """Test end to end construction and similarity search with score.""" + texts = 
["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) + output = hnswlib_vec_store.similarity_search_with_score("foo", k=1) + assert len(output) == 1 + + out_doc, out_score = output[0] + assert out_doc == Document(page_content="foo") + assert np.isclose(out_score, 0.0, atol=1.e-6) + diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory.py index 79458727310a8..62834336c7c24 100644 --- a/tests/integration_tests/vectorstores/test_in_memory.py +++ b/tests/integration_tests/vectorstores/test_in_memory.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from langchain.schema import Document @@ -5,7 +6,7 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_docarray_vec_store_init() -> None: +def test_in_memory_vec_store_from_texts() -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] docsearch = InMemory.from_texts( @@ -13,36 +14,68 @@ def test_docarray_vec_store_init() -> None: FakeEmbeddings(), ) assert isinstance(docsearch, InMemory) + assert docsearch.doc_index.num_docs() == 3 -@pytest.fixture -def docarray_vec_store(): - texts = ["foo", "bar", "baz"] - docsearch = InMemory.from_texts( - texts, - FakeEmbeddings(), +def test_in_memory_vec_store_add_texts(tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + docsearch = InMemory( + texts=[], + embedding=FakeEmbeddings(), ) - return docsearch + assert isinstance(docsearch, InMemory) + assert docsearch.doc_index.num_docs() == 0 + + texts = ["foo", "bar", "baz"] + docsearch.add_texts(texts=texts) + assert docsearch.doc_index.num_docs() == 3 -def test_sim_search(docarray_vec_store) -> None: +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_sim_search(metric) -> None: """Test end to end construction and 
simple similarity search.""" + texts = ["foo", "bar", "baz"] + in_memory_vec_store = InMemory.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metric=metric, + ) - output = docarray_vec_store.similarity_search("foo", k=1) + output = in_memory_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] -def test_sim_search_with_score(docarray_vec_store) -> None: +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_sim_search_with_score(metric) -> None: """Test end to end construction and similarity search with score.""" + texts = ["foo", "bar", "baz"] + in_memory_vec_store = InMemory.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metric=metric, + ) + + output = in_memory_vec_store.similarity_search_with_score("foo", k=1) - output = docarray_vec_store.similarity_search_with_score("foo", k=1) - assert output == [(Document(page_content="foo"), 1.0)] + out_doc, out_score = output[0] + assert out_doc == Document(page_content="foo") + expected_score = 0.0 if 'dist' in metric else 1.0 + assert np.isclose(out_score, expected_score, atol=1.e-6) -def test_sim_search_by_vector(docarray_vec_store): + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_sim_search_by_vector(metric): """Test end to end construction and similarity search by vector.""" + texts = ["foo", "bar", "baz"] + in_memory_vec_store = InMemory.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metric=metric, + ) + embedding = [1.0] * 10 - output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + output = in_memory_vec_store.similarity_search_by_vector(embedding, k=1) assert output == [Document(page_content="bar")] From de262f9ae52affcccc3653128dee3d7cded177dc Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 15:17:26 +0200 Subject: [PATCH 3/6] fix: clean up and add dependencies Signed-off-by: anna-charlotte --- 
langchain/vectorstores/__init__.py | 4 + langchain/vectorstores/hnsw_lib.py | 74 ++++--- langchain/vectorstores/in_memory.py | 195 +++--------------- .../vector_store_from_doc_index.py | 113 +++++----- poetry.lock | 23 ++- pyproject.toml | 9 +- .../vectorstores/test_hnsw_lib.py | 27 ++- .../vectorstores/test_in_memory.py | 20 +- 8 files changed, 186 insertions(+), 279 deletions(-) diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 30d1ca7ecdc3e..5360f4b8f25c2 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -7,6 +7,8 @@ from langchain.vectorstores.deeplake import DeepLake from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS +from langchain.vectorstores.hnsw_lib import HnswLib +from langchain.vectorstores.in_memory import InMemory from langchain.vectorstores.milvus import Milvus from langchain.vectorstores.myscale import MyScale, MyScaleSettings from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch @@ -34,4 +36,6 @@ "MyScaleSettings", "SupabaseVectorStore", "AnalyticDB", + "HnswLib", + "InMemory", ] diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 51c85423ac2b7..ddc3ec7b6102c 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -1,40 +1,38 @@ -"""Wrapper around in-memory DocArray store.""" +"""Wrapper around HnswLib store.""" from __future__ import annotations -from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING -from docarray.typing import NdArray +from typing import List, Optional, Type from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VST -from langchain.vectorstores.vector_store_from_doc_index import VecStoreFromDocIndex, _check_docarray_import +from langchain.vectorstores.vector_store_from_doc_index import ( + 
VecStoreFromDocIndex, + _check_docarray_import, +) class HnswLib(VecStoreFromDocIndex): """Wrapper around HnswLib storage. - To use it, you should have the ``docarray`` package with version >=0.31.0 installed. + To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. + You can install it with `pip install "langchain[hnswlib]"`. """ + def __init__( self, - texts: List[str], embedding: Embeddings, work_dir: str, n_dim: int, - metadatas: Optional[List[dict]], - dist_metric: str = 'cosine', - **kwargs, + dist_metric: str = "cosine", ) -> None: """Initialize HnswLib store. Args: - texts (List[str]): Text data. embedding (Embeddings): Embedding function. - metadatas (Optional[List[dict]]): Metadata for each text if it exists. - Defaults to None. work_dir (str): path to the location where all the data will be stored. n_dim (int): dimension of an embedding. - dist_metric (str): Distance metric for HnswLib can be one of: 'cosine', - 'ip', and 'l2'. Defaults to 'cosine'. + dist_metric (str): Distance metric for HnswLib can be one of: "cosine", + "ip", and "l2". Defaults to "cosine". """ _check_docarray_import() from docarray.index import HnswDocumentIndex @@ -43,25 +41,13 @@ def __init__( import google.protobuf except ImportError: raise ImportError( - "Could not import protobuf python package. " - "Please install it with `pip install -U protobuf`." + "Could not import all required packages. " + "Please install it with `pip install \"langchain[hnswlib]\"`." 
) - doc_cls = self._get_doc_cls(n_dim, dist_metric) + doc_cls = self._get_doc_cls({"dim": n_dim, "space": dist_metric}) doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) - super().__init__(doc_index, texts, embedding, metadatas) - - @staticmethod - def _get_doc_cls(n_dim: int, sim_metric: str): - from docarray import BaseDoc - from pydantic import Field - - class DocArrayDoc(BaseDoc): - text: Optional[str] - embedding: Optional[NdArray] = Field(dim=n_dim, space=sim_metric) - metadata: Optional[dict] - - return DocArrayDoc + super().__init__(doc_index, embedding) @classmethod def from_texts( @@ -71,21 +57,33 @@ def from_texts( metadatas: Optional[List[dict]] = None, work_dir: str = None, n_dim: int = None, - dist_metric: str = 'cosine', - **kwargs: Any + dist_metric: str = "cosine", ) -> HnswLib: + """Create an HnswLib store and insert data. + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + work_dir (str): path to the location where all the data will be stored. + n_dim (int): dimension of an embedding. + dist_metric (str): Distance metric for HnswLib can be one of: "cosine", + "ip", and "l2". Defaults to "cosine". 
+ + Returns: + HnswLib Vector Store + """ if work_dir is None: - raise ValueError('`work_dir` parameter hs not been set.') + raise ValueError("`work_dir` parameter hs not been set.") if n_dim is None: - raise ValueError('`n_dim` parameter has not been set.') + raise ValueError("`n_dim` parameter has not been set.") - return cls( + store = cls( work_dir=work_dir, n_dim=n_dim, - texts=texts, embedding=embedding, - metadatas=metadatas, dist_metric=dist_metric, - kwargs=kwargs, ) + store.add_texts(texts=texts, metadatas=metadatas) + return store diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory.py index 7a5139d898401..07e1f49d82c17 100644 --- a/langchain/vectorstores/in_memory.py +++ b/langchain/vectorstores/in_memory.py @@ -1,59 +1,42 @@ -"""Wrapper around in-memory DocArray store.""" +"""Wrapper around in-memory storage.""" from __future__ import annotations -from typing import List, Optional, Any, Type - -from docarray.typing import NdArray +from typing import List, Optional, Type from langchain.embeddings.base import Embeddings -from langchain.schema import Document from langchain.vectorstores.base import VST -from langchain.vectorstores.utils import maximal_marginal_relevance -from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex +from langchain.vectorstores.vector_store_from_doc_index import ( + VecStoreFromDocIndex, + _check_docarray_import, +) class InMemory(VecStoreFromDocIndex): """Wrapper around in-memory storage. To use it, you should have the ``docarray`` package with version >=0.31.0 installed. + You can install it with `pip install "langchain[in_memory_store]"`. """ + def __init__( self, - texts: List[str], embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - metric: str = 'cosine_sim', + metric: str = "cosine_sim", ) -> None: """Initialize in-memory store. Args: - texts (List[str]): Text data. embedding (Embeddings): Embedding function. 
- metadatas (Optional[List[dict]]): Metadata for each text if it exists. - Defaults to None. metric (str): metric for exact nearest-neighbor search. - Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'. - Defaults to 'cosine_sim'. - + Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". + Defaults to "cosine_sim". """ _check_docarray_import() - from docarray.index import InMemoryDocIndex - - doc_cls = self._get_doc_cls(metric) - doc_index = InMemoryDocIndex[doc_cls]() - super().__init__(doc_index, texts, embedding, metadatas) - - @staticmethod - def _get_doc_cls(sim_metric: str): - from docarray import BaseDoc - from pydantic import Field - - class DocArrayDoc(BaseDoc): - text: Optional[str] - embedding: Optional[NdArray] = Field(space=sim_metric) - metadata: Optional[dict] + from docarray.index import InMemoryExactNNIndex - return DocArrayDoc + doc_cls = self._get_doc_cls({"space": metric}) + doc_index = InMemoryExactNNIndex[doc_cls]() + super().__init__(doc_index, embedding) @classmethod def from_texts( @@ -61,139 +44,25 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, - metric: str = 'cosine_sim', - **kwargs: Any + metric: str = "cosine_sim", ) -> InMemory: - return cls( - texts=texts, - embedding=embedding, - metadatas=metadatas, - metric=metric, - ) - # - # def add_texts( - # self, - # texts: Iterable[str], - # metadatas: Optional[List[dict]] = None, - # **kwargs: Any - # ) -> List[str]: - # """Run more texts through the embeddings and add to the vectorstore. - # - # Args: - # texts: Iterable of strings to add to the vectorstore. - # metadatas: Optional list of metadatas associated with the texts. - # - # Returns: - # List of ids from adding the texts into the vectorstore. 
- # """ - # if metadatas is None: - # metadatas = [{} for _ in range(len(list(texts)))] - # - # ids = [] - # embeddings = self.embedding.embed_documents(texts) - # for t, m, e in zip(texts, metadatas, embeddings): - # doc = self.doc_cls( - # text=t, - # embedding=e, - # metadata=m - # ) - # self.docs.append(doc) - # ids.append(doc.id) # TODO return index of self.docs ? - # - # return ids - # - # def similarity_search_with_score( - # self, query: str, k: int = 4, **kwargs: Any - # ) -> List[Tuple[Document, float]]: - # """Return docs most similar to query. - # - # Args: - # query: Text to look up documents similar to. - # k: Number of Documents to return. Defaults to 4. - # - # Returns: - # List of Documents most similar to the query and score for each. - # """ - # from docarray.utils.find import find # TODO move import - # - # query_embedding = self.embedding.embed_query(query) - # query_doc = self.doc_cls(embedding=query_embedding) - # docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') - # - # result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] - # return result - # - # def similarity_search( - # self, query: str, k: int = 4, **kwargs: Any - # ) -> List[Document]: - # """Return docs most similar to query. - # - # Args: - # query: Text to look up documents similar to. - # k: Number of Documents to return. Defaults to 4. - # - # Returns: - # List of Documents most similar to the query. - # """ - # results = self.similarity_search_with_score(query, k) - # return list(map(itemgetter(0), results)) - # - # def _similarity_search_with_relevance_scores( - # self, - # query: str, - # k: int = 4, - # **kwargs: Any, - # ) -> List[Tuple[Document, float]]: - # """Return docs and relevance scores, normalized on a scale from 0 to 1. - # - # 0 is dissimilar, 1 is most similar. 
- # """ - # raise NotImplementedError - # - # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: - # """Return docs most similar to embedding vector. - # - # Args: - # embedding: Embedding to look up documents similar to. - # k: Number of Documents to return. Defaults to 4. - # - # Returns: - # List of Documents most similar to the query vector. - # """ - # from docarray.utils.find import find - # - # query_doc = self.doc_cls(embedding=embedding) - # result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents - # - # result = [Document(page_content=doc.text) for doc in result_docs] - # return result - - def max_marginal_relevance_search( - self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. + """Create an in-memory store and insert data. Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + metric (str): metric for exact nearest-neighbor search. + Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". + Defaults to "cosine_sim". Returns: - List of Documents selected by maximal marginal relevance. 
- """ - from docarray.utils.find import find - - query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) - find_res = find(self.docs, query_doc, limit=k) - - embeddings = [emb for emb in find_res.documents.emb] - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) - results = [] - for idx in mmr_selected: - results.append(Document(page_content=self.docs[idx].text)) - return results - + InMemory Vector Store + """ + store = cls( + embedding=embedding, + metric=metric, + ) + store.add_texts(texts=texts, metadatas=metadatas) + return store diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/vector_store_from_doc_index.py index a72c883b2e201..a471bfe1cd703 100644 --- a/langchain/vectorstores/vector_store_from_doc_index.py +++ b/langchain/vectorstores/vector_store_from_doc_index.py @@ -1,72 +1,72 @@ -from typing import TYPE_CHECKING, TypeVar, List, Optional, Type, Iterable, Any, Tuple - -from docarray import DocList, BaseDoc from operator import itemgetter +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type + +try: + from docarray import BaseDoc + from docarray.index.abstract import BaseDocIndex + from docarray.typing import NdArray +except ImportError: + BaseDoc = None + BaseDocIndex = None + NdArray = None from langchain.embeddings.base import Embeddings from langchain.schema import Document from langchain.vectorstores import VectorStore - -from docarray.index.abstract import BaseDocIndex - - -T_Doc = TypeVar('T_Doc', bound=BaseDocIndex) +from langchain.vectorstores.utils import maximal_marginal_relevance -def _check_docarray_import(): +def _check_docarray_import() -> None: try: import docarray - da_version = docarray.__version__.split('.') - if int(da_version[0]) == 0 and int(da_version[1]) <= 21: + + da_version = docarray.__version__.split(".") + if int(da_version[0]) == 0 and int(da_version[1]) <= 30: raise ValueError( - f'To use the 
HnswLib VectorStore the docarray version >=0.31.0 is expected, ' - f'received: {docarray.__version__}.' - f'To upgrade, please run: `pip install -U docarray`.' + f"To use the HnswLib VectorStore the docarray version >=0.31.0 is expected, " + f"received: {docarray.__version__}." + f"To upgrade, please run: `pip install -U docarray`." ) except ImportError: raise ImportError( "Could not import docarray python package. " - "Please install it with `pip install -U docarray`." + "Please install it with `pip install \"langchain[docarray]\"`." ) class VecStoreFromDocIndex(VectorStore): - doc_index: BaseDocIndex = None - doc_cls: Type[BaseDoc] = None - embedding: Embeddings = None + doc_index: BaseDocIndex + doc_cls: Type[BaseDoc] + embedding: Embeddings def __init__( self, - doc_index: T_Doc, - texts: List[str], + doc_index: BaseDocIndex, embedding: Embeddings, - metadatas: Optional[List[dict]], ): + """Initialize a vector store from DocArray's DocIndex.""" self.doc_index = doc_index self.doc_cls = doc_index._schema self.embedding = embedding - embeddings = self.embedding.embed_documents(texts) - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - docs = DocList[self.doc_cls]( - [ - self.doc_cls( - text=t, - embedding=e, - metadata=m, - ) for t, m, e in zip(texts, metadatas, embeddings) - ] - ) - if len(docs) > 0: - self.doc_index.index(docs) + @staticmethod + def _get_doc_cls(embeddings_params: Dict[str, Any]) -> Type[BaseDoc]: + """Get docarray Document class describing the schema of DocIndex.""" + from docarray import BaseDoc + from pydantic import Field + + class DocArrayDoc(BaseDoc): + text: Optional[str] + embedding: Optional[NdArray] = Field(**embeddings_params) + metadata: Optional[dict] + + return DocArrayDoc def add_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, - **kwargs: Any + **kwargs: Any, ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. 
@@ -80,16 +80,12 @@ def add_texts( if metadatas is None: metadatas = [{} for _ in range(len(list(texts)))] - ids = [] + ids: List[str] = [] embeddings = self.embedding.embed_documents(texts) for t, m, e in zip(texts, metadatas, embeddings): - doc = self.doc_cls( - text=t, - embedding=e, - metadata=m - ) + doc = self.doc_cls(text=t, embedding=e, metadata=m) self.doc_index.index([doc]) - ids.append(doc.id) # TODO return index of self.docs ? + ids.append(str(doc.id)) return ids @@ -107,9 +103,11 @@ def similarity_search_with_score( """ query_embedding = self.embedding.embed_query(query) query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k) + docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k) - result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] + result = [ + (Document(page_content=doc.text), score) for doc, score in zip(docs, scores) + ] return result def similarity_search( @@ -127,7 +125,6 @@ def similarity_search( results = self.similarity_search_with_score(query, k) return list(map(itemgetter(0), results)) - def _similarity_search_with_relevance_scores( self, query: str, @@ -140,7 +137,9 @@ def _similarity_search_with_relevance_scores( """ raise NotImplementedError - def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: """Return docs most similar to embedding vector. 
Args: @@ -152,7 +151,9 @@ def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwar """ query_doc = self.doc_cls(embedding=embedding) - docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents + docs = self.doc_index.find( + query_doc, search_field="embedding", limit=k + ).documents result = [Document(page_content=doc.text) for doc in docs] return result @@ -176,11 +177,13 @@ def max_marginal_relevance_search( query_embedding = self.embedding.embed_query(query) query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k) + docs = self.doc_index.find( + query_doc, search_field="embedding", limit=fetch_k + ).documents - embeddings = [emb for emb in docs.emb] - - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) - results = [Document(page_content=self.doc_index[idx].text) for idx in mmr_selected] + mmr_selected = maximal_marginal_relevance(query_embedding, docs.embedding, k=k) + results = [ + Document(page_content=docs[idx].text, metadata=docs[idx].metadata) + for idx in mmr_selected + ] return results - diff --git a/poetry.lock b/poetry.lock index fc785b03aebfb..4109d28490ed9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1515,14 +1515,14 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] [[package]] name = "docarray" -version = "0.30.0" +version = "0.31.0.dev35" description = "The data structure for multimodal data" category = "main" optional = true python-versions = ">=3.7,<4.0" files = [ - {file = "docarray-0.30.0-py3-none-any.whl", hash = "sha256:739dbe06bfee6f1cbc030156036764ca1c75832dcc01a07c724640c6d464651b"}, - {file = "docarray-0.30.0.tar.gz", hash = "sha256:dd73e9ff20485a1d819ac906a59ee0cbc4382e78a5061286e77eb7d7f8b28a8e"}, + {file = "docarray-0.31.0.dev35-py3-none-any.whl", hash = "sha256:a5c578cbf69853dddd17e845cc3fb2250cb1a0800ef48082d2a40a38bc9a7165"}, + {file = "docarray-0.31.0.dev35.tar.gz", hash = 
"sha256:f918cc5c35ed2df9b9ad7ef0abcc0bf5f3fe38a8f9e33526a33293d26a956f2e"}, ] [package.dependencies] @@ -1748,7 +1748,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2769,7 +2769,7 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -5445,7 +5445,7 @@ typing-extensions = {version = "*", markers = "python_version <= \"3.8\""} name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "main" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -6176,7 +6176,7 @@ Pillow = ">=8.0.0" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -8203,7 +8203,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -9338,10 +9338,11 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", 
"html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf", "hnswlib"] cohere = ["cohere"] -docarray = ["docarray", "protobuf"] embeddings = ["sentence-transformers"] +hnswlib = ["docarray", "protobuf", "hnswlib"] +in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] @@ -9349,4 +9350,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "81e7b09595d12739f056c5f5d34021ad7e3f855a8da711d3ccc23aab72cfbd83" +content-hash = "5223e3c6bdf37a28e1ee1cfb26e7f8d84fd6bc94893c96ecaca428fb9e8278eb" diff --git a/pyproject.toml b/pyproject.toml index 61406f1db2e0c..869d5f8d0d453 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,11 +69,9 @@ pytesseract = {version = "^0.3.10", optional=true} html2text = {version="^2020.1.16", optional=true} numexpr = "^2.8.4" duckduckgo-search = {version="^2.8.6", optional=true} -docarray = {version="^0.30.0", optional=true} +docarray = {version="^0.31.0.dev35", optional=true} protobuf = {version="3.19", optional=true} hnswlib = {version="^0.7.0", optional=true} -pytest = "^7.3.1" - 
[tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" @@ -149,9 +147,10 @@ llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifes qdrant = ["qdrant-client"] openai = ["openai"] cohere = ["cohere"] -docarray = ["docarray", "protobuf"] +in_memory_store = ["docarray"] +hnswlib = ["docarray", "protobuf", "hnswlib"] embeddings = ["sentence-transformers"] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf", "hnswlib"] [tool.ruff] select = [ diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index 58919d37e7094..fc86321c20dd5 100644 --- 
a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -14,7 +14,7 @@ def test_hnswlib_vec_store_from_texts(tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, - sim_metric='cosine', + dist_metric='cosine', ) assert isinstance(docsearch, HnswLib) assert docsearch.doc_index.num_docs() == 3 @@ -25,10 +25,8 @@ def test_hnswlib_vec_store_add_texts(tmp_path) -> None: docsearch = HnswLib( work_dir=str(tmp_path), n_dim=10, - texts=[], embedding=FakeEmbeddings(), - metadatas=[{}], - sim_metric='cosine', + dist_metric='cosine', ) assert isinstance(docsearch, HnswLib) assert docsearch.doc_index.num_docs() == 0 @@ -53,7 +51,7 @@ def test_sim_search(metric, tmp_path) -> None: @pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) -def test_sim_search_by_vector(metric, tmp_path): +def test_sim_search_by_vector(metric, tmp_path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] hnswlib_vec_store = HnswLib.from_texts( @@ -85,3 +83,22 @@ def test_sim_search_with_score(metric, tmp_path) -> None: assert out_doc == Document(page_content="foo") assert np.isclose(out_score, 0.0, atol=1.e-6) + +@pytest.mark.parametrize('metric', ['cosine', 'l2']) +def test_max_marginal_relevance_search(metric, tmp_path) -> None: + """Test MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = HnswLib.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + dist_metric=metric, + work_dir=str(tmp_path), + n_dim=10, + ) + output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + ] diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory.py index 62834336c7c24..e90c4ed312d21 100644 --- 
a/tests/integration_tests/vectorstores/test_in_memory.py +++ b/tests/integration_tests/vectorstores/test_in_memory.py @@ -20,7 +20,6 @@ def test_in_memory_vec_store_from_texts() -> None: def test_in_memory_vec_store_add_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" docsearch = InMemory( - texts=[], embedding=FakeEmbeddings(), ) assert isinstance(docsearch, InMemory) @@ -65,7 +64,7 @@ def test_sim_search_with_score(metric) -> None: @pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) -def test_sim_search_by_vector(metric): +def test_sim_search_by_vector(metric) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] in_memory_vec_store = InMemory.from_texts( @@ -79,3 +78,20 @@ def test_sim_search_by_vector(metric): assert output == [Document(page_content="bar")] + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_max_marginal_relevance_search(metric) -> None: + """Test MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = InMemory.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + metric=metric + ) + output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + ] From 30456bc3c30fca7a09cd115e07205a1db997159a Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 15:39:30 +0200 Subject: [PATCH 4/6] Add more configurations for hnswlib Signed-off-by: anna-charlotte --- langchain/vectorstores/hnsw_lib.py | 54 ++++++++++++++++++- .../vectorstores/test_hnsw_lib.py | 51 ++++++++++++++++-- 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index ddc3ec7b6102c..42f5c902cb5af 100644 --- 
a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -24,6 +24,13 @@ def __init__( work_dir: str, n_dim: int, dist_metric: str = "cosine", + max_elements: int = 1024, + index: bool = True, + ef_construction: int = 200, + ef: int = 10, + M: int = 16, + allow_replace_deleted: bool = True, + num_threads: int = 1, ) -> None: """Initialize HnswLib store. @@ -33,6 +40,19 @@ def __init__( n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for HnswLib can be one of: "cosine", "ip", and "l2". Defaults to "cosine". + max_elements (int): Maximum number of vectors that can be stored. + Defaults to 1024. + index (bool): Whether an index should be built for this field. + Defaults to True. + ef_construction (int): defines a construction time/accuracy trade-off. + Defaults to 200. + ef (int): parameter controlling query time/accuracy trade-off. + Defaults to 10. + M (int): parameter that defines the maximum number of outgoing + connections in the graph. Defaults to 16. + allow_replace_deleted (bool): Enables replacing of deleted elements + with new added ones. Defaults to True. + num_threads (int): Sets the number of cpu threads to use. Defaults to 1. """ _check_docarray_import() from docarray.index import HnswDocumentIndex @@ -45,7 +65,19 @@ def __init__( "Please install it with `pip install \"langchain[hnswlib]\"`." 
) - doc_cls = self._get_doc_cls({"dim": n_dim, "space": dist_metric}) + doc_cls = self._get_doc_cls( + { + "dim": n_dim, + "space": dist_metric, + "max_elements": max_elements, + "index": index, + "ef_construction": ef_construction, + "ef": ef, + "M": M, + "allow_replace_deleted": allow_replace_deleted, + "num_threads": num_threads, + } + ) doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) super().__init__(doc_index, embedding) @@ -58,6 +90,13 @@ def from_texts( work_dir: str = None, n_dim: int = None, dist_metric: str = "cosine", + max_elements: int = 1024, + index: bool = True, + ef_construction: int = 200, + ef: int = 10, + M: int = 16, + allow_replace_deleted: bool = True, + num_threads: int = 1, ) -> HnswLib: """Create an HnswLib store and insert data. @@ -70,6 +109,19 @@ def from_texts( n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for HnswLib can be one of: "cosine", "ip", and "l2". Defaults to "cosine". + max_elements (int): Maximum number of vectors that can be stored. + Defaults to 1024. + index (bool): Whether an index should be built for this field. + Defaults to True. + ef_construction (int): defines a construction time/accuracy trade-off. + Defaults to 200. + ef (int): parameter controlling query time/accuracy trade-off. + Defaults to 10. + M (int): parameter that defines the maximum number of outgoing + connections in the graph. Defaults to 16. + allow_replace_deleted (bool): Enables replacing of deleted elements + with new added ones. Defaults to True. + num_threads (int): Sets the number of cpu threads to use. Defaults to 1. 
Returns: HnswLib Vector Store diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index fc86321c20dd5..a4a6441eec779 100644 --- a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -36,7 +36,7 @@ def test_hnswlib_vec_store_add_texts(tmp_path) -> None: assert docsearch.doc_index.num_docs() == 3 -@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +@pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_sim_search(metric, tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] @@ -45,12 +45,35 @@ def test_sim_search(metric, tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, + dist_metric=metric, + ) + output = hnswlib_vec_store.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.parametrize('metric', ['cosine', 'l2']) +def test_sim_search_all_configurations(metric, tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + dist_metric=metric, + n_dim=10, + max_elements=8, + index=False, + ef_construction=300, + ef=20, + M=8, + allow_replace_deleted=False, + num_threads=2, ) output = hnswlib_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] -@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +@pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_sim_search_by_vector(metric, tmp_path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] @@ -59,6 +82,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, + dist_metric=metric, ) embedding = [1.0] * 10 output = 
hnswlib_vec_store.similarity_search_by_vector(embedding, k=1) @@ -66,7 +90,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: assert output == [Document(page_content="bar")] -@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +@pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_sim_search_with_score(metric, tmp_path) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] @@ -75,6 +99,7 @@ def test_sim_search_with_score(metric, tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, + dist_metric=metric, ) output = hnswlib_vec_store.similarity_search_with_score("foo", k=1) assert len(output) == 1 @@ -84,6 +109,26 @@ def test_sim_search_with_score(metric, tmp_path) -> None: assert np.isclose(out_score, 0.0, atol=1.e-6) +def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: + """ + Test end to end construction and similarity search with score for ip + (inner-product) metric. + """ + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + dist_metric='ip', + ) + output = hnswlib_vec_store.similarity_search_with_score("foo", k=3) + assert len(output) == 3 + + for result in output: + assert result[1] == -8.0 + + @pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_max_marginal_relevance_search(metric, tmp_path) -> None: """Test MRR search.""" From 5d2324a65fb4c8dc78dc7ac5c886d0325ebb3b57 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 16:08:05 +0200 Subject: [PATCH 5/6] refactor: rename InMemory to InMemoryExactSearch Signed-off-by: anna-charlotte --- langchain/vectorstores/__init__.py | 4 ++-- ...{in_memory.py => in_memory_exact_search.py} | 12 ++++++------ ...emory.py => test_in_memory_exact_search.py} | 18 +++++++++--------- 3 files changed, 17 insertions(+), 17 deletions(-) rename langchain/vectorstores/{in_memory.py => in_memory_exact_search.py} 
(86%) rename tests/integration_tests/vectorstores/{test_in_memory.py => test_in_memory_exact_search.py} (85%) diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 5360f4b8f25c2..ed3982ad7e1ab 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -8,7 +8,7 @@ from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS from langchain.vectorstores.hnsw_lib import HnswLib -from langchain.vectorstores.in_memory import InMemory +from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch from langchain.vectorstores.milvus import Milvus from langchain.vectorstores.myscale import MyScale, MyScaleSettings from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch @@ -37,5 +37,5 @@ "SupabaseVectorStore", "AnalyticDB", "HnswLib", - "InMemory", + "InMemoryExactSearch", ] diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory_exact_search.py similarity index 86% rename from langchain/vectorstores/in_memory.py rename to langchain/vectorstores/in_memory_exact_search.py index 07e1f49d82c17..bbaabe7e11c6b 100644 --- a/langchain/vectorstores/in_memory.py +++ b/langchain/vectorstores/in_memory_exact_search.py @@ -11,8 +11,8 @@ ) -class InMemory(VecStoreFromDocIndex): - """Wrapper around in-memory storage. +class InMemoryExactSearch(VecStoreFromDocIndex): + """Wrapper around in-memory storage for exact search. To use it, you should have the ``docarray`` package with version >=0.31.0 installed. You can install it with `pip install "langchain[in_memory_store]"`. @@ -23,7 +23,7 @@ def __init__( embedding: Embeddings, metric: str = "cosine_sim", ) -> None: - """Initialize in-memory store. + """Initialize InMemoryExactSearch store. Args: embedding (Embeddings): Embedding function. 
@@ -45,8 +45,8 @@ def from_texts( embedding: Embeddings, metadatas: Optional[List[dict]] = None, metric: str = "cosine_sim", - ) -> InMemory: - """Create an in-memory store and insert data. + ) -> InMemoryExactSearch: + """Create an InMemoryExactSearch store and insert data. Args: texts (List[str]): Text data. @@ -58,7 +58,7 @@ def from_texts( Defaults to "cosine_sim". Returns: - InMemory Vector Store + InMemoryExactSearch Vector Store """ store = cls( embedding=embedding, diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py similarity index 85% rename from tests/integration_tests/vectorstores/test_in_memory.py rename to tests/integration_tests/vectorstores/test_in_memory_exact_search.py index e90c4ed312d21..7e0142ec8212f 100644 --- a/tests/integration_tests/vectorstores/test_in_memory.py +++ b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py @@ -2,27 +2,27 @@ import pytest from langchain.schema import Document -from langchain.vectorstores.in_memory import InMemory +from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings def test_in_memory_vec_store_from_texts() -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - docsearch = InMemory.from_texts( + docsearch = InMemoryExactSearch.from_texts( texts, FakeEmbeddings(), ) - assert isinstance(docsearch, InMemory) + assert isinstance(docsearch, InMemoryExactSearch) assert docsearch.doc_index.num_docs() == 3 def test_in_memory_vec_store_add_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" - docsearch = InMemory( + docsearch = InMemoryExactSearch( embedding=FakeEmbeddings(), ) - assert isinstance(docsearch, InMemory) + assert isinstance(docsearch, InMemoryExactSearch) assert docsearch.doc_index.num_docs() == 0 texts = ["foo", 
"bar", "baz"] @@ -34,7 +34,7 @@ def test_in_memory_vec_store_add_texts(tmp_path) -> None: def test_sim_search(metric) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemory.from_texts( + in_memory_vec_store = InMemoryExactSearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -48,7 +48,7 @@ def test_sim_search(metric) -> None: def test_sim_search_with_score(metric) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemory.from_texts( + in_memory_vec_store = InMemoryExactSearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -67,7 +67,7 @@ def test_sim_search_with_score(metric) -> None: def test_sim_search_by_vector(metric) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemory.from_texts( + in_memory_vec_store = InMemoryExactSearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -84,7 +84,7 @@ def test_max_marginal_relevance_search(metric) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] - docsearch = InMemory.from_texts( + docsearch = InMemoryExactSearch.from_texts( texts, FakeEmbeddings(), metadatas=metadatas, From ecc73b4bb948a9852237957944c5653f94d2b08f Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 28 Apr 2023 10:38:25 +0200 Subject: [PATCH 6/6] fix: change space default for hnswlib to l2 --- langchain/vectorstores/hnsw_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 42f5c902cb5af..2857248f0f5aa 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -89,7 +89,7 @@ def from_texts( metadatas: Optional[List[dict]] = None, work_dir: 
str = None, n_dim: int = None, - dist_metric: str = "cosine", + dist_metric: str = "l2", max_elements: int = 1024, index: bool = True, ef_construction: int = 200, @@ -108,7 +108,7 @@ def from_texts( work_dir (str): path to the location where all the data will be stored. n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for HnswLib can be one of: "cosine", - "ip", and "l2". Defaults to "cosine". + "ip", and "l2". Defaults to "l2". max_elements (int): Maximum number of vectors that can be stored. Defaults to 1024. index (bool): Whether an index should be built for this field.