From b1066d616fd132b80a146c8cb56be36fa5825f51 Mon Sep 17 00:00:00 2001 From: rigazilla Date: Tue, 20 Feb 2024 11:06:30 +0100 Subject: [PATCH] Adding support for Infinispan as VectorStore --- .../integrations/providers/infinispanvs.mdx | 17 + .../vectorstores/infinispanvs.ipynb | 408 ++++++++++++++ .../vectorstores/__init__.py | 9 + .../vectorstores/infinispanvs.py | 506 ++++++++++++++++++ .../vectorstores/test_infinispanvs.py | 135 +++++ .../vectorstores/test_public_api.py | 1 + 6 files changed, 1076 insertions(+) create mode 100644 docs/docs/integrations/providers/infinispanvs.mdx create mode 100644 docs/docs/integrations/vectorstores/infinispanvs.ipynb create mode 100644 libs/community/langchain_community/vectorstores/infinispanvs.py create mode 100644 libs/community/tests/integration_tests/vectorstores/test_infinispanvs.py diff --git a/docs/docs/integrations/providers/infinispanvs.mdx b/docs/docs/integrations/providers/infinispanvs.mdx new file mode 100644 index 0000000000000..b42e7504231bf --- /dev/null +++ b/docs/docs/integrations/providers/infinispanvs.mdx @@ -0,0 +1,17 @@ +# Infinispan VS + +> [Infinispan](https://infinispan.org) Infinispan is an open-source in-memory data grid that provides +> a key/value data store able to hold all types of data, from Java objects to plain text. +> Since version 15 Infinispan supports vector search over caches. + +## Installation and Setup +See [Get Started](https://infinispan.org/get-started/) to run an Infinispan server, you may want to disable authentication +(not supported atm) + +## Vector Store + +See a [usage example](/docs/integrations/vectorstores/infinispanvs). 
+ +```python +from langchain_community.vectorstores import InfinispanVS +``` diff --git a/docs/docs/integrations/vectorstores/infinispanvs.ipynb b/docs/docs/integrations/vectorstores/infinispanvs.ipynb new file mode 100644 index 0000000000000..f0dff76c49dda --- /dev/null +++ b/docs/docs/integrations/vectorstores/infinispanvs.ipynb @@ -0,0 +1,408 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cffb482c-bbd8-4829-b185-0d930a5fe0bc", + "metadata": {}, + "source": [ + "# Infinispan\n", + "\n", + "Infinispan is an open-source key-value data grid, it can work as single node as well as distributed.\n", + "\n", + "Vector search is supported since release 15.x\n", + "For more: [Infinispan Home](https://infinispan.org)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03ec8f9a-7641-47ea-9fa0-f43ee9fc79a3", + "metadata": {}, + "outputs": [], + "source": [ + "# Ensure that all we need is installed\n", + "# You may want to skip this\n", + "%pip install sentence-transformers\n", + "%pip install langchain\n", + "%pip install langchain_core\n", + "%pip install langchain_community" + ] + }, + { + "cell_type": "markdown", + "id": "180d172e-cca1-481c-87d5-c4f14684604d", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "To run this demo we need a running Infinispan instance without authentication and a data file.\n", + "In the next three cells we're going to:\n", + "- create the configuration\n", + "- run Infinispan in docker\n", + "- download the data file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b251e66e-f056-4e81-a6b4-5f4d95b6537d", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "#create infinispan configuration file\n", + "echo 'infinispan:\n", + " cache-container: \n", + " name: default\n", + " transport: \n", + " cluster: cluster \n", + " stack: tcp \n", + " server:\n", + " interfaces:\n", + " interface:\n", + " name: public\n", + " inet-address:\n", + " value: 0.0.0.0 \n", + " 
socket-bindings:\n", + " default-interface: public\n", + " port-offset: 0 \n", + " socket-binding:\n", + " name: default\n", + " port: 11222\n", + " endpoints:\n", + " endpoint:\n", + " socket-binding: default\n", + " rest-connector:\n", + "' > infinispan-noauth.yaml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9678d5ce-894c-4e28-bf68-20d45507122f", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "#get an archive of news\n", + "wget https://raw.githubusercontent.com/rigazilla/infinispan-vector/main/bbc_news.csv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "009da6d1-9d1a-4392-90f1-5c654dd12654", + "metadata": {}, + "outputs": [], + "source": [ + "!docker run -d --name infinispanvs-demo -v $(pwd):/user-config -p 11222:11222 infinispan/server:15.0.0.Dev09 -c /user-config/infinispan-noauth.yaml " + ] + }, + { + "cell_type": "markdown", + "id": "b575cde9-4c62-47b3-af89-109ed39f56b6", + "metadata": {}, + "source": [ + "# The Code\n", + "\n", + "## Pick up an embedding model\n", + "\n", + "In this demo we're using\n", + "a HuggingFace embedding model."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2c9f46f-3c78-4865-810b-52408dff5fb7", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from langchain_core.embeddings import Embeddings\n", + "\n", + "model_name = \"sentence-transformers/all-MiniLM-L12-v2\"\n", + "hf = HuggingFaceEmbeddings(model_name=model_name)" + ] + }, + { + "cell_type": "markdown", + "id": "61ce7e1f-51ee-4d3d-ad3c-97088b1120f6", + "metadata": {}, + "source": [ + "## Setup Infinispan cache\n", + "\n", + "Infinispan is a very flexible key-value store, it can store raw bits as well as complex data types.\n", + "We need to configure it to store data containing embedded vectors.\n", + "\n", + "In the next cells we're going to:\n", + "- create an empty Infinispan VectorStore\n", + "- deploy a protobuf definition of our data\n", + "- create a cache" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49668bf1-778b-466d-86fb-41747ed52b74", + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a langchain_core.VectorStore\n", + "from langchain_community.vectorstores import InfinispanVS\n", + "\n", + "ispnvs = InfinispanVS.from_texts(\n", + " texts={}, embedding=hf, cache_name=\"demo_cache\", entity_name=\"demo_entity\"\n", + ")\n", + "ispn = ispnvs.ispn" + ] + }, + { + "cell_type": "markdown", + "id": "0cedf066-aaab-4185-b049-93eea9b48329", + "metadata": {}, + "source": [ + "### Protobuf definition\n", + "\n", + "Below is the protobuf definition of our data type that contains:\n", + "- embedded vector (field 1)\n", + "- text of the news (2)\n", + "- title of the news (3)\n", + "\n", + "As you can see, there are additional annotations in the comments that tell Infinispan that:\n", + "- data type must be indexed (`@Indexed`)\n", + "- field 1 is an embedded vector (`@Vector`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id":
"1fa0add0-8317-4667-9b8c-5d91c47f752a", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Infinispan supports protobuf schemas\n", + "schema_vector = \"\"\"\n", + "/**\n", + " * @Indexed\n", + " */\n", + "message demo_entity {\n", + "/**\n", + " * @Vector(dimension=384)\n", + " */\n", + "repeated float vector = 1;\n", + "optional string text = 2;\n", + "optional string title = 3;\n", + "}\n", + "\"\"\"\n", + "# Cleanup before deploy a new schema\n", + "ispnvs.schema_delete()\n", + "output = ispnvs.schema_create(schema_vector)\n", + "assert output.status_code == 200\n", + "assert json.loads(output.text)[\"error\"] is None\n", + "# Create the cache\n", + "ispnvs.cache_create()\n", + "# Cleanup old data and index\n", + "ispnvs.cache_clear()\n", + "ispnvs.cache_index_reindex()" + ] + }, + { + "cell_type": "markdown", + "id": "456da9e7-baf4-472a-a9ee-8473aed8cabd", + "metadata": {}, + "source": [ + "## Prepare the data\n", + "\n", + "In this demo we choose to store text,vector and metadata in the same cache, but other options\n", + "are possible: i.e. content can be store somewhere else and vector store could contain only a reference to the actual content." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6a42d3-c5ec-44ec-9b57-ebe5ca8c301a", + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "import gzip\n", + "import time\n", + "\n", + "# Open the news file and process it as a csv\n", + "with gzip.open(\"bbc_news.csv.gz\", \"rt\", newline=\"\") as csvfile:\n", + " spamreader = csv.reader(csvfile, delimiter=\",\", quotechar='\"')\n", + " i = 0\n", + " texts = []\n", + " metas = []\n", + " embeds = []\n", + " for row in spamreader:\n", + " # first and fifth value are joined to form the content\n", + " # to be processed\n", + " text = row[0] + \".\" + row[4]\n", + " texts.append(text)\n", + " # Storing meta\n", + " # Store text and title as metadata\n", + " meta = {}\n", + " meta[\"text\"] = row[4]\n", + " meta[\"title\"] = row[0]\n", + " metas.append(meta)\n", + " i = i + 1\n", + " # Change this to change the number of news you want to load\n", + " if i >= 5000:\n", + " break" + ] + }, + { + "cell_type": "markdown", + "id": "a6b00299-94db-43ca-9da3-45d12cdf2db1", + "metadata": {}, + "source": [ + "# Populate the vector store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75e135a6-1b38-48eb-96ca-379b6f4a653f", + "metadata": {}, + "outputs": [], + "source": [ + "# add texts and fill vector db\n", + "keys = ispnvs.add_texts(texts, metas)" + ] + }, + { + "cell_type": "markdown", + "id": "2bb6f053-208d-407e-b8b7-c6c6443522d8", + "metadata": {}, + "source": [ + "# A helper function that prints the result documents\n", + "\n", + "By default InfinispanVS returns the protobuf `text` field in the `Document.page_content`\n", + "and all the remaining protobuf fields (except the vector) in the `metadata`. This behaviour is\n", + "configurable via lambda functions at setup."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "594fad38-37f0-4dd4-9785-a99a2f009ae5", + "metadata": {}, + "outputs": [], + "source": [ + "def print_docs(docs):\n", + " for res, i in zip(docs, range(len(docs))):\n", + " print(\"----\" + str(i + 1) + \"----\")\n", + " print(\"TITLE: \" + res.metadata[\"title\"])\n", + " print(res.page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "cfa517c7-e741-4f64-9736-6db7a6bd259a", + "metadata": {}, + "source": [ + "# Try it!!!\n", + "\n", + "Below some sample queries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86e782b3-5a74-4ca1-a5d1-c0ee935a659e", + "metadata": {}, + "outputs": [], + "source": [ + "docs = ispnvs.similarity_search(\"European nations\", 5)\n", + "print_docs(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b60847f9-ef34-4c79-b276-ac62170e2d6a", + "metadata": {}, + "outputs": [], + "source": [ + "print_docs(ispnvs.similarity_search(\"Milan fashion week begins\", 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cbb5607-da55-4879-92cf-79ac690cc0c5", + "metadata": {}, + "outputs": [], + "source": [ + "print_docs(ispnvs.similarity_search(\"Stock market is rising today\", 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3bb94ca1-7b1e-41ed-9d8f-b845775d11c1", + "metadata": {}, + "outputs": [], + "source": [ + "print_docs(ispnvs.similarity_search(\"Why cats are so viral?\", 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4fca208-b580-483d-9be0-786b6b63a31d", + "metadata": {}, + "outputs": [], + "source": [ + "print_docs(ispnvs.similarity_search(\"How to stay young\", 5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "862e4af2-9f8a-4985-90cb-997477901b1e", + "metadata": {}, + "outputs": [], + "source": [ + "# Clean up\n", + "ispnvs.schema_delete()\n", + "ispnvs.cache_delete()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "d4a460b8-f0c8-4ae9-a7ff-cf550c3195f1", + "metadata": {}, + "outputs": [], + "source": [ + "!docker rm --force infinispanvs-demo" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index 806942bb3fde7..80f7420382366 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -240,6 +240,12 @@ def _import_hologres() -> Any: return Hologres +def _import_infinispanvs() -> Any: + from langchain_community.vectorstores.infinispanvs import InfinispanVS + + return InfinispanVS + + def _import_kdbai() -> Any: from langchain_community.vectorstores.kdbai import KDBAI @@ -569,6 +575,8 @@ def __getattr__(name: str) -> Any: return _import_hanavector() elif name == "Hologres": return _import_hologres() + elif name == "InfinispanVS": + return _import_infinispanvs() elif name == "KDBAI": return _import_kdbai() elif name == "DistanceStrategy": @@ -696,6 +704,7 @@ def __getattr__(name: str) -> Any: "FAISS", "HanaDB", "Hologres", + "InfinispanVS", "KDBAI", "DistanceStrategy", "Kinetica", diff --git a/libs/community/langchain_community/vectorstores/infinispanvs.py b/libs/community/langchain_community/vectorstores/infinispanvs.py new file mode 100644 index 0000000000000..9ad59ebc1395b --- /dev/null +++ b/libs/community/langchain_community/vectorstores/infinispanvs.py @@ -0,0 +1,506 @@ +"""Module providing Infinispan as a VectorStore""" + +from __future__ 
import annotations + +import json +import logging +import uuid +from typing import ( + Any, + Iterable, + List, + Optional, + Tuple, + Type, +) + +import requests +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + +logger = logging.getLogger(__name__) + + +class InfinispanVS(VectorStore): + """`Infinispan` VectorStore interface. + + This class exposes the method to present Infinispan as a + VectorStore. It relies on the Infinispan class (below) which takes care + of the REST interface with the server. + + Example: + .. code-block:: python + + from langchain_community.vectorstores import InfinispanVS + from mymodels import RGBEmbeddings + + vectorDb = InfinispanVS.from_documents(docs, + embedding=RGBEmbeddings(), + output_fields=["texture", "color"], + lambda_key=lambda text,meta: str(meta["_key"]), + lambda_content=lambda item: item["color"]) + + """ + + def __init__( + self, + embedding: Optional[Embeddings] = None, + ids: Optional[List[str]] = None, + clear_old: Optional[bool] = True, + **kwargs: Any, + ): + self.ispn = Infinispan(**kwargs) + self._configuration = kwargs + self._cache_name = str(self._configuration.get("cache_name", "vector")) + self._entity_name = str(self._configuration.get("entity_name", "vector")) + self._embedding = embedding + self._textfield = self._configuration.get("textfield", "text") + self._vectorfield = self._configuration.get("vectorfield", "vector") + self._to_content = self._configuration.get( + "lambda_content", lambda item: self._default_content(item) + ) + self._to_metadata = self._configuration.get( + "lambda_metadata", lambda item: self._default_metadata(item) + ) + self._output_fields = self._configuration.get("output_fields") + self._ids = ids + if clear_old: + self.ispn.cache_clear(self._cache_name) + + def _default_metadata(self, item: dict) -> dict: + meta = dict(item) + meta.pop(self._vectorfield, None) + 
meta.pop(self._textfield, None) + meta.pop("_type", None) + return meta + + def _default_content(self, item: dict[str, Any]) -> Any: + return item.get(self._textfield) + + def schema_create(self, proto: str) -> requests.Response: + """Deploy the schema for the vector db + Args: + proto(str): protobuf schema + Returns: + An http Response containing the result of the operation + """ + return self.ispn.schema_post(self._entity_name + ".proto", proto) + + def schema_delete(self) -> requests.Response: + """Delete the schema for the vector db + Returns: + An http Response containing the result of the operation + """ + return self.ispn.schema_delete(self._entity_name + ".proto") + + def cache_create(self, config: str = "") -> requests.Response: + """Create the cache for the vector db + Args: + config(str): configuration of the cache. + Returns: + An http Response containing the result of the operation + """ + if config == "": + config = ( + ''' + { + "distributed-cache": { + "owners": "2", + "mode": "SYNC", + "statistics": true, + "encoding": { + "media-type": "application/x-protostream" + }, + "indexing": { + "enabled": true, + "storage": "filesystem", + "startup-mode": "AUTO", + "indexing-mode": "AUTO", + "indexed-entities": [ + "''' + + self._entity_name + + """" + ] + } + } +} +""" + ) + return self.ispn.cache_post(self._cache_name, config) + + def cache_delete(self) -> requests.Response: + """Delete the cache for the vector db + Returns: + An http Response containing the result of the operation + """ + return self.ispn.cache_delete(self._cache_name) + + def cache_clear(self) -> requests.Response: + """Clear the cache for the vector db + Returns: + An http Response containing the result of the operation + """ + return self.ispn.cache_clear(self._cache_name) + + def cache_index_clear(self) -> requests.Response: + """Clear the index for the vector db + Returns: + An http Response containing the result of the operation + """ + return 
self.ispn.index_clear(self._cache_name) + + def cache_index_reindex(self) -> requests.Response: + """Rebuild the index for the vector db + Returns: + An http Response containing the result of the operation + """ + return self.ispn.index_reindex(self._cache_name) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + result = [] + embeds = self._embedding.embed_documents(list(texts)) # type: ignore + if not metadatas: + metadatas = [{} for _ in embeds] # texts may be a one-shot iterable + ids = self._ids or [str(uuid.uuid4()) for _ in embeds] + data_input = list(zip(metadatas, embeds, ids)) + for metadata, embed, key in data_input: + data = {"_type": self._entity_name, self._vectorfield: embed} + data.update(metadata) + data_str = json.dumps(data) + self.ispn.put(key, data_str, self._cache_name) + result.append(key) + return result + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query.""" + documents = self.similarity_search_with_score(query=query, k=k) + return [doc for doc, _ in documents] + + def similarity_search_with_score( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Perform a search on a query string and return results with score. + + Args: + query (str): The text being searched. + k (int, optional): The amount of results to return. Defaults to 4.
+ + Returns: + List[Tuple[Document, float]] + """ + embed = self._embedding.embed_query(query) # type: ignore + documents = self.similarity_search_with_score_by_vector(embedding=embed, k=k) + return documents + + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + res = self.similarity_search_with_score_by_vector(embedding, k) + return [doc for doc, _ in res] + + def similarity_search_with_score_by_vector( + self, embedding: List[float], k: int = 4 + ) -> List[Tuple[Document, float]]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of pair (Documents, score) most similar to the query vector. + """ + if self._output_fields is None: + query_str = ( + "select v, score(v) from " + + self._entity_name + + " v where v." + + self._vectorfield + + " <-> " + + json.dumps(embedding) + + "~" + + str(k) + ) + else: + query_proj = "select " + for field in self._output_fields[:-1]: + query_proj = query_proj + "v." + field + "," + query_proj = query_proj + "v." + self._output_fields[-1] + query_str = ( + query_proj + + ", score(v) from " + + self._entity_name + + " v where v." 
+ + self._vectorfield + + " <-> " + + json.dumps(embedding) + + "~" + + str(k) + ) + query_res = self.ispn.req_query(query_str, self._cache_name) + result = json.loads(query_res.text) + return self._query_result_to_docs(result) + + def _query_result_to_docs( + self, result: dict[str, Any] + ) -> List[Tuple[Document, float]]: + documents = [] + for row in result["hits"]: + hit = row["hit"] or {} + if self._output_fields is None: + entity = hit["*"] + else: + entity = {key: hit.get(key) for key in self._output_fields} + doc = Document( + page_content=self._to_content(entity), + metadata=self._to_metadata(entity), + ) + documents.append((doc, hit["score()"])) + return documents + + @classmethod + def from_texts( + cls: Type[InfinispanVS], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + clear_old: Optional[bool] = None, + **kwargs: Any, + ) -> InfinispanVS: + """Return VectorStore initialized from texts and embeddings.""" + infinispanvs = cls(embedding=embedding, ids=ids, clear_old=clear_old, **kwargs) + if texts: + infinispanvs.add_texts(texts, metadatas) + return infinispanvs + + +REST_TIMEOUT = 10 + + +class Infinispan: + """Helper class for `Infinispan` REST interface. + + This class exposes the Infinispan operations needed to + create and set up a vector db. + + You need a running Infinispan (15+) server without authentication. 
+ You can easily start one, see: https://github.com/rigazilla/infinispan-vector#run-infinispan + """ + + def __init__(self, **kwargs: Any): + self._configuration = kwargs + self._schema = str(self._configuration.get("schema", "http")) + self._host = str(self._configuration.get("hosts", ["127.0.0.1:11222"])[0]) + self._default_node = self._schema + "://" + self._host + self._cache_url = str(self._configuration.get("cache_url", "/rest/v2/caches")) + self._schema_url = str(kwargs.get("schema_url", "/rest/v2/schemas")) + self._use_post_for_query = bool( + self._configuration.get("use_post_for_query", True) + ) + + def req_query( + self, query: str, cache_name: str, local: bool = False + ) -> requests.Response: + """Request a query + Args: + query(str): query requested + cache_name(str): name of the target cache + local(boolean): whether the query is local or clustered + Returns: + An http Response containing the result set or errors + """ + if self._use_post_for_query: + return self._query_post(query, cache_name, local) + return self._query_get(query, cache_name, local) + + def _query_post( + self, query_str: str, cache_name: str, local: bool = False + ) -> requests.Response: + api_url = ( + self._default_node + + self._cache_url + + "/" + + cache_name + + "?action=search&local=" + + str(local) + ) + data = {"query": query_str} + data_json = json.dumps(data) + response = requests.post( + api_url, + data_json, + headers={"Content-Type": "application/json"}, + timeout=REST_TIMEOUT, + ) + return response + + def _query_get( + self, query_str: str, cache_name: str, local: bool = False + ) -> requests.Response: + api_url = ( + self._default_node + + self._cache_url + + "/" + + cache_name + + "?action=search&query=" + + query_str + + "&local=" + + str(local) + ) + response = requests.get(api_url, timeout=REST_TIMEOUT) + return response + + def post(self, key: str, data: str, cache_name: str) -> requests.Response: + """Post an entry + Args: + key(str): key of the
entry + data(str): content of the entry in json format + cache_name(str): target cache + Returns: + An http Response containing the result of the operation + """ + api_url = self._default_node + self._cache_url + "/" + cache_name + "/" + key + response = requests.post( + api_url, + data, + headers={"Content-Type": "application/json"}, + timeout=REST_TIMEOUT, + ) + return response + + def put(self, key: str, data: str, cache_name: str) -> requests.Response: + """Put an entry + Args: + key(str): key of the entry + data(str): content of the entry in json format + cache_name(str): target cache + Returns: + An http Response containing the result of the operation + """ + api_url = self._default_node + self._cache_url + "/" + cache_name + "/" + key + response = requests.put( + api_url, + data, + headers={"Content-Type": "application/json"}, + timeout=REST_TIMEOUT, + ) + return response + + def get(self, key: str, cache_name: str) -> requests.Response: + """Get an entry + Args: + key(str): key of the entry + cache_name(str): target cache + Returns: + An http Response containing the entry or errors + """ + api_url = self._default_node + self._cache_url + "/" + cache_name + "/" + key + response = requests.get( + api_url, headers={"Content-Type": "application/json"}, timeout=REST_TIMEOUT + ) + return response + + def schema_post(self, name: str, proto: str) -> requests.Response: + """Deploy a schema + Args: + name(str): name of the schema. Will be used as a key + proto(str): protobuf schema + Returns: + An http Response containing the result of the operation + """ + api_url = self._default_node + self._schema_url + "/" + name + response = requests.post(api_url, proto, timeout=REST_TIMEOUT) + return response + + def cache_post(self, name: str, config: str) -> requests.Response: + """Create a cache + Args: + name(str): name of the cache. + config(str): configuration of the cache. 
+ Returns: + An http Response containing the result of the operation + """ + api_url = self._default_node + self._cache_url + "/" + name + response = requests.post( + api_url, + config, + headers={"Content-Type": "application/json"}, + timeout=REST_TIMEOUT, + ) + return response + + def schema_delete(self, name: str) -> requests.Response: + """Delete a schema + Args: + name(str): name of the schema. + Returns: + An http Response containing the result of the operation + """ + api_url = self._default_node + self._schema_url + "/" + name + response = requests.delete(api_url, timeout=REST_TIMEOUT) + return response + + def cache_delete(self, name: str) -> requests.Response: + """Delete a cache + Args: + name(str): name of the cache. + Returns: + An http Response containing the result of the operation + """ + api_url = self._default_node + self._cache_url + "/" + name + response = requests.delete(api_url, timeout=REST_TIMEOUT) + return response + + def cache_clear(self, cache_name: str) -> requests.Response: + """Clear a cache + Args: + cache_name(str): name of the cache. + Returns: + An http Response containing the result of the operation + """ + api_url = ( + self._default_node + self._cache_url + "/" + cache_name + "?action=clear" + ) + response = requests.post(api_url, timeout=REST_TIMEOUT) + return response + + def index_clear(self, cache_name: str) -> requests.Response: + """Clear an index on a cache + Args: + cache_name(str): name of the cache. + Returns: + An http Response containing the result of the operation + """ + api_url = ( + self._default_node + + self._cache_url + + "/" + + cache_name + + "/search/indexes?action=clear" + ) + return requests.post(api_url, timeout=REST_TIMEOUT) + + def index_reindex(self, cache_name: str) -> requests.Response: + """Rebuild index on a cache + Args: + cache_name(str): name of the cache. 
+ Returns: + An http Response containing the result of the operation + """ + api_url = ( + self._default_node + + self._cache_url + + "/" + + cache_name + + "/search/indexes?action=reindex" + ) + return requests.post(api_url, timeout=REST_TIMEOUT) diff --git a/libs/community/tests/integration_tests/vectorstores/test_infinispanvs.py b/libs/community/tests/integration_tests/vectorstores/test_infinispanvs.py new file mode 100644 index 0000000000000..a5464d75151f2 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/test_infinispanvs.py @@ -0,0 +1,135 @@ +"""Test Infinispan functionality.""" +from typing import Any, List, Optional + +from langchain_core.documents import Document + +from langchain_community.vectorstores import InfinispanVS +from tests.integration_tests.vectorstores.fake_embeddings import ( + FakeEmbeddings, + fake_texts, +) + + +def _infinispan_setup() -> None: + ispnvs = InfinispanVS() + ispnvs.cache_delete() + ispnvs.schema_delete() + proto = """ + /** + * @Indexed + */ + message vector { + /** + * @Vector(dimension=10) + */ + repeated float vector = 1; + optional string text = 2; + optional string label = 3; + optional int32 page = 4; + } + """ + ispnvs.schema_create(proto) + ispnvs.cache_create() + ispnvs.cache_index_clear() + + +def _infinispanvs_from_texts( + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + clear_old: Optional[bool] = True, + **kwargs: Any, +) -> InfinispanVS: + texts = [{"text": t} for t in fake_texts] + if metadatas is None: + metadatas = texts + else: + [m.update(t) for (m, t) in zip(metadatas, texts)] + return InfinispanVS.from_texts( + fake_texts, + FakeEmbeddings(), + metadatas=metadatas, + ids=ids, + clear_old=clear_old, + **kwargs, + ) + + +def test_infinispan() -> None: + """Test end to end construction and search.""" + _infinispan_setup() + docsearch = _infinispanvs_from_texts() + output = docsearch.similarity_search("foo", k=1) + assert output == 
[Document(page_content="foo")] + + +def test_infinispan_with_metadata() -> None: + """Test with metadata""" + _infinispan_setup() + meta = [] + for _ in range(len(fake_texts)): + meta.append({"label": "test"}) + docsearch = _infinispanvs_from_texts(metadatas=meta) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"label": "test"})] + + +def test_infinispan_with_metadata_with_output_fields() -> None: + """Test with metadata""" + _infinispan_setup() + metadatas = [{"page": i, "label": "label" + str(i)} for i in range(len(fake_texts))] + c = {"output_fields": ["label", "page", "text"]} + docsearch = _infinispanvs_from_texts(metadatas=metadatas, configuration=c) + output = docsearch.similarity_search("foo", k=1) + assert output == [ + Document(page_content="foo", metadata={"label": "label0", "page": 0}) + ] + + +def test_infinispanvs_with_id() -> None: + """Test with ids""" + ids = ["id_" + str(i) for i in range(len(fake_texts))] + docsearch = _infinispanvs_from_texts(ids=ids) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_infinispan_with_score() -> None: + """Test end to end construction and search with scores and IDs.""" + _infinispan_setup() + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = _infinispanvs_from_texts(metadatas=metadatas) + output = docsearch.similarity_search_with_score("foo", k=3) + docs = [o[0] for o in output] + scores = [o[1] for o in output] + assert docs == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + Document(page_content="baz", metadata={"page": 2}), + ] + assert scores[0] >= scores[1] >= scores[2] + + +def test_infinispan_add_texts() -> None: + """Test end to end construction and MRR search.""" + _infinispan_setup() + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] 
+ docsearch = _infinispanvs_from_texts(metadatas=metadatas) + + docsearch.add_texts(texts, metadatas) + + output = docsearch.similarity_search("foo", k=10) + assert len(output) == 6 + + +def test_infinispan_no_clear_old() -> None: + """Test end to end construction and MRR search.""" + _infinispan_setup() + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = _infinispanvs_from_texts(metadatas=metadatas) + del docsearch + docsearch = _infinispanvs_from_texts(metadatas=metadatas, clear_old=False) + output = docsearch.similarity_search("foo", k=10) + assert len(output) == 6 diff --git a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index 808da73b55978..1e963c3368445 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -31,6 +31,7 @@ "FAISS", "HanaDB", "Hologres", + "InfinispanVS", "KDBAI", "Kinetica", "KineticaSettings",