From d262d41cc0667536f9da568afa8efa42327b7d4d Mon Sep 17 00:00:00 2001
From: Tari Yekorogha
Date: Mon, 16 Dec 2024 20:37:55 +0100
Subject: [PATCH] =?UTF-8?q?community:=20added=20FalkorDB=20vector=20store?=
 =?UTF-8?q?=20support=20i.e=20implementation,=20test,=20docs=20an=E2=80=A6?=
 =?UTF-8?q?=20(#26245)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Description:** Added support for the FalkorDB vector store, including its
implementation, unit tests, documentation, and an example notebook. The
FalkorDB integration allows users to efficiently manage and query embeddings
in a vector database, with relevance scoring and maximal marginal relevance
search. The following components were implemented:

- Core implementation for the FalkorDBVector store.
- Unit tests ensuring proper functionality and edge case coverage.
- Example notebook demonstrating an end-to-end setup, search, and retrieval
  using FalkorDB.

**Twitter handle:** @tariyekorogha

---------

Co-authored-by: Erick Friis
---
 .../vectorstores/falkordbvector.ipynb         |  437 ++++
 .../vectorstores/falkordb_vector.py           | 1858 +++++++++++++++++
 .../test_falkordb_vector_integration.py       |  671 ++++++
 .../test_falkordb_vector_utils.py             |   24 +
 4 files changed, 2990 insertions(+)
 create mode 100644 docs/docs/integrations/vectorstores/falkordbvector.ipynb
 create mode 100644 libs/community/langchain_community/vectorstores/falkordb_vector.py
 create mode 100644 libs/community/tests/integration_tests/vectorstores/test_falkordb_vector_integration.py
 create mode 100644 libs/community/tests/unit_tests/vectorstores/test_falkordb_vector_utils.py

diff --git a/docs/docs/integrations/vectorstores/falkordbvector.ipynb b/docs/docs/integrations/vectorstores/falkordbvector.ipynb
new file mode 100644
index 0000000000000..ecc0aa68bf008
--- /dev/null
+++ b/docs/docs/integrations/vectorstores/falkordbvector.ipynb
@@ -0,0 +1,437 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# FalkorDBVectorStore\n",
+    "FalkorDB is an open-source graph database with integrated support for vector similarity search.\n",
+    "\n",
+    "It supports:\n",
+    "- Approximate nearest neighbor search\n",
+    "- Euclidean and cosine similarity\n",
+    "- Hybrid search combining vector and keyword searches\n",
+    "\n",
+    "This notebook shows how to use the FalkorDB vector index (`FalkorDBVector`).\n",
+    "\n",
+    "See the installation instructions below.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Install the necessary packages\n",
+    "%pip install --upgrade falkordb\n",
+    "%pip install --upgrade tiktoken\n",
+    "%pip install --upgrade langchain langchain_huggingface"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Credentials\n",
+    "We want to use `HuggingFace` embeddings, so we have to get a HuggingFace API key."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "if \"HUGGINGFACE_API_KEY\" not in os.environ:\n",
+    "    os.environ[\"HUGGINGFACE_API_KEY\"] = getpass.getpass(\"HuggingFace API Key:\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you want automated tracing of your model calls, you can also set your LangSmith API key by uncommenting the lines below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
+    "# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.vectorstores.falkordb_vector import FalkorDBVector\n",
+    "from langchain_core.documents import Document\n",
+    "from langchain_huggingface import HuggingFaceEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can use FalkorDBVector locally with Docker. See the installation instructions."
+   ]
+  },
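+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For example, you can start a local FalkorDB instance with Docker (a minimal sketch using the `falkordb/falkordb` image; adjust the port mapping to your setup): `docker run -p 6379:6379 -it --rm falkordb/falkordb`"
+   ]
+  },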
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "host = \"localhost\"\n",
+    "port = 6379"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Or you can use FalkorDBVector with FalkorDB Cloud:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# E.g.\n",
+    "# host = \"r-6jissuruar.instance-zwb082gpf.hc-v8noonp0c.europe-west1.gcp.f2e0a955bb84.cloud\"\n",
+    "# port = 62471\n",
+    "# username = \"falkordb\"  # SET ON FALKORDB CLOUD\n",
+    "# password = \"password\"  # SET ON FALKORDB CLOUD"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vector_store = FalkorDBVector(host=host, port=port, embedding=HuggingFaceEmbeddings())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Manage vector store"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add items to vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['1', '2', '3']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain_core.documents import Document\n",
+    "\n",
+    "document_1 = Document(page_content=\"foo\", metadata={\"source\": \"https://example.com\"})\n",
+    "\n",
+    "document_2 = Document(page_content=\"bar\", metadata={\"source\": \"https://example.com\"})\n",
+    "\n",
+    "document_3 = Document(page_content=\"baz\", metadata={\"source\": \"https://example.com\"})\n",
+    "\n",
+    "documents = [document_1, document_2, document_3]\n",
+    "\n",
+    "vector_store.add_documents(documents=documents, ids=[\"1\", \"2\", \"3\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Update items in vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "updated_document = Document(\n",
+    "    page_content=\"qux\", metadata={\"source\": \"https://another-example.com\"}\n",
+    ")\n",
+    "\n",
+    "vector_store.update_documents(document_id=\"1\", document=updated_document)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Delete items from vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vector_store.delete(ids=[\"3\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Query vector store\n",
+    "\n",
+    "Once your vector store has been created and the relevant documents have been added, you will most likely want to query it while running your chain or agent."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query directly\n", + "\n", + "Performing a simple similarity search can be done as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* qux [{'text': 'qux', 'id': '1', 'source': 'https://another-example.com'}]\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search(\n", + " query=\"thud\", k=1, filter={\"source\": \"https://another-example.com\"}\n", + ")\n", + "for doc in results:\n", + " print(f\"* {doc.page_content} [{doc.metadata}]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to execute a similarity search and receive the corresponding scores you can run:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* [SIM=0.000001] bar [{'text': 'bar', 'id': '2', 'source': 'https://example.com'}]\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search_with_score(query=\"bar\")\n", + "for doc, score in results:\n", + " print(f\"* [SIM={score:3f}] {doc.page_content} [{doc.metadata}]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query by turning into retriever\n", + "You can also transform the vector store into a retriever for easier usage in your chains." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(metadata={'text': 'qux', 'id': '1', 'source': 'https://another-example.com'}, page_content='qux')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever = vector_store.as_retriever(search_type=\"mmr\", search_kwargs={\"k\": 1})\n", + "retriever.invoke(\"thud\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage for retrieval-augmented generation\n", + "For guides on how to use this vector store for retrieval-augmented generation (RAG), see the following sections:\n", + "- Tutorials: working with external knowledge\n", + "- How-to: Question and answer with RAG\n", + "- Retrieval conceptual docs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "For detailed documentation of all `FalkorDBVector` features and configurations head to the API reference: https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.falkordb_vector.FalkorDBVector.html" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/vectorstores/falkordb_vector.py b/libs/community/langchain_community/vectorstores/falkordb_vector.py new file mode 100644 index 0000000000000..d3a74177f93f4 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/falkordb_vector.py @@ -0,0 +1,1858 @@ +from __future__ import annotations + +import enum +import os +import random +import string +from hashlib import md5 +from typing import 
Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
+
+import numpy as np
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.vectorstores import VectorStore
+
+from langchain_community.graphs import FalkorDBGraph
+from langchain_community.vectorstores.utils import (
+    DistanceStrategy,
+    maximal_marginal_relevance,
+)
+
+
+def generate_random_string(length: int) -> str:
+    # Build the string from ASCII letters only (uppercase and lowercase)
+    characters = string.ascii_letters
+    # Randomly choose 'length' characters from the pool of possible characters
+    random_string = "".join(random.choice(characters) for _ in range(length))
+    return random_string
+
+
+DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE
+DISTANCE_MAPPING = {
+    DistanceStrategy.EUCLIDEAN_DISTANCE: "euclidean",
+    DistanceStrategy.COSINE: "cosine",
+}
+
+
+class SearchType(str, enum.Enum):
+    """
+    Enumerator for different search strategies in FalkorDB VectorStore.
+
+    - `SearchType.VECTOR`: This option searches using only
+      the vector indexes in the vectorstore, relying on the
+      similarity between vector embeddings to return
+      relevant results.
+
+    - `SearchType.HYBRID`: This option performs a combined search,
+      querying both the full-text indexes and the vector indexes.
+      It integrates traditional text search with vector-based
+      search for more comprehensive results.
+    """
+
+    VECTOR = "vector"
+    HYBRID = "hybrid"
+
+
+DEFAULT_SEARCH_TYPE = SearchType.VECTOR
+
+
+class IndexType(str, enum.Enum):
+    """Enumerator of the index types."""
+
+    NODE = "NODE"
+    RELATIONSHIP = "RELATIONSHIP"
+
+
+DEFAULT_INDEX_TYPE = IndexType.NODE
+
+
+def dict_to_yaml_str(input_dict: Dict, indent: int = 0) -> str:
+    """
+    Convert a dictionary to a YAML-like string without using external libraries.
+
+    Parameters:
+    - input_dict (dict): The dictionary to convert.
+    - indent (int): The current indentation level.
+
+    Returns:
+    - str: The YAML-like string representation of the input dictionary.
+    """
+    yaml_str = ""
+    for key, value in input_dict.items():
+        padding = " " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{padding}{key}:\n{dict_to_yaml_str(value, indent + 1)}"
+        elif isinstance(value, list):
+            yaml_str += f"{padding}{key}:\n"
+            for item in value:
+                yaml_str += f"{padding}- {item}\n"
+        else:
+            yaml_str += f"{padding}{key}: {value}\n"
+    return yaml_str
+
+
+def construct_metadata_filter(
+    filter: Optional[Dict[str, Any]] = None,
+) -> Tuple[str, Dict[str, Any]]:
+    """
+    Construct a metadata filter by directly injecting
+    the filter values into the query.
+
+    Args:
+        filter (Optional[Dict[str, Any]]): Dictionary
+            representing the filter condition.
+
+    Returns:
+        Tuple[str, Dict[str, Any]]: Filter snippet
+            and an empty dictionary (since
+            we don't need parameters).
+    """
+    if not filter:
+        return "", {}
+
+    filter_snippet = ""
+
+    for key, value in filter.items():
+        if filter_snippet:
+            filter_snippet += " AND "
+
+        # If the value is a string, wrap it in quotes. Otherwise, directly
+        # inject the value.
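+        # For example, a filter such as {"source": "https://example.com", "page": 1}
+        # (hypothetical values) produces
+        # "n.source = 'https://example.com' AND n.page = 1".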
+        if isinstance(value, str):
+            filter_snippet += f"n.{key} = '{value}'"
+        else:
+            filter_snippet += f"n.{key} = {value}"
+
+    return filter_snippet, {}
+
+
+def _get_search_index_query(
+    search_type: SearchType, index_type: IndexType = DEFAULT_INDEX_TYPE
+) -> str:
+    if index_type == IndexType.NODE:
+        if search_type == SearchType.VECTOR:
+            return (
+                "CALL db.idx.vector.queryNodes($entity_label, "
+                "$entity_property, $k, vecf32($embedding)) "
+                "YIELD node, score "
+            )
+        elif search_type == SearchType.HYBRID:
+            return (
+                "CALL { "
+                "CALL db.idx.vector.queryNodes($entity_label, "
+                "$entity_property, $k, vecf32($embedding)) "
+                "YIELD node, score "
+                "WITH collect({node: node, score: score})"
+                " AS nodes, max(score) AS max_score "
+                "UNWIND nodes AS n "
+                "RETURN n.node AS node, (n.score / max_score) AS score "
+                "UNION "
+                "CALL db.idx.fulltext.queryNodes($entity_label, $query) "
+                "YIELD node, score "
+                "WITH collect({node: node, score: score})"
+                " AS nodes, max(score) AS max_score "
+                "UNWIND nodes AS n "
+                "RETURN n.node AS node, (n.score / max_score) AS score "
+                "} "
+                "WITH node, max(score) AS score "
+                "ORDER BY score DESC LIMIT $k "
+            )
+    elif index_type == IndexType.RELATIONSHIP:
+        return (
+            "CALL db.idx.vector.queryRelationships"
+            "($entity_label, $entity_property, $k, vecf32($embedding)) "
+            "YIELD relationship, score "
+        )
+    raise ValueError(
+        f"Unsupported combination of index type {index_type} "
+        f"and search type {search_type}"
+    )
+
+
+def process_index_data(data: List[List[Any]]) -> List[Dict[str, Any]]:
+    """
+    Processes a nested list of entity data
+    to extract information about labels,
+    entity types, properties, index types,
+    and index details (if applicable).
+
+    Args:
+        data (List[List[Any]]): A nested list containing
+            details about entities, their properties, index
+            types, and configuration information.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries where each dictionary
+        contains:
+            - entity_label (str): The label or name of the
+              entity or relationship (e.g., 'Person', 'Song').
+            - entity_property (str): The property of the entity
+              or relationship on which an index
+              was created (e.g., 'first_name').
+            - index_type (str or List[str]): The type(s)
+              of index applied to the property (e.g.,
+              'FULLTEXT', 'VECTOR').
+            - index_status (str): The status of the index
+              (e.g., 'OPERATIONAL', 'PENDING').
+            - index_dimension (Optional[int]): The dimension
+              of the vector index, if applicable.
+            - index_similarityFunction (Optional[str]): The
+              similarity function used by the vector
+              index, if applicable.
+            - entity_type (str): The type of index owner,
+              either 'NODE' or 'RELATIONSHIP'.
+
+    Notes:
+        - The entity label is extracted from the first
+          element of each entity list.
+        - The entity property and associated index types
+          are extracted from the second element.
+        - If the index type includes 'VECTOR', additional
+          details such as dimension and similarity function
+          are extracted from the entity configuration.
+        - The function handles cases where entities have
+          multiple index types (e.g., both 'FULLTEXT' and 'VECTOR').
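+
+    Example:
+        A single row from ``CALL db.indexes()`` might look like this
+        (illustrative values only; the positions follow the parsing below):
+
+        .. code-block:: python
+
+            row = [
+                "Chunk",  # [0] entity label
+                None,  # [1] unused here
+                {"embedding": ["VECTOR"]},  # [2] property -> index types
+                {"embedding": {"dimension": 384, "similarityFunction": "cosine"}},
+                None,  # [4] unused here
+                None,  # [5] unused here
+                "NODE",  # [6] entity type
+                "OPERATIONAL",  # [7] index status
+            ]
+            info = process_index_data([row])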
+ """ + + result = [] + + for entity in data: + # Extract basic information + + entity_label = entity[0] + + index_type_dict = entity[2] + + index_status = entity[7] + + entity_type = entity[6] + + # Process each property and its index type(s) + for prop, index_types in index_type_dict.items(): + entity_info = { + "entity_label": entity_label, + "entity_property": prop, + "entity_type": entity_type, + "index_type": index_types[0], + "index_status": index_status, + "index_dimension": None, + "index_similarityFunction": None, + } + + # Check for VECTOR type and extract additional details + if "VECTOR" in index_types: + if isinstance(entity[3], str): + entity_info["index_dimension"] = None + entity_info["index_similarityFunction"] = None + else: + vector_info = entity[3].get(prop, {}) + entity_info["index_dimension"] = vector_info.get("dimension") + entity_info["index_similarityFunction"] = vector_info.get( + "similarityFunction" + ) + + result.append(entity_info) + + return result + + +class FalkorDBVector(VectorStore): + """`FalkorDB` vector index. + + To use, you should have the ``falkordb`` python package installed + + Args: + host: FalkorDB host + port: FalkorDB port + username: Optionally provide your username + details if you are connecting to a + FalkorDB Cloud database instance + password: Optionally provide your password + details if you are connecting to a + FalkorDB Cloud database instance + embedding: Any embedding function implementing + `langchain.embeddings.base.Embeddings` interface. + distance_strategy The distance strategy to use. + (default: "EUCLIDEAN") + pre_delete_collection: If True, will delete + existing data if it exists.(default: + False). Useful for testing. + search_type: Similiarity search type to use. + Could be either SearchType.VECTOR or + SearchType.HYBRID (default: + SearchType.VECTOR) + database: Optionally provide the name of the + database to use else FalkorDBVector will + generate a random database for you. + node_label: Provide the label of the node you + want the embeddings of your data to be + stored in. (default: "Chunk") + relation_type: Provide the relationship type + of the relationship you want the + embeddings of your data to be stored in. + (default: "") + embedding_node_property: Provide the name of + the property in which you want your + embeddings to be stored. (default: "embedding") + text_node_property: Provide the name of + the property in which you want your texts + to be stored. (default: "text") + embedding_dimension: Provide the dimension + of your embeddings or it will be + calculated for you. + retrieval_query: Optionally a provide a + retrieval_query else the default + retrieval query will be used. + index_type: Provide the index type for the + VectorStore else the default index + type will be used. + graph: Optionally provide the graph you + would like to use + relevance_score_fn: Optionally provide a + function that computes a relevance score + based on the similarity score returned by + the search. + ssl: Specify whether the connection to the + database should be secured using SSL/TLS + encryption (default: False) + + Example: + .. 
+        .. code-block:: python
+
+            from langchain_community.document_loaders import TextLoader
+            from langchain_community.embeddings.openai import OpenAIEmbeddings
+            from langchain_community.vectorstores.falkordb_vector import FalkorDBVector
+            from langchain_text_splitters import CharacterTextSplitter
+
+            host = "localhost"
+            port = 6379
+            raw_documents = TextLoader('../../../state_of_the_union.txt').load()
+            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+            documents = text_splitter.split_documents(raw_documents)
+
+            embeddings = OpenAIEmbeddings()
+            vectorstore = FalkorDBVector.from_documents(
+                embedding=embeddings,
+                documents=documents,
+                host=host,
+                port=port,
+            )
+    """
+
+    def __init__(
+        self,
+        embedding: Embeddings,
+        *,
+        search_type: SearchType = SearchType.VECTOR,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        host: str = "localhost",
+        port: int = 6379,
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
+        database: Optional[str] = None,
+        node_label: str = "Chunk",
+        relation_type: str = "",
+        embedding_node_property: str = "embedding",
+        text_node_property: str = "text",
+        embedding_dimension: Optional[int] = None,
+        retrieval_query: Optional[str] = "",
+        index_type: IndexType = DEFAULT_INDEX_TYPE,
+        graph: Optional[FalkorDBGraph] = None,
+        relevance_score_fn: Optional[Callable[[float], float]] = None,
+        ssl: bool = False,
+        pre_delete_collection: bool = False,
+        metadata: Optional[List[Any]] = None,
+    ) -> None:
+        try:
+            import falkordb
+        except ImportError:
+            raise ImportError(
+                "Could not import falkordb python package. "
+                "Please install it with `pip install falkordb`"
+            )
+
+        try:
+            import redis.exceptions
+        except ImportError:
+            raise ImportError(
+                "Could not import redis.exceptions. "
+                "Please install it with `pip install redis`"
+            )
+
+        # Allow only cosine and euclidean distance strategies
+        if distance_strategy not in [
+            DistanceStrategy.EUCLIDEAN_DISTANCE,
+            DistanceStrategy.COSINE,
+        ]:
+            raise ValueError(
+                "`distance_strategy` must be either 'EUCLIDEAN_DISTANCE' or 'COSINE'"
+            )
+
+        # Graph object takes precedence over env or input params
+        if graph:
+            self._database = graph._graph
+            self._driver = graph._driver
+        else:
+            # Handle credentials via environment variables or input params
+            self._host = host
+            self._port = port
+            self._username = username or os.environ.get("FALKORDB_USERNAME")
+            self._password = password or os.environ.get("FALKORDB_PASSWORD")
+            self._ssl = ssl
+
+            # Initialize the FalkorDB connection
+            try:
+                self._driver = falkordb.FalkorDB(
+                    host=self._host,
+                    port=self._port,
+                    username=self._username,
+                    password=self._password,
+                    ssl=self._ssl,
+                )
+            except redis.exceptions.ConnectionError:
+                raise ValueError(
+                    "Could not connect to FalkorDB database. "
+                    "Please ensure that the host and port are correct"
+                )
+            except redis.exceptions.AuthenticationError:
+                raise ValueError(
+                    "Could not connect to FalkorDB database. "
+                    "Please ensure that the username and password are correct"
+                )
" + "Please ensure that the username and password are correct" + ) + + # Verify that required values are not null + if not embedding_node_property: + raise ValueError( + "The `embedding_node_property` must not be None or empty string" + ) + if not node_label: + raise ValueError("The `node_label` must not be None or empty string") + + self._database = self._driver.select_graph(database) + self.database_name = database + self.embedding = embedding + self.node_label = node_label + self.relation_type = relation_type + self.embedding_node_property = embedding_node_property + self.text_node_property = text_node_property + self._distance_strategy = distance_strategy + self.override_relevance_score_fn = relevance_score_fn + self.pre_delete_collection = pre_delete_collection + self.retrieval_query = retrieval_query + self.search_type = search_type + self._index_type = index_type + self.metadata = metadata + + # Calculate embedding_dimensions if not given + if not embedding_dimension: + self.embedding_dimension = len(self.embedding.embed_query("foo")) + + # Delete existing data if flagged + if pre_delete_collection: + self._database.query(f"""MATCH (n:`{self.node_label}`) DELETE n""") + + @property + def embeddings(self) -> Embeddings: + """Returns the `Embeddings` model being used by the Vectorstore""" + return self.embedding + + def _query( + self, + query: str, + *, + params: Optional[dict] = None, + retry_on_timeout: bool = True, + ) -> List[List]: + """ + This method sends a Cypher query to the connected FalkorDB database + and returns the results as a list of lists. + + Args: + query (str): The Cypher query to execute. + params (dict, optional): Dictionary of query parameters. Defaults to {}. + + Returns: + List[List]: List of Lists containing the query results + """ + params = params or {} + try: + data = self._database.query(query, params) + return data.result_set + except Exception as e: + if "Invalid input" in str(e): + raise ValueError(f"Cypher Statement is not valid\n{e}") + if retry_on_timeout: + return self._query(query, params=params, retry_on_timeout=False) + else: + raise e + + def retrieve_existing_node_index( + self, node_label: Optional[str] = "" + ) -> Tuple[Optional[int], Optional[str], Optional[str], Optional[str]]: + """ + Check if the vector index exists in the FalkorDB database + and returns its embedding dimension, entity_type, + entity_label, entity_property + + This method; + 1. queries the FalkorDB database for existing indexes + 2. attempts to retrieve the dimension of + the vector index with the specified node label + & index type + 3. If the index exists, its dimension is returned. + 4. Else if the index doesn't exist, `None` is returned. + + Returns: + int or None: The embedding dimension of the + existing index if found, + str or None: The entity type found. 
+            str or None: The label of the entity that the
+                vector index was created with.
+            str or None: The property of the entity for
+                which the vector index was created on.
+        """
+        if node_label:
+            pass
+        elif self.node_label:
+            node_label = self.node_label
+        else:
+            raise ValueError("`node_label` property must be set to use this function")
+
+        embedding_dimension = None
+        entity_type = None
+        entity_label = None
+        entity_property = None
+        index_information = self._database.query("CALL db.indexes()")
+
+        if index_information:
+            processed_index_information = process_index_data(
+                index_information.result_set
+            )
+            for index in processed_index_information:
+                if (
+                    index.get("entity_label", False) == node_label
+                    and index.get("entity_type", False) == "NODE"
+                ):
+                    if index["index_type"] == "VECTOR":
+                        embedding_dimension = int(index["index_dimension"])
+                        entity_type = str(index["entity_type"])
+                        entity_label = str(index["entity_label"])
+                        entity_property = str(index["entity_property"])
+                        break
+            if embedding_dimension and entity_type and entity_label and entity_property:
+                self._index_type = IndexType(entity_type)
+                return embedding_dimension, entity_type, entity_label, entity_property
+            else:
+                return None, None, None, None
+        else:
+            return None, None, None, None
+
+    def retrieve_existing_relationship_index(
+        self, relation_type: Optional[str] = ""
+    ) -> Tuple[Optional[int], Optional[str], Optional[str], Optional[str]]:
+        """
+        Check if the vector index exists in the FalkorDB database
+        and return its embedding dimension, entity_type, entity_label,
+        and entity_property.
+
+        This method:
+        1. queries the FalkorDB database for existing indexes
+        2. attempts to retrieve the dimension of the vector
+           index with the specified label & index type
+        3. If the index exists, its dimension is returned.
+        4. Else if the index doesn't exist, `None` is returned.
+
+        Returns:
+            int or None: The embedding dimension of the existing index if found.
+            str or None: The entity type found.
+            str or None: The label of the entity that
+                the vector index was created with.
+            str or None: The property of the entity for
+                which the vector index was created on.
+        """
+        if relation_type:
+            pass
+        elif self.relation_type:
+            relation_type = self.relation_type
+        else:
+            raise ValueError(
+                "Couldn't find any specified `relation_type`."
+                " Check if you spelled it correctly"
+            )
+
+        embedding_dimension = None
+        entity_type = None
+        entity_label = None
+        entity_property = None
+        index_information = self._database.query("CALL db.indexes()")
+
+        if index_information:
+            processed_index_information = process_index_data(
+                index_information.result_set
+            )
+            for index in processed_index_information:
+                if (
+                    index.get("entity_label", False) == relation_type
+                    and index.get("entity_type", False) == "RELATIONSHIP"
+                ):
+                    if index["index_type"] == "VECTOR":
+                        embedding_dimension = int(index["index_dimension"])
+                        entity_type = str(index["entity_type"])
+                        entity_label = str(index["entity_label"])
+                        entity_property = str(index["entity_property"])
+                        break
+            if embedding_dimension and entity_type and entity_label and entity_property:
+                self._index_type = IndexType(entity_type)
+                return embedding_dimension, entity_type, entity_label, entity_property
+            else:
+                return None, None, None, None
+        else:
+            return None, None, None, None
+
+    def retrieve_existing_fts_index(self) -> Optional[str]:
+        """
+        Check if a full-text index exists in the FalkorDB database.
+
+        This method queries the FalkorDB database for an existing
+        full-text index on `self.node_label`.
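+
+        Example:
+            .. code-block:: python
+
+                # `store` is an initialized FalkorDBVector instance
+                label = store.retrieve_existing_fts_index()
+                if label is None:
+                    store.create_new_keyword_index()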
+
+        Returns:
+            str: fulltext index entity label
+        """
+
+        entity_label = None
+        index_information = self._database.query("CALL db.indexes()")
+        if index_information:
+            processed_index_information = process_index_data(
+                index_information.result_set
+            )
+            for index in processed_index_information:
+                if index.get("entity_label", False) == self.node_label:
+                    if index["index_type"] == "FULLTEXT":
+                        entity_label = str(index["entity_label"])
+                        break
+
+            if entity_label:
+                return entity_label
+            else:
+                return None
+        else:
+            return None
+
+    def create_new_node_index(
+        self,
+        node_label: Optional[str] = "",
+        embedding_node_property: Optional[str] = "",
+        embedding_dimension: Optional[int] = None,
+    ) -> None:
+        """
+        This method creates a new vector index
+        on a node in FalkorDB.
+        """
+        if node_label:
+            pass
+        elif self.node_label:
+            node_label = self.node_label
+        else:
+            raise ValueError("`node_label` property must be set to use this function")
+
+        if embedding_node_property:
+            pass
+        elif self.embedding_node_property:
+            embedding_node_property = self.embedding_node_property
+        else:
+            raise ValueError(
+                "`embedding_node_property` property must be set to use this function"
+            )
+
+        if embedding_dimension:
+            pass
+        elif self.embedding_dimension:
+            embedding_dimension = self.embedding_dimension
+        else:
+            raise ValueError(
+                "`embedding_dimension` property must be set to use this function"
+            )
+        try:
+            self._database.create_node_vector_index(
+                node_label,
+                embedding_node_property,
+                dim=embedding_dimension,
+                similarity_function=DISTANCE_MAPPING[self._distance_strategy],
+            )
+        except Exception as e:
+            if "already indexed" in str(e):
+                raise ValueError(
+                    f"A vector index on (:{node_label}"
+                    "{"
+                    f"{embedding_node_property}"
+                    "}) has already been created"
+                )
+            else:
+                raise ValueError(f"Error occurred: {e}")
+
+    def create_new_index_on_relationship(
+        self,
+        relation_type: str = "",
+        embedding_node_property: str = "",
+        embedding_dimension: int = 0,
+    ) -> None:
+        """
+        This method creates a new vector index
+        on a relationship/edge in FalkorDB.
+        """
+        if relation_type:
+            pass
+        elif self.relation_type:
+            relation_type = self.relation_type
+        else:
+            raise ValueError("`relation_type` must be set to use this function")
+        if embedding_node_property:
+            pass
+        elif self.embedding_node_property:
+            embedding_node_property = self.embedding_node_property
+        else:
+            raise ValueError(
+                "`embedding_node_property` must be set to use this function"
+            )
+        if embedding_dimension and embedding_dimension != 0:
+            pass
+        elif self.embedding_dimension:
+            embedding_dimension = self.embedding_dimension
+        else:
+            raise ValueError("`embedding_dimension` must be set to use this function")
+
+        try:
+            self._database.create_edge_vector_index(
+                relation_type,
+                embedding_node_property,
+                dim=embedding_dimension,
+                similarity_function=DISTANCE_MAPPING[DEFAULT_DISTANCE_STRATEGY],
+            )
+        except Exception as e:
+            if "already indexed" in str(e):
+                raise ValueError(
+                    f"A vector index on [:{relation_type}"
+                    "{"
+                    f"{embedding_node_property}"
+                    "}] has already been created"
+                )
+            else:
+                raise ValueError(f"Error occurred: {e}")
+
+    def create_new_keyword_index(
+        self, text_node_properties: Optional[List[str]] = None
+    ) -> None:
+        """
+        This method constructs a Cypher query and executes it
+        to create a new full-text index in FalkorDB.
+
+        Args:
+            text_node_properties (List[str]): List of node properties
+                to be indexed. If not provided, defaults to
+                self.text_node_property.
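+
+        Example:
+            .. code-block:: python
+
+                # `store` is an initialized FalkorDBVector instance;
+                # the property names are illustrative
+                store.create_new_keyword_index(["title", "content"])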
+ """ + # Use the provided properties or default to self.text_node_property + node_props = text_node_properties or [self.text_node_property] + + # Dynamically pass node label and properties to create the full-text + # index + self._database.create_node_fulltext_index(self.node_label, *node_props) + + def add_embeddings( + self, + texts: Iterable[str], + embeddings: List[List[float]], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Add embeddings to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + embeddings: List of list of embedding vectors. + metadatas: List of metadatas associated with the texts. + kwargs: vectorstore specific parameters + """ + if ids is None: + ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] + + if not metadatas: + metadatas = [{} for _ in texts] + + self.metadata = [] + + # Check if all dictionaries are empty + if all(not metadata for metadata in metadatas): + pass + else: + # Initialize a set to keep track of unique non-empty keys + unique_non_empty_keys: set[str] = set() + + # Iterate over each metadata dictionary + for metadata in metadatas: + # Add keys with non-empty values to the set + unique_non_empty_keys.update( + key for key, value in metadata.items() if value + ) + + # Print unique non-empty keys + if unique_non_empty_keys: + self.metadata = list(unique_non_empty_keys) + + parameters = { + "data": [ + {"text": text, "metadata": metadata, "embedding": embedding, "id": id} + for text, metadata, embedding, id in zip( + texts, metadatas, embeddings, ids + ) + ] + } + + self._database.query( + "UNWIND $data AS row " + f"MERGE (c:`{self.node_label}` {{id: row.id}}) " + f"SET c.`{self.embedding_node_property}`" + f" = vecf32(row.embedding), c.`{self.text_node_property}`" + " = row.text, c += row.metadata", + params=parameters, + ) + + return ids + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + kwargs: vectorstore specific parameters + Returns: + List of ids from adding the texts into the vectorstore. + """ + embeddings = self.embedding.embed_documents(list(texts)) + return self.add_embeddings( + texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs + ) + + def add_documents( + self, + documents: List[Document], + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """ + This function takes List[Document] element(s) and populates + the existing store with a default node or default node(s) that + represent the element(s) and returns the id(s) of the newly created node(s). + + Args: + documents: the List[Document] element(s). + ids: Optional List of custom IDs to assign to the documents. + + Returns: + A list containing the id(s) of the newly created node in the store. 
+ """ + # Ensure the length of the ids matches the length of the documents if + # provided + if ids and len(ids) != len(documents): + raise ValueError("The number of ids must match the number of documents.") + + result_ids = [] + + # Add the documents to the store with custom or generated IDs + self.from_documents( + embedding=self.embedding, + documents=documents, + ) + + for i, doc in enumerate(documents): + page_content = doc.page_content + if ids: + # If custom IDs are provided, use them directly + assigned_id = ids[i] + self._query( + """ + MATCH (n) + WHERE n.text = $page_content + SET n.id = $assigned_id + """, + params={"page_content": page_content, "assigned_id": assigned_id}, + ) + result_ids.append(assigned_id) + + else: + # Use the existing logic to query the ID if no custom IDs were + # provided + result = self._query( + """ + MATCH (n) + WHERE n.text = $page_content + RETURN n.id + """, + params={"page_content": page_content}, + ) + try: + result_ids.append(result[0][0]) + + except Exception: + raise ValueError( + "Your document wasn't added to the store" + " successfully. Check your spellings." + ) + + return result_ids + + @classmethod + def from_texts( + cls: type[FalkorDBVector], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[Dict]] = None, # Optional + distance_strategy: Optional[DistanceStrategy] = None, # Optional + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> FalkorDBVector: + """ + Return FalkorDBVector initialized from texts and embeddings. + """ + embeddings = embedding.embed_documents(list(texts)) + + # Set default values if None + if metadatas is None: + metadatas = [{} for _ in texts] + if distance_strategy is None: + distance_strategy = DEFAULT_DISTANCE_STRATEGY + + return cls.__from( + texts, + embeddings, + embedding, + metadatas=metadatas, + ids=ids, + distance_strategy=distance_strategy, + **kwargs, + ) + + @classmethod + def __from( + cls, + texts: List[str], + embeddings: List[List[float]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + search_type: SearchType = SearchType.VECTOR, + **kwargs: Any, + ) -> FalkorDBVector: + if ids is None: + ids = [md5(text.encode("utf-8")).hexdigest() for text in texts] + + if not metadatas: + metadatas = [{} for _ in texts] + + store = cls( + embedding=embedding, + search_type=search_type, + **kwargs, + ) + + # Check if the vector index already exists + embedding_dimension, index_type, entity_label, entity_property = ( + store.retrieve_existing_node_index() + ) + + # Raise error if relationship index type + if index_type == "RELATIONSHIP": + raise ValueError( + "Data ingestion is not supported with relationship vector index" + ) + + # If the vector index doesn't exist yet + if not index_type: + store.create_new_node_index() + embedding_dimension, index_type, entity_label, entity_property = ( + store.retrieve_existing_node_index() + ) + + # If the index already exists, check if embedding dimensions match + elif ( + embedding_dimension and not store.embedding_dimension == embedding_dimension + ): + raise ValueError( + f"A Vector index for {entity_label} on {entity_property} exists" + "The provided embedding function and vector index " + "dimensions do not match.\n" + f"Embedding function dimension: {store.embedding_dimension}\n" + f"Vector index dimension: {embedding_dimension}" + ) + + if search_type == SearchType.HYBRID: + fts_node_label = store.retrieve_existing_fts_index() + # If the FTS index doesn't exist yet + if not 
+
+    @classmethod
+    def from_existing_index(
+        cls: Type[FalkorDBVector],
+        embedding: Embeddings,
+        node_label: str,
+        search_type: SearchType = DEFAULT_SEARCH_TYPE,
+        **kwargs: Any,
+    ) -> FalkorDBVector:
+        """
+        Get instance of an existing FalkorDB vector index. This method will
+        return the instance of the store without inserting any new
+        embeddings.
+        """
+
+        store = cls(
+            embedding=embedding,
+            node_label=node_label,
+            search_type=search_type,
+            **kwargs,
+        )
+
+        embedding_dimension, index_type, entity_label, entity_property = (
+            store.retrieve_existing_node_index()
+        )
+
+        # Raise error if relationship index type
+        if index_type == "RELATIONSHIP":
+            raise ValueError(
+                "Relationship vector index is not supported with "
+                "`from_existing_index` method. Please use the "
+                "`from_existing_relationship_index` method."
+            )
+
+        if not index_type:
+            raise ValueError(
+                f"The specified vector index node label `{node_label}` does not exist. "
+                "Make sure to check if you spelled the node label correctly"
+            )
+
+        # Check if embedding function and vector index dimensions match
+        if embedding_dimension and store.embedding_dimension != embedding_dimension:
+            raise ValueError(
+                "The provided embedding function and vector index "
+                "dimensions do not match.\n"
+                f"Embedding function dimension: {store.embedding_dimension}\n"
+                f"Vector index dimension: {embedding_dimension}"
+            )
+
+        if search_type == SearchType.HYBRID:
+            fts_node_label = store.retrieve_existing_fts_index()
+            # If the FTS index doesn't exist yet
+            if not fts_node_label:
+                raise ValueError(
+                    "The specified keyword index name does not exist. "
+                    "Make sure to check if you spelled it correctly"
+                )
+            else:  # Validate that FTS and Vector index use the same information
+                if fts_node_label != store.node_label:
+                    raise ValueError(
+                        "Vector and keyword index don't index the same node label"
+                    )
+
+        return store
+
+    @classmethod
+    def from_existing_relationship_index(
+        cls: Type[FalkorDBVector],
+        embedding: Embeddings,
+        relation_type: str,
+        search_type: SearchType = DEFAULT_SEARCH_TYPE,
+        **kwargs: Any,
+    ) -> FalkorDBVector:
+        """
+        Get instance of an existing FalkorDB relationship vector index.
+        This method will return the instance of the store without
+        inserting any new embeddings.
+        """
+        if search_type == SearchType.HYBRID:
+            raise ValueError(
+                "Hybrid search is not supported in combination "
+                "with relationship vector index"
+            )
+
+        store = cls(
+            embedding=embedding,
+            relation_type=relation_type,
+            **kwargs,
+        )
+
+        embedding_dimension, index_type, entity_label, entity_property = (
+            store.retrieve_existing_relationship_index()
+        )
+
+        if not index_type:
+            raise ValueError(
+                "The specified vector index on the relationship"
+                f" {relation_type} does not exist. "
+                "Make sure to check if you spelled it correctly"
+            )
+        # Raise error if not relationship index type
+        if index_type == "NODE":
+            raise ValueError(
+                "Node vector index is not supported with "
+                "`from_existing_relationship_index` method. Please use the "
+                "`from_existing_index` method."
+            )
+
+        # Check if embedding function and vector index dimensions match
+        if embedding_dimension and store.embedding_dimension != embedding_dimension:
+            raise ValueError(
+                "The provided embedding function and vector index "
+                "dimensions do not match.\n"
+                f"Embedding function dimension: {store.embedding_dimension}\n"
+                f"Vector index dimension: {embedding_dimension}"
+            )
+
+        return store
+
+    @classmethod
+    def from_existing_graph(
+        cls: Type[FalkorDBVector],
+        embedding: Embeddings,
+        database: str,
+        node_label: str,
+        embedding_node_property: str,
+        text_node_properties: List[str],
+        *,
+        search_type: SearchType = DEFAULT_SEARCH_TYPE,
+        retrieval_query: str = "",
+        **kwargs: Any,
+    ) -> FalkorDBVector:
+        """
+        Initialize and return a FalkorDBVector instance
+        from an existing graph using the database name.
+
+        This method initializes a FalkorDBVector instance
+        using the provided parameters and the existing graph.
+        It validates the existence of the indices and creates
+        new ones if they don't exist.
+
+        Args:
+            embedding: The `Embeddings` model you would like to use
+            database: The name of the existing graph/database you
+                would like to initialize
+            node_label: The label of the node you want to initialize
+            embedding_node_property: The name of the property you
+                want your embeddings to be stored in
+            text_node_properties: The node properties that hold the
+                text to be embedded
+
+        Returns:
+            FalkorDBVector: An instance of FalkorDBVector initialized
+            with the provided parameters and existing graph.
+
+        Example:
+            >>> falkordb_vector = FalkorDBVector.from_existing_graph(
+            ...     embedding=my_embedding,
+            ...     node_label="Document",
+            ...     embedding_node_property="embedding",
+            ...     text_node_properties=["title", "content"]
+            ... )
+
+        """
+        # Validate that database and text_node_properties are not empty
+        if not database:
+            raise ValueError("Parameter `database` must be given")
+        if not text_node_properties:
+            raise ValueError(
+                "Parameter `text_node_properties` must not be an empty list"
+            )
+
+        # Prefer retrieval query from params, otherwise construct it
+        if not retrieval_query:
+            retrieval_query = (
+                f"RETURN reduce(str='', k IN {text_node_properties} |"
+                " str + '\\n' + k + ': ' + coalesce(node[k], '')) AS text, "
+                "node {.*, `"
+                + embedding_node_property
+                + "`: Null, id: Null, "
+                + ", ".join([f"`{prop}`: Null" for prop in text_node_properties])
+                + "} AS metadata, score"
+            )
+
+        store = cls(
+            database=database,
+            embedding=embedding,
+            search_type=search_type,
+            retrieval_query=retrieval_query,
+            node_label=node_label,
+            embedding_node_property=embedding_node_property,
+            **kwargs,
+        )
+
+        embedding_dimension, index_type, entity_label, entity_property = (
+            store.retrieve_existing_node_index()
+        )
+
+        # Raise error if relationship index type
+        if index_type == "RELATIONSHIP":
+            raise ValueError(
+                "`from_existing_graph` method does not support "
+                "existing relationship vector index. "
+                "Please use `from_existing_relationship_index` method"
+            )
+
+        # If the vector index doesn't exist yet
+        if not index_type:
+            store.create_new_node_index(node_label=node_label)
+        # If the index already exists, check if embedding dimensions match
+        elif (
+            embedding_dimension and store.embedding_dimension != embedding_dimension
+        ):
+            raise ValueError(
+                f"Index on Node {store.node_label} already exists. "
+ "The provided embedding function and vector index " + "dimensions do not match.\n" + f"Embedding function dimension: {store.embedding_dimension}\n" + f"Vector index dimension: {embedding_dimension}" + ) + # FTS index for Hybrid search + if search_type == SearchType.HYBRID: + fts_node_label = store.retrieve_existing_fts_index() + # If the FTS index doesn't exist yet + if not fts_node_label: + store.create_new_keyword_index(text_node_properties) + else: # Validate that FTS and Vector index use the same information + if not fts_node_label == store.node_label: + raise ValueError( + "Vector and keyword index don't index the same node label" + ) + + # Populate embeddings + + while True: + fetch_query = ( + f"MATCH (n:`{node_label}`) " + f"WHERE n.`{embedding_node_property}` IS null " + "AND any(k IN $props WHERE n[k] IS NOT null) " + "RETURN id(n) AS id, " + "coalesce(n.text, '') AS text " + "LIMIT 1000" + ) + data = store._query(fetch_query, params={"props": text_node_properties}) + if not data: + break + text_embeddings = embedding.embed_documents([el[1] for el in data]) + + params = { + "data": [ + {"id": el[0], "embedding": embedding} + for el, embedding in zip(data, text_embeddings) + ] + } + + store._query( + "UNWIND $data AS row " + f"MATCH (n:`{node_label}`) " + "WHERE id(n) = row.id " + f"SET n.`{embedding_node_property}` = vecf32(row.embedding)" + "RETURN count(*)", + params=params, + ) + # If embedding calculation should be stopped + if len(data) < 1000: + break + return store + + @classmethod + def from_documents( + cls: Type[FalkorDBVector], + documents: List[Document], + embedding: Embeddings, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> FalkorDBVector: + """ + Return FalkorDBVector initialized from documents and embeddings. + """ + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + + return cls.from_texts( + texts=texts, + embedding=embedding, + distance_strategy=distance_strategy, + metadatas=metadatas, + ids=ids, + **kwargs, + ) + + @classmethod + def from_embeddings( + cls, + text_embeddings: List[Tuple[str, List[float]]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ids: Optional[List[str]] = None, + pre_delete_collection: bool = False, + **kwargs: Any, + ) -> FalkorDBVector: + """Construct FalkorDBVector wrapper from raw documents and pre- + generated embeddings. + + Return FalkorDBVector initialized from documents and embeddings. + + Example: + .. code-block:: python + + from langchain_community.vectorstores.falkordb_vector import ( + FalkorDBVector ) + from langchain_community.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + text_embeddings = embeddings.embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + vectorstore = FalkorDBVector.from_embeddings( + text_embedding_pairs, embeddings + ) + """ + texts = [t[0] for t in text_embeddings] + embeddings = [t[1] for t in text_embeddings] + + return cls.__from( + texts, + embeddings, + embedding, + metadatas=metadatas, + ids=ids, + distance_strategy=distance_strategy, + pre_delete_collection=pre_delete_collection, + **kwargs, + ) + + def similarity_search( + self, + query: str, + k: int = 4, + params: Dict[str, Any] = {}, + filter: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[Document]: + """Run similarity search with FalkorDBVector. 
+
+        Args:
+            query (str): Query text to search for.
+            k (int): Number of results to return. Defaults to 4.
+            params (Dict[str, Any]): The search params for the index type.
+                Defaults to empty dict.
+            filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
+                filter on metadata.
+                Defaults to None.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        embedding = self.embedding.embed_query(text=query)
+        return self.similarity_search_by_vector(
+            embedding=embedding,
+            k=k,
+            query=query,
+            params=params,
+            filter=filter,
+            **kwargs,
+        )
+
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[Dict[str, Any]] = None,
+        params: Dict[str, Any] = {},
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
+                filter on metadata.
+                Defaults to None.
+            params (Dict[str, Any]): The search params for the index type.
+                Defaults to empty dict.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+        docs_and_scores = self.similarity_search_with_score_by_vector(
+            embedding=embedding, k=k, filter=filter, params=params, **kwargs
+        )
+        return [doc for doc, _ in docs_and_scores]
+
+    def similarity_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        params: Dict[str, Any] = {},
+        filter: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Perform a similarity search in the FalkorDB database using a
+        given vector and return the top k similar documents with their scores.
+
+        This method uses a Cypher query to find the top k documents that
+        are most similar to a given embedding. The similarity is measured
+        using a vector index in the FalkorDB database. The results are returned
+        as a list of tuples, each containing a Document object and its
+        similarity score.
+
+        Args:
+            embedding (List[float]): The embedding vector to compare against.
+            k (int, optional): The number of top similar documents to retrieve.
+                Defaults to 4.
+            filter (Optional[Dict[str, Any]]): Dictionary of argument(s) to
+                filter on metadata.
+                Defaults to None.
+            params (Dict[str, Any]): The search params for the index type.
+                Defaults to empty dict.
+
+        Returns:
+            List[Tuple[Document, float]]: A list of tuples, each containing
+            a Document object and its similarity score.
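+
+        Example (a hedged sketch; ``store`` and ``embeddings`` stand in for an
+        already-initialized FalkorDBVector and its embedding model):
+            .. code-block:: python
+
+                query = "What is FalkorDB?"
+                vector = embeddings.embed_query(query)
+                # `query` is forwarded through kwargs so hybrid search can use it
+                docs_and_scores = store.similarity_search_with_score_by_vector(
+                    embedding=vector, k=4, query=query
+                )
+                for doc, score in docs_and_scores:
+                    print(score, doc.page_content)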
+ """ + if filter: + if self.search_type == SearchType.HYBRID: + raise ValueError( + "Metadata filtering can't be use in combination with " + "a hybrid search approach" + ) + + base_index_query = ( + f"MATCH (n:{self.node_label}) WHERE " + f"n.{self.embedding_node_property} IS NOT NULL AND " + ) + + base_cosine_query = ( + " WITH n as node, " + f" vec.cosineDistance(n.{self.embedding_node_property}" + ", vecf32($embedding)) as score " + ) + + filter_snippets, filter_params = construct_metadata_filter(filter) + + index_query = base_index_query + filter_snippets + base_cosine_query + else: + index_query = _get_search_index_query(self.search_type, self._index_type) + filter_params = {} + + if self._index_type == IndexType.RELATIONSHIP: + if kwargs.get("return_embeddings"): + if self.metadata: + # Construct the metadata part based on self.metadata + metadata_fields = ", ".join( + f"`{key}`: relationship.{key}" for key in self.metadata + ) + default_retrieval = ( + f"RETURN relationship.{self.text_node_property} " + "AS text, score, " + f"{{text: relationship.{self.text_node_property}, " + f"embedding: relationship.{self.embedding_node_property}, " + f"id: relationship.id, source: relationship.source, " + f"{metadata_fields}}} AS metadata" + ) + else: + default_retrieval = ( + f"RETURN relationship.{self.text_node_property}" + " AS text, score, " + f"{{text: relationship.{self.text_node_property}, " + f"embedding: relationship.{self.embedding_node_property}, " + f"id: relationship.id, source: relationship.source}}" + " AS metadata" + ) + else: + if self.metadata: + # Construct the metadata part based on self.metadata + metadata_fields = ", ".join( + f"`{key}`: relationship.{key}" for key in self.metadata + ) + default_retrieval = ( + f"RETURN relationship.{self.text_node_property} " + "AS text, score, " + f"{{text: relationship.{self.text_node_property}, " + f"id: relationship.id, source: relationship.source, " + f"{metadata_fields}}} AS metadata" + ) + else: + default_retrieval = ( + f"RETURN relationship.{self.text_node_property}" + " AS text, score, " + f"{{text: relationship.{self.text_node_property}, " + f"id: relationship.id, source: relationship.source}}" + " AS metadata" + ) + else: + if kwargs.get("return_embeddings"): + if self.metadata: + # Construct the metadata part based on self.metadata + metadata_fields = ", ".join( + f"`{key}`: node.`{key}`" for key in self.metadata + ) + default_retrieval = ( + f"RETURN node.{self.text_node_property} AS text, score, " + f"{{text: node.{self.text_node_property}, " + f"embedding: node.{self.embedding_node_property}, " + f"id: node.id, source: node.source, " + f"{metadata_fields}}} AS metadata" + ) + else: + default_retrieval = ( + f"RETURN node.{self.text_node_property} AS text, score, " + f"{{text: node.{self.text_node_property}, " + f"embedding: node.{self.embedding_node_property}, " + f"id: node.id, source: node.source}} AS metadata" + ) + else: + if self.metadata: + # Construct the metadata part based on self.metadata + metadata_fields = ", ".join( + f"`{key}`: node.`{key}`" for key in self.metadata + ) + default_retrieval = ( + f"RETURN node.{self.text_node_property} AS text, score, " + f"{{text: node.{self.text_node_property}, " + f"id: node.id, source: node.source, " + f"{metadata_fields}}} AS metadata" + ) + else: + default_retrieval = ( + f"RETURN node.{self.text_node_property} AS text, score, " + f"{{text: node.{self.text_node_property}, " + f"id: node.id, source: node.source}} AS metadata" + ) + + retrieval_query = ( + 
+            self.retrieval_query if self.retrieval_query else default_retrieval
+        )
+
+        read_query = index_query + retrieval_query
+        parameters = {
+            "entity_property": self.embedding_node_property,
+            "k": k,
+            "embedding": embedding,
+            # `query` is only supplied by the text-based entry points; use
+            # .get() so vector-only calls don't raise a KeyError.
+            "query": kwargs.get("query"),
+            **params,
+            **filter_params,
+        }
+        if self._index_type == "NODE":
+            parameters["entity_label"] = self.node_label
+        elif self._index_type == "RELATIONSHIP":
+            parameters["entity_label"] = self.relation_type
+
+        results = self._query(read_query, params=parameters)
+
+        if not results:
+            if not self.retrieval_query:
+                raise ValueError(
+                    f"Make sure that none of the `{self.text_node_property}` "
+                    f"properties on nodes with label `{self.node_label}` "
+                    "are missing or empty"
+                )
+            else:
+                raise ValueError(
+                    "Inspect the `retrieval_query` and ensure it doesn't "
+                    "return None for the `text` column"
+                )
+        elif any(result[0] is None for result in results):
+            if not self.retrieval_query:
+                raise ValueError(
+                    f"Make sure that none of the `{self.text_node_property}` "
+                    f"properties on nodes with label `{self.node_label}` "
+                    "are missing or empty"
+                )
+            else:
+                raise ValueError(
+                    "Inspect the `retrieval_query` and ensure it doesn't "
+                    "return None for the `text` column"
+                )
+
+        # Check if embeddings are missing when they are expected
+        if kwargs.get("return_embeddings") and any(
+            result[2]["embedding"] is None for result in results
+        ):
+            if not self.retrieval_query:
+                raise ValueError(
+                    f"Make sure that none of the `{self.embedding_node_property}` "
+                    f"properties on nodes with label `{self.node_label}` "
+                    "are missing or empty"
+                )
+            else:
+                raise ValueError(
+                    "Inspect the `retrieval_query` and ensure it doesn't "
+                    "return None for the `embedding` metadata column"
+                )
+
+        try:
+            docs = [
+                (
+                    Document(
+                        # Use the first element for text
+                        page_content=result[0],
+                        metadata={
+                            k: v for k, v in result[2].items() if v is not None
+                        },  # Use the third element for metadata
+                    ),
+                    result[1],  # Use the second element for score
+                )
+                for result in results
+            ]
+        except AttributeError:
+            try:
+                sorted_results = sorted(results, key=lambda r: r[2], reverse=True)
+                docs = [
+                    (
+                        Document(
+                            # Use the first element for text
+                            page_content=result[0],
+                            metadata={
+                                k: v for k, v in result[1].items() if v is not None
+                            },  # Use the second element as metadata
+                        ),
+                        result[2],  # Use the third element for score
+                    )
+                    for result in sorted_results
+                ]
+            except Exception as e:
+                raise ValueError(f"An error occurred: {e}")
+
+        return docs
+
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = 4,
+        params: Dict[str, Any] = {},
+        filter: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            params (Dict[str, Any]): The search params
+                for the index type. Defaults to empty dict.
+            filter (Optional[Dict[str, Any]]): Dictionary of
+                argument(s) to filter on metadata. Defaults
+                to None.
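+
+        Example (a minimal sketch; ``store`` is assumed to be an
+        already-initialized FalkorDBVector instance):
+            .. code-block:: python
+
+                for doc, score in store.similarity_search_with_score("foo", k=3):
+                    print(round(score, 3), doc.page_content)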
+
+        Returns:
+            List of Documents most similar to the query and score for each.
+        """
+        embedding = self.embedding.embed_query(query)
+        docs = self.similarity_search_with_score_by_vector(
+            embedding=embedding,
+            k=k,
+            query=query,
+            params=params,
+            filter=filter,
+            **kwargs,
+        )
+        return docs
+
+    def similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        filter: Optional[Dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        docs_with_scores = self.similarity_search_with_score(
+            query=query, k=k, filter=filter, **kwargs
+        )
+
+        return docs_with_scores
+
+    def max_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        filter: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: search query text.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            filter: Filter on metadata properties, e.g.
+                {
+                    "str_property": "foo",
+                    "int_property": 123
+                }
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        # Embed the query
+        query_embedding = self.embedding.embed_query(query)
+
+        # Fetch the initial documents
+        got_docs = self.similarity_search_with_score_by_vector(
+            embedding=query_embedding,
+            query=query,
+            k=fetch_k,
+            return_embeddings=True,
+            filter=filter,
+            **kwargs,
+        )
+
+        got_embeddings = [doc.metadata["embedding"] for doc, _ in got_docs]
+
+        # Select documents using maximal marginal relevance
+        selected_indices = maximal_marginal_relevance(
+            np.array(query_embedding), got_embeddings, lambda_mult=lambda_mult, k=k
+        )
+        selected_docs = [got_docs[i][0] for i in selected_indices]
+
+        # Remove embedding values from metadata
+        for doc in selected_docs:
+            del doc.metadata["embedding"]
+
+        return selected_docs
+
+    def _select_relevance_score_fn(self) -> Callable[[float], float]:
+        """
+        The 'correct' relevance function
+        may differ depending on a few things, including:
+        - the distance / similarity metric used by the VectorStore
+        - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
+        - embedding dimensionality
+        - etc.
+        """
+        if self.override_relevance_score_fn is not None:
+            return self.override_relevance_score_fn
+
+        # Default strategy is to rely on the distance strategy provided
+        # in the vectorstore constructor
+        if self._distance_strategy == DistanceStrategy.COSINE:
+            return lambda x: x
+        elif self._distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
+            return lambda x: x
+        else:
+            raise ValueError(
+                "No supported normalization function"
+                f" for distance_strategy of {self._distance_strategy}. "
+                "Consider providing relevance_score_fn to the "
+                "FalkorDBVector constructor."
+            )
+
+    def update_documents(
+        self,
+        document_id: str,
+        document: Document,
+    ) -> None:
+        """
+        Update an existing document in
+        the store based on its document_id.
+
+        Args:
+            document_id: The id of the document to be updated.
+            document: The new Document instance with the
+                updated content.
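+
+        Example (a minimal sketch; the id and metadata values are purely
+        illustrative):
+            .. code-block:: python
+
+                from langchain_core.documents import Document
+
+                store.update_documents(
+                    document_id="doc-42",
+                    document=Document(
+                        page_content="Updated text",
+                        metadata={"source": "manual-edit"},
+                    ),
+                )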
+
+        Returns:
+            None
+        """
+
+        # Ensure the document_id exists in the store
+        existing_document = self._query(
+            """
+            MATCH (n)
+            WHERE n.id = $document_id
+            RETURN n
+            """,
+            params={"document_id": document_id},
+        )
+
+        if not existing_document:
+            raise ValueError(f"Document with id {document_id} not found in the store.")
+
+        # Update the document's text content
+        self._query(
+            """
+            MATCH (n)
+            WHERE n.id = $document_id
+            SET n.text = $new_content
+            """,
+            params={"document_id": document_id, "new_content": document.page_content},
+        )
+
+        # Optionally, update any other properties like metadata
+        if document.metadata:
+            for key, value in document.metadata.items():
+                self._query(
+                    f"""
+                    MATCH (n)
+                    WHERE n.id = $document_id
+                    SET n.{key} = $value
+                    """,
+                    params={"document_id": document_id, "value": value},
+                )
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,  # `ids` is optional to match the base class
+        **kwargs: Any,
+    ) -> Optional[bool]:  # Return type matches the superclass signature
+        """
+        Delete documents from the store by their ids.
+
+        Args:
+            ids: A list of IDs of the documents to be deleted.
+                Must be provided; a ValueError is raised if it is None.
+        Returns:
+            Optional[bool]: True if the documents were deleted.
+        """
+        if ids is None:
+            raise ValueError("You must provide at least one ID to delete.")
+        for item_id in ids:
+            # Ensure the document exists in the store
+            existing_document = self._query(
+                """
+                MATCH (n)
+                WHERE n.id = $item_id
+                RETURN n
+                """,
+                params={"item_id": item_id},
+            )
+            if not existing_document:
+                raise ValueError(f"Document with id {item_id} not found in the store.")
+            # Delete the document node from the store
+            self._query(
+                """
+                MATCH (n)
+                WHERE n.id = $item_id
+                DELETE n
+                """,
+                params={"item_id": item_id},
+            )
+        return True
diff --git a/libs/community/tests/integration_tests/vectorstores/test_falkordb_vector_integration.py b/libs/community/tests/integration_tests/vectorstores/test_falkordb_vector_integration.py
new file mode 100644
index 0000000000000..9220033f01b35
--- /dev/null
+++ b/libs/community/tests/integration_tests/vectorstores/test_falkordb_vector_integration.py
@@ -0,0 +1,671 @@
+"""
+Integration tests for FalkorDB vector store functionality.
+
+These tests validate the end-to-end process of constructing, indexing,
+and searching vector embeddings in a FalkorDB instance. They include:
+- Setting up the FalkorDB vector store with a local instance.
+- Indexing documents with fake embeddings.
+- Performing vector searches and validating results.
+
+Note:
+These tests are conducted using a local FalkorDB instance but can also
+be run against a Cloud FalkorDB instance. Ensure that appropriate host
+and port configurations are set up before running the tests.
+"""
+
+import os
+from math import isclose
+from typing import Any, Dict, List
+
+from dotenv import load_dotenv
+from langchain_core.documents import Document
+
+from langchain_community.vectorstores.falkordb_vector import (
+    FalkorDBVector,
+    SearchType,
+    process_index_data,
+)
+from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+
+# Load environment variables from .env file
+load_dotenv()
+
+host = os.getenv("FALKORDB_HOST", "localhost")
+port = int(os.getenv("FALKORDB_PORT", 6379))
+
+OS_TOKEN_COUNT = 1535
+
+texts = ["foo", "bar", "baz", "It is the end of the world. Take shelter!"]
Take shelter!"] + + +def drop_vector_indexes(store: FalkorDBVector) -> None: + """Cleanup all vector indexes""" + index_entity_labels: List[Any] = [] + index_entity_properties: List[Any] = [] + index_entity_types: List[Any] = [] + + # get all indexes + result = store._query( + """ + CALL db.indexes() + """ + ) + processed_result: List[Dict[str, Any]] = process_index_data(result) + + # get all vector indexs entity labels, entity properties, entity_types + if isinstance(processed_result, list): + for index in processed_result: + if isinstance(index, dict): + if index.get("index_type") == "VECTOR": + index_entity_labels.append(index["entity_label"]) + index_entity_properties.append(index["entity_property"]) + index_entity_types.append(index["entity_type"]) + + # drop vector indexs + for entity_label, entity_property, entity_type in zip( + index_entity_labels, index_entity_properties, index_entity_types + ): + if entity_type == "NODE": + store._database.drop_node_vector_index( + label=entity_label, + attribute=entity_property, + ) + elif entity_type == "RELATIONSHIP": + store._database.drop_edge_vector_index( + label=entity_label, + attribute=entity_property, + ) + + +class FakeEmbeddingsWithOsDimension(FakeEmbeddings): + """Fake embeddings functionality for testing.""" + + def embed_documents(self, embedding_texts: List[str]) -> List[List[float]]: + """Return simple embeddings.""" + return [ + [float(1.0)] * (OS_TOKEN_COUNT - 1) + [float(i + 1)] + for i in range(len(embedding_texts)) + ] + + def embed_query(self, text: str) -> List[float]: + """Return simple embeddings.""" + return [float(1.0)] * (OS_TOKEN_COUNT - 1) + [float(texts.index(text) + 1)] + + +def test_falkordbvector() -> None: + """Test end to end construction and search.""" + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert type(output) is list + assert type(output[0]) is Document + assert output[0].page_content == "foo" + + drop_vector_indexes(docsearch) + + +def test_falkordbvector_embeddings() -> None: + """Test end to end construction with embeddings and search.""" + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert type(output) is list + assert type(output[0]) is Document + assert output[0].page_content == "foo" + + drop_vector_indexes(docsearch) + + +def test_falkordbvector_catch_wrong_node_label() -> None: + """Test if node label is misspelled, but index name is correct.""" + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + ) + try: + FalkorDBVector.from_existing_index( + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + node_label="test", + ) + except Exception as e: + assert type(e) is ValueError + assert str(e) == ( + "The specified vector index node label " + + "`test` does not exist. 
Make sure to" + + " check if you spelled the node label correctly" + ) + drop_vector_indexes(docsearch) + + +def test_falkordbvector_with_metadatas() -> None: + """Test end to end construction and search.""" + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + metadatas=metadatas, + host=host, + port=port, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert type(output) is list + assert type(output[0]) is Document + assert output[0].metadata.get("page") == "0" + + drop_vector_indexes(docsearch) + + +def test_falkordbvector_with_metadatas_with_scores() -> None: + """Test end to end construction and search.""" + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + metadatas=metadatas, + host=host, + port=port, + pre_delete_collection=True, + ) + output = [ + (doc, round(score, 1)) + for doc, score in docsearch.similarity_search_with_score("foo", k=1) + ] + assert output == [ + ( + Document( + metadata={ + "text": "foo", + "id": "acbd18db4cc2f85cedef654fccc4a4d8", + "page": "0", + }, + page_content="foo", + ), + 0.0, + ) + ] + drop_vector_indexes(docsearch) + + +def test_falkordb_relevance_score() -> None: + """Test to make sure the relevance score is scaled to 0-2.""" + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + metadatas=metadatas, + host=host, + port=port, + pre_delete_collection=True, + ) + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + expected_output = [ + ( + Document( + metadata={ + "text": "foo", + "id": "acbd18db4cc2f85cedef654fccc4a4d8", + "page": "0", + }, + page_content="foo", + ), + 0.0, + ), + ( + Document( + metadata={ + "text": "bar", + "id": "37b51d194a7513e45b56f6524f2d51f2", + "page": "1", + }, + page_content="bar", + ), + 1.0, + ), + ( + Document( + metadata={ + "text": "baz", + "id": "73feffa4b7f6bb68e44cf984c85f6e88", + "page": "2", + }, + page_content="baz", + ), + 2.0, + ), + ] + + # Check if the length of the outputs matches + assert len(output) == len(expected_output) + + # Check if each document and its relevance score is close to the expected value + for (doc, score), (expected_doc, expected_score) in zip(output, expected_output): + assert doc.page_content == expected_doc.page_content + assert doc.metadata == expected_doc.metadata + assert isclose(score, expected_score, rel_tol=1e-5) + + drop_vector_indexes(docsearch) + + +def test_falkordbvector_retriever_search_threshold() -> None: + """Test using retriever for searching with threshold.""" + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=FakeEmbeddingsWithOsDimension(), + metadatas=metadatas, + host=host, + port=port, + pre_delete_collection=True, + ) + retriever = docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={"k": 1, "score_threshold": 0.9999}, + ) + output = retriever.invoke("foo") + assert output == [ + Document( + metadata={ + "text": "foo", + "id": "acbd18db4cc2f85cedef654fccc4a4d8", + "page": "0", + }, + page_content="foo", + ) + ] + + drop_vector_indexes(docsearch) + + +def test_custom_return_falkordbvector() -> None: + """Test end to end construction and search.""" + docsearch = 
FalkorDBVector.from_texts( + texts=["test"], + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + retrieval_query="RETURN 'foo' AS text, score, {test: 'test'} AS metadata", + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"test": "test"})] + + drop_vector_indexes(docsearch) + + +def test_falkordb_hybrid() -> None: + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + search_type=SearchType.HYBRID, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [ + Document( + metadata={"text": "foo", "id": "acbd18db4cc2f85cedef654fccc4a4d8"}, + page_content="foo", + ) + ] + + drop_vector_indexes(docsearch) + + +def test_falkordb_hybrid_deduplicate() -> None: + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + search_type=SearchType.HYBRID, + ) + output = docsearch.similarity_search("foo", k=3) + assert output == [ + Document( + metadata={"text": "baz", "id": "73feffa4b7f6bb68e44cf984c85f6e88"}, + page_content="baz", + ), + Document( + metadata={"text": "foo", "id": "acbd18db4cc2f85cedef654fccc4a4d8"}, + page_content="foo", + ), + Document( + metadata={"text": "bar", "id": "37b51d194a7513e45b56f6524f2d51f2"}, + page_content="bar", + ), + ] + + drop_vector_indexes(docsearch) + + +def test_falkordb_hybrid_retrieval_query() -> None: + """Test custom retrieval_query with hybrid search.""" + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + search_type=SearchType.HYBRID, + retrieval_query="RETURN 'moo' AS text, score, {test: 'test'} AS metadata", + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="moo", metadata={"test": "test"})] + drop_vector_indexes(docsearch) + + +def test_falkordbvector_missing_keyword() -> None: + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + node_label = "vector" + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + ) + try: + FalkorDBVector.from_existing_index( + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + node_label=node_label, + search_type=SearchType.HYBRID, + ) + except Exception as e: + assert str(e) == ( + "The specified vector index node label " + + f"`{node_label}` does not exist. 
Make sure" + + " to check if you spelled the node label correctly" + ) + + drop_vector_indexes(docsearch) + + +def test_falkordb_hybrid_from_existing() -> None: + """Test hybrid search with missing keyword_index_search.""" + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + search_type=SearchType.HYBRID, + ) + existing = FalkorDBVector.from_existing_index( + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + node_label="Chunk", # default node label + search_type=SearchType.HYBRID, + ) + output = existing.similarity_search("foo", k=1) + assert output == [ + Document( + metadata={"text": "foo", "id": "acbd18db4cc2f85cedef654fccc4a4d8"}, + page_content="foo", + ) + ] + + drop_vector_indexes(existing) + drop_vector_indexes(docsearch) + + +def test_falkordbvector_from_existing_graph() -> None: + """Test from_existing_graph with a single property""" + graph = FalkorDBVector.from_texts( + texts=["test"], + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + node_label="Foo", + embedding_node_property="vector", + text_node_property="info", + pre_delete_collection=True, + ) + graph._query("MATCH (n) DELETE n") + graph._query("CREATE (:Test {name:'Foo'}), (:Test {name:'Bar'})") + assert graph.database_name, "Database name cannot be empty or None" + existing = FalkorDBVector.from_existing_graph( + embedding=FakeEmbeddingsWithOsDimension(), + database=graph.database_name, + host=host, + port=port, + node_label="Test", + text_node_properties=["name"], + embedding_node_property="embedding", + ) + + output = existing.similarity_search("foo", k=2) + + assert [output[0]] == [Document(page_content="\nname: Foo")] + + drop_vector_indexes(existing) + + +def test_falkordb_from_existing_graph_mulitiple_properties() -> None: + """Test from_existing_graph with two properties.""" + graph = FalkorDBVector.from_texts( + texts=["test"], + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + node_label="Foo", + embedding_node_property="vector", + text_node_property="info", + pre_delete_collection=True, + ) + graph._query("MATCH (n) DELETE n") + graph._query("CREATE (:Test {name:'Foo', name2: 'Fooz'}), (:Test {name:'Bar'})") + assert graph.database_name, "Database name cannot be empty or None" + existing = FalkorDBVector.from_existing_graph( + embedding=FakeEmbeddingsWithOsDimension(), + database=graph.database_name, + host=host, + port=port, + node_label="Test", + text_node_properties=["name", "name2"], + embedding_node_property="embedding", + ) + + output = existing.similarity_search("foo", k=2) + assert [output[0]] == [Document(page_content="\nname: Foo\nname2: Fooz")] + + drop_vector_indexes(existing) + drop_vector_indexes(graph) + + +def test_falkordbvector_special_character() -> None: + """Test removing lucene.""" + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + pre_delete_collection=True, + search_type=SearchType.HYBRID, + ) + output = docsearch.similarity_search( + "It is the end of the world. 
Take shelter!", k=1 + ) + + assert output == [ + Document( + metadata={ + "text": "It is the end of the world. Take shelter!", + "id": "84768c9c477cbe05fbafbe7247990051", + }, + page_content="It is the end of the world. Take shelter!", + ) + ] + drop_vector_indexes(docsearch) + + +def test_falkordb_from_existing_graph_mulitiple_properties_hybrid() -> None: + """Test from_existing_graph with a two property.""" + graph = FalkorDBVector.from_texts( + texts=["test"], + embedding=FakeEmbeddingsWithOsDimension(), + host=host, + port=port, + node_label="Foo", + embedding_node_property="vector", + text_node_property="info", + pre_delete_collection=True, + ) + + graph._query("MATCH (n) DELETE n") + graph._query("CREATE (:Test {name:'Foo', name2: 'Fooz'}), (:Test {name:'Bar'})") + assert graph.database_name, "Database name cannot be empty or None" + existing = FalkorDBVector.from_existing_graph( + embedding=FakeEmbeddingsWithOsDimension(), + database=graph.database_name, + host=host, + port=port, + node_label="Test", + text_node_properties=["name", "name2"], + embedding_node_property="embedding", + search_type=SearchType.HYBRID, + ) + + output = existing.similarity_search("foo", k=2) + + assert [output[0]] == [Document(page_content="\nname: Foo\nname2: Fooz")] + + drop_vector_indexes(existing) + + +def test_index_fetching() -> None: + """testing correct index creation and fetching""" + text_embeddings = FakeEmbeddingsWithOsDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + embeddings = FakeEmbeddingsWithOsDimension() + + def create_store(node_label: str, text_properties: List[str]) -> FalkorDBVector: + return FalkorDBVector.from_embeddings( + text_embeddings=text_embedding_pairs, + embedding=FakeEmbeddingsWithOsDimension(), + node_label=node_label, + host=host, + port=port, + pre_delete_collection=True, + ) + + def fetch_store(node_label: str) -> FalkorDBVector: + store = FalkorDBVector.from_existing_index( + embedding=embeddings, + host=host, + port=port, + node_label=node_label, + ) + return store + + index_0_str = "label0" + create_store(index_0_str, ["text"]) + + # create index 1 + index_1_str = "label1" + create_store("label1", ["text"]) + + index_1_store = fetch_store(index_1_str) + assert index_1_store.node_label == index_1_str + + index_0_store = fetch_store(index_0_str) + assert index_0_store.node_label == index_0_str + + drop_vector_indexes(index_1_store) + drop_vector_indexes(index_0_store) + + +def test_retrieval_params() -> None: + """Test if we use parameters in retrieval query""" + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + pre_delete_collection=True, + retrieval_query=""" + RETURN $test as text, score, {test: $test1} AS metadata + """, + ) + output = docsearch.similarity_search( + "Foo", k=2, params={"test": "test", "test1": "test1"} + ) + assert output == [ + Document(page_content="test", metadata={"test": "test1"}), + Document(page_content="test", metadata={"test": "test1"}), + ] + drop_vector_indexes(docsearch) + + +def test_falkordb_relationship_index() -> None: + """Test end to end construction and search.""" + embeddings = FakeEmbeddingsWithOsDimension() + docsearch = FalkorDBVector.from_texts( + texts=texts, + embedding=embeddings, + host=host, + port=port, + pre_delete_collection=True, + ) + # Ingest data + docsearch._query( + ( + "MERGE (p1:Person)" + "MERGE (p2:Person)" + "MERGE (p3:Person)" + "MERGE (p4:Person)" + "MERGE (p1)-[:REL {text: 'foo', embedding: vecf32($e1)}]->(p2)" + "MERGE 
(p3)-[:REL {text: 'far', embedding: vecf32($e2)}]->(p4)" + ), + params={ + "e1": embeddings.embed_query("foo"), + "e2": embeddings.embed_query("bar"), + }, + ) + # Create relationship index + docsearch.create_new_index_on_relationship( + relation_type="REL", + embedding_node_property="embedding", + embedding_dimension=OS_TOKEN_COUNT, + ) + relationship_index = FalkorDBVector.from_existing_relationship_index( + embeddings, relation_type="REL" + ) + output = relationship_index.similarity_search("foo", k=1) + assert output == [Document(metadata={"text": "foo"}, page_content="foo")] + + drop_vector_indexes(docsearch) + drop_vector_indexes(relationship_index) diff --git a/libs/community/tests/unit_tests/vectorstores/test_falkordb_vector_utils.py b/libs/community/tests/unit_tests/vectorstores/test_falkordb_vector_utils.py new file mode 100644 index 0000000000000..bbdcb60f52edf --- /dev/null +++ b/libs/community/tests/unit_tests/vectorstores/test_falkordb_vector_utils.py @@ -0,0 +1,24 @@ +"""Test utils function in falkordb_vector.py""" + +from langchain_community.vectorstores.falkordb_vector import ( + dict_to_yaml_str, +) + + +def test_converting_to_yaml() -> None: + example_dict = { + "name": "John Doe", + "age": 30, + "skills": ["Python", "Data Analysis", "Machine Learning"], + "location": {"city": "Ljubljana", "country": "Slovenia"}, + } + + yaml_str = dict_to_yaml_str(example_dict) + + expected_output = ( + "name: John Doe\nage: 30\nskills:\n- Python\n- " + "Data Analysis\n- Machine Learning\nlocation:\n city: Ljubljana\n" + " country: Slovenia\n" + ) + + assert yaml_str == expected_output
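+
+
+# A small follow-on sketch (an addition, not part of the original assertions):
+# it only checks loose properties of dict_to_yaml_str's output, since the
+# exact indentation rules are inferred from the expected string above.
+def test_yaml_str_renders_all_top_level_keys() -> None:
+    example_dict = {"a": 1, "b": ["x", "y"], "c": {"d": "e"}}
+
+    yaml_str = dict_to_yaml_str(example_dict)
+
+    # Every top-level key should appear in the output followed by a colon.
+    for key in example_dict:
+        assert f"{key}:" in yaml_str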