diff --git a/docs/docs/integrations/vectorstores/couchbase.ipynb b/docs/docs/integrations/vectorstores/couchbase.ipynb new file mode 100644 index 0000000000000..2f379951d81c0 --- /dev/null +++ b/docs/docs/integrations/vectorstores/couchbase.ipynb @@ -0,0 +1,787 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f63dfcf9-fd9d-4ac1-a0b3-c02d4dce7faf", + "metadata": {}, + "source": [ + "# Couchbase \n", + "[Couchbase](http://couchbase.com/) is an award-winning distributed NoSQL cloud database that delivers unmatched versatility, performance, scalability, and financial value for all of your cloud, mobile, AI, and edge computing applications. Couchbase embraces AI with coding assistance for developers and vector search for their applications.\n", + "\n", + "Vector Search is a part of the [Full Text Search Service](https://docs.couchbase.com/server/current/learn/services-and-indexes/services/search-service.html) (Search Service) in Couchbase.\n", + "\n", + "This tutorial explains how to use Vector Search in Couchbase. You can work with both [Couchbase Capella](https://www.couchbase.com/products/capella/) and your self-managed Couchbase Server." 
+ ] + }, + { + "cell_type": "markdown", + "id": "43326be4-4433-4de2-ad42-6eb91a722bad", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec8d532-fec7-4dc7-9be3-020aa7bdb01f", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet langchain langchain-openai couchbase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a972cbc-bf59-46eb-9b50-e5dc3a69dcf0", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "markdown", + "id": "acf1b168-622f-465c-a9a5-d27a6d7e7a8f", + "metadata": {}, + "source": [ + "## Import the Vector Store and Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "23ce45ab-bfd2-42e1-b681-514a550f0232", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.vectorstores import CouchbaseVectorStore\n", + "from langchain_openai import OpenAIEmbeddings" + ] + }, + { + "cell_type": "markdown", + "id": "3144ba02-1eaa-4449-853e-f034ca5706bf", + "metadata": {}, + "source": [ + "## Create Couchbase Connection Object\n", + "We create a connection to the Couchbase cluster initially and then pass the cluster object to the Vector Store. \n", + "\n", + "Here, we are connecting using the username and password. You can also connect using any other supported way to your cluster. \n", + "\n", + "For more information on connecting to the Couchbase cluster, please check the [Python SDK documentation](https://docs.couchbase.com/python-sdk/current/hello-world/start-using-sdk.html#connect)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "52fe583a-12db-4dc2-9281-1174bf1d4e5c", + "metadata": {}, + "outputs": [], + "source": [ + "COUCHBASE_CONNECTION_STRING = (\n", + " \"couchbase://localhost\" # or \"couchbases://localhost\" if using TLS\n", + ")\n", + "DB_USERNAME = \"Administrator\"\n", + "DB_PASSWORD = \"Password\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9986c6b9", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "\n", + "from couchbase.auth import PasswordAuthenticator\n", + "from couchbase.cluster import Cluster\n", + "from couchbase.options import ClusterOptions\n", + "\n", + "auth = PasswordAuthenticator(DB_USERNAME, DB_PASSWORD)\n", + "options = ClusterOptions(auth)\n", + "cluster = Cluster(COUCHBASE_CONNECTION_STRING, options)\n", + "\n", + "# Wait until the cluster is ready for use.\n", + "cluster.wait_until_ready(timedelta(seconds=5))" + ] + }, + { + "cell_type": "markdown", + "id": "90c5dec9-f6cb-41eb-9f30-13cab7b107db", + "metadata": {}, + "source": [ + "We will now set the bucket, scope, and collection names in the Couchbase cluster that we want to use for Vector Search. \n", + "\n", + "For this example, we are using the default scope & collections." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1b1d0a26-e9d4-4823-9800-9549d24d3d16", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET_NAME = \"testing\"\n", + "SCOPE_NAME = \"_default\"\n", + "COLLECTION_NAME = \"_default\"\n", + "SEARCH_INDEX_NAME = \"vector-index\"" + ] + }, + { + "cell_type": "markdown", + "id": "efbac6ff-c2ac-4443-9250-7cc88061346b", + "metadata": {}, + "source": [ + "For this tutorial, we will use OpenAI embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "87625579-86d7-4de4-8a4d-cee674a6b676", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "3677b4b0-3711-419c-89ff-32ef4d3e3022", + "metadata": {}, + "source": [ + "## Create the Search Index\n", + "Currently, the Search index needs to be created from the Couchbase Capella or Server UI or using the REST interface. \n", + "\n", + "Let us define a Search index with the name `vector-index` on the testing bucket\n", + "\n", + "For this example, let us use the Import Index feature on the Search Service on the UI. \n", + "\n", + "We are defining an index on the `testing` bucket's `_default` scope on the `_default` collection with the vector field set to `embedding` with 1536 dimensions and the text field set to `text`. We are also indexing and storing all the fields under `metadata` in the document as a dynamic mapping to account for varying document structures. The similarity metric is set to `dot_product`." 
+ ] + }, + { + "cell_type": "markdown", + "id": "655117ae-9b1f-4139-b437-ca7685975a54", + "metadata": {}, + "source": [ + "### How to Import an Index to the Full Text Search service?\n", + " - [Couchbase Server](https://docs.couchbase.com/server/current/search/import-search-index.html)\n", + " - Click on Search -> Add Index -> Import\n", + " - Copy the following Index definition in the Import screen\n", + " - Click on Create Index to create the index.\n", + " - [Couchbase Capella](https://docs.couchbase.com/cloud/search/import-search-index.html)\n", + " - Copy the index definition to a new file `index.json`\n", + " - Import the file in Capella using the instructions in the documentation.\n", + " - Click on Create Index to create the index.\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "id": "f85bc468-d9b8-487d-999a-3b5d2fb78e41", + "metadata": {}, + "source": [ + "### Index Definition\n", + "```\n", + "{\n", + " \"name\": \"vector-index\",\n", + " \"type\": \"fulltext-index\",\n", + " \"params\": {\n", + " \"doc_config\": {\n", + " \"docid_prefix_delim\": \"\",\n", + " \"docid_regexp\": \"\",\n", + " \"mode\": \"type_field\",\n", + " \"type_field\": \"type\"\n", + " },\n", + " \"mapping\": {\n", + " \"default_analyzer\": \"standard\",\n", + " \"default_datetime_parser\": \"dateTimeOptional\",\n", + " \"default_field\": \"_all\",\n", + " \"default_mapping\": {\n", + " \"dynamic\": true,\n", + " \"enabled\": true,\n", + " \"properties\": {\n", + " \"metadata\": {\n", + " \"dynamic\": true,\n", + " \"enabled\": true\n", + " },\n", + " \"embedding\": {\n", + " \"enabled\": true,\n", + " \"dynamic\": false,\n", + " \"fields\": [\n", + " {\n", + " \"dims\": 1536,\n", + " \"index\": true,\n", + " \"name\": \"embedding\",\n", + " \"similarity\": \"dot_product\",\n", + " \"type\": \"vector\",\n", + " \"vector_index_optimized_for\": \"recall\"\n", + " }\n", + " ]\n", + " },\n", + " \"text\": {\n", + " \"enabled\": true,\n", + " \"dynamic\": false,\n", + " \"fields\": 
[\n", + " {\n", + " \"index\": true,\n", + " \"name\": \"text\",\n", + " \"store\": true,\n", + " \"type\": \"text\"\n", + " }\n", + " ]\n", + " }\n", + " }\n", + " },\n", + " \"default_type\": \"_default\",\n", + " \"docvalues_dynamic\": false,\n", + " \"index_dynamic\": true,\n", + " \"store_dynamic\": true,\n", + " \"type_field\": \"_type\"\n", + " },\n", + " \"store\": {\n", + " \"indexType\": \"scorch\",\n", + " \"segmentVersion\": 16\n", + " }\n", + " },\n", + " \"sourceType\": \"gocbcore\",\n", + " \"sourceName\": \"testing\",\n", + " \"sourceParams\": {},\n", + " \"planParams\": {\n", + " \"maxPartitionsPerPIndex\": 103,\n", + " \"indexPartitions\": 10,\n", + " \"numReplicas\": 0\n", + " }\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "556dc68c-9089-4390-8dc9-b77051e7fc34", + "metadata": {}, + "source": [ + "For more details on how to create a Search index with support for Vector fields, please refer to the documentation.\n", + "\n", + "- [Couchbase Capella](https://docs.couchbase.com/cloud/vector-search/create-vector-search-index-ui.html)\n", + " \n", + "- [Couchbase Server](https://docs.couchbase.com/server/current/vector-search/create-vector-search-index-ui.html)" + ] + }, + { + "cell_type": "markdown", + "id": "75f4037d-e509-4de7-a8d1-63a05de24e9d", + "metadata": {}, + "source": [ + "## Create Vector Store\n", + "We create the vector store object with the cluster information and the search index name." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "33db4670-76c5-49ba-94d6-a8fa35583058", + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = CouchbaseVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding=embeddings,\n", + " index_name=SEARCH_INDEX_NAME,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0aa98793-5ac2-4f76-bbba-2d40856c2d58", + "metadata": {}, + "source": [ + "### Specify the Text & Embeddings Field\n", + "You can optionally specify the text & embeddings field for the document using the `text_key` and `embedding_key` fields.\n", + "```\n", + "vector_store = CouchbaseVectorStore(\n", + " cluster=cluster,\n", + " bucket_name=BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " embedding=embeddings,\n", + " index_name=SEARCH_INDEX_NAME,\n", + " text_key=\"text\",\n", + " embedding_key=\"embedding\",\n", + ")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "790dc1ac-0ab8-4cb5-989d-31ca7c241068", + "metadata": {}, + "source": [ + "## Basic Vector Search Example\n", + "For this example, we are going to load the \"state_of_the_union.txt\" file via the TextLoader, chunk the text into 500 character chunks with no overlaps and index all these chunks into Couchbase.\n", + "\n", + "After the data is indexed, we perform a simple query to find the top 4 chunks that are similar to the query \"What did president say about Ketanji Brown Jackson\".\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "440350df-cbc6-48f7-8009-2e783be18306", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain_community.document_loaders import TextLoader\n", + "\n", + "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = 
CharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9d3b4c7c-abd6-4dfa-ad63-470f16661319", + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = CouchbaseVectorStore.from_documents(\n", + " documents=docs,\n", + " embedding=embeddings,\n", + " cluster=cluster,\n", + " bucket_name=BUCKET_NAME,\n", + " scope_name=SCOPE_NAME,\n", + " collection_name=COLLECTION_NAME,\n", + " index_name=SEARCH_INDEX_NAME,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "91fdce6c-8f7c-4060-865a-2fd742846664", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.' metadata={'source': '../../modules/state_of_the_union.txt'}\n" + ] + } + ], + "source": [ + "query = \"What did president say about Ketanji Brown Jackson\"\n", + "results = vector_store.similarity_search(query)\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "d9b46c93-65f6-4e4f-87a2-5cebea3b7a6b", + "metadata": {}, + "source": [ + "## Similarity Search with Score\n", + "You can fetch the scores for the results by calling the `similarity_search_with_score` method." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "24b146b2-55a2-4fe8-8659-3649032f5dc7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.' metadata={'source': '../../modules/state_of_the_union.txt'}\n", + "Score: 0.8211871385574341\n" + ] + } + ], + "source": [ + "query = \"What did president say about Ketanji Brown Jackson\"\n", + "results = vector_store.similarity_search_with_score(query)\n", + "document, score = results[0]\n", + "print(document)\n", + "print(f\"Score: {score}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9983e83d-efd0-4b75-80db-150e0694e822", + "metadata": {}, + "source": [ + "## Specifying Fields to Return\n", + "You can specify the fields to return from the document using `fields` parameter in the searches. These fields are returned as part of the `metadata` object in the returned Document. You can fetch any field that is stored in the Search index. The `text_key` of the document is returned as part of the document's `page_content`.\n", + "\n", + "If you do not specify any fields to be fetched, all the fields stored in the index are returned.\n", + "\n", + "If you want to fetch one of the fields in the metadata, you need to specify it using `.`\n", + "\n", + "For example, to fetch the `source` field in the metadata, you need to specify `metadata.source`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ffa743dc-4e89-405b-ad71-7390338889e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.' 
metadata={'source': '../../modules/state_of_the_union.txt'}\n" + ] + } + ], + "source": [ + "query = \"What did president say about Ketanji Brown Jackson\"\n", + "results = vector_store.similarity_search(query, fields=[\"metadata.source\"])\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "a5e45eb2-aa97-45df-bcc5-410e9626e506", + "metadata": {}, + "source": [ + "## Hybrid Search\n", + "Couchbase allows you to do hybrid searches by combining Vector Search results with searches on non-vector fields of the document like the `metadata` object. \n", + "\n", + "The results will be based on the combination of the results from both Vector Search and the searches supported by Search Service. The scores of each of the component searches are added up to get the total score of the result.\n", + "\n", + "To perform hybrid searches, there is an optional parameter, `search_options` that can be passed to all the similarity searches. \n", + "The different search/query possibilities for the `search_options` can be found [here](https://docs.couchbase.com/server/current/search/search-request-params.html#query-object)." + ] + }, + { + "cell_type": "markdown", + "id": "a5db3685-1918-4c63-8148-0bb3a71ea677", + "metadata": {}, + "source": [ + "### Create Diverse Metadata for Hybrid Search\n", + "In order to simulate hybrid search, let us create some random metadata from the existing documents. \n", + "We uniformly add three fields to the metadata, `date` between 2010 & 2020, `rating` between 1 & 5 and `author` set to either John Doe or Jane Doe. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7d2e607d-6bbc-4cef-83e3-b6a28bb269ea", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'author': 'John Doe', 'date': '2016-01-01', 'rating': 2, 'source': '../../modules/state_of_the_union.txt'}\n" + ] + } + ], + "source": [ + "# Adding metadata to documents\n", + "for i, doc in enumerate(docs):\n", + " doc.metadata[\"date\"] = f\"{range(2010, 2020)[i % 10]}-01-01\"\n", + " doc.metadata[\"rating\"] = range(1, 6)[i % 5]\n", + " doc.metadata[\"author\"] = [\"John Doe\", \"Jane Doe\"][i % 2]\n", + "\n", + "vector_store.add_documents(docs)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "results = vector_store.similarity_search(query)\n", + "print(results[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "6cad893b-3977-4556-ab1d-d12bce68b306", + "metadata": {}, + "source": [ + "### Example: Search by Exact Value\n", + "We can search for exact matches on a textual field like the author in the `metadata` object." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "dc06ba4a-8a6b-4c55-bb69-95cd92db273f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='This is personal to me and Jill, to Kamala, and to so many of you. \\n\\nCancer is the #2 cause of death in America–second only to heart disease. \\n\\nLast month, I announced our plan to supercharge \\nthe Cancer Moonshot that President Obama asked me to lead six years ago. \\n\\nOur goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases. \\n\\nMore support for patients and families.' 
metadata={'author': 'John Doe'}\n" + ] + } + ], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "results = vector_store.similarity_search(\n", + " query,\n", + " search_options={\"query\": {\"field\": \"metadata.author\", \"match\": \"John Doe\"}},\n", + " fields=[\"metadata.author\"],\n", + ")\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "9106b594-b41e-4329-b98c-9b9f8a34d6f7", + "metadata": {}, + "source": [ + "### Example: Search by Partial Match\n", + "We can search for partial matches by specifying a fuzziness for the search. This is useful when you want to search for slight variations or misspellings of a search query.\n", + "\n", + "Here, \"Jae\" is close (fuzziness of 1) to \"Jane\"." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "fd4749e6-ef4f-4cb5-95ff-37c4fa8283d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system.' 
metadata={'author': 'Jane Doe'}\n" + ] + } + ], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "results = vector_store.similarity_search(\n", + " query,\n", + " search_options={\n", + " \"query\": {\"field\": \"metadata.author\", \"match\": \"Jae\", \"fuzziness\": 1}\n", + " },\n", + " fields=[\"metadata.author\"],\n", + ")\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "1bbf9449-6e30-4bd1-9eeb-f3b60952fcab", + "metadata": {}, + "source": [ + "### Example: Search by Date Range Query\n", + "We can search for documents that are within a date range query on a date field like `metadata.date`." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b7b47e7d-c32f-4999-bce9-3c3c3cebffd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "page_content='He will never extinguish their love of freedom. He will never weaken the resolve of the free world. \\n\\nWe meet tonight in an America that has lived through two of the hardest years this nation has ever faced. \\n\\nThe pandemic has been punishing. \\n\\nAnd so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. \\n\\nI understand.' 
metadata={'author': 'Jane Doe', 'date': '2017-01-01', 'rating': 3, 'source': '../../modules/state_of_the_union.txt'}\n" + ] + } + ], + "source": [ + "query = \"Any mention about independence?\"\n", + "results = vector_store.similarity_search(\n", + " query,\n", + " search_options={\n", + " \"query\": {\n", + " \"start\": \"2016-12-31\",\n", + " \"end\": \"2017-01-02\",\n", + " \"inclusive_start\": True,\n", + " \"inclusive_end\": False,\n", + " \"field\": \"metadata.date\",\n", + " }\n", + " },\n", + ")\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "a18d4ea2-bfab-4f15-9839-674faf1c6f0d", + "metadata": {}, + "source": [ + "### Example: Search by Numeric Range Query\n", + "We can search for documents that are within a range for a numeric field like `metadata.rating`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "7e8bf7c5-07d1-4c3f-86d7-1fa3a454dc7f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Document(page_content='He will never extinguish their love of freedom. He will never weaken the resolve of the free world. \\n\\nWe meet tonight in an America that has lived through two of the hardest years this nation has ever faced. \\n\\nThe pandemic has been punishing. \\n\\nAnd so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. 
\\n\\nI understand.', metadata={'author': 'Jane Doe', 'date': '2017-01-01', 'rating': 3, 'source': '../../modules/state_of_the_union.txt'}), 0.9000703597577832)\n" + ] + } + ], + "source": [ + "query = \"Any mention about independence?\"\n", + "results = vector_store.similarity_search_with_score(\n", + " query,\n", + " search_options={\n", + " \"query\": {\n", + " \"min\": 3,\n", + " \"max\": 5,\n", + " \"inclusive_min\": True,\n", + " \"inclusive_max\": True,\n", + " \"field\": \"metadata.rating\",\n", + " }\n", + " },\n", + ")\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "0f16bf86-f01c-4a77-8406-275f7313f493", + "metadata": {}, + "source": [ + "### Example: Combining Multiple Search Queries\n", + "Different search queries can be combined using AND (conjuncts) or OR (disjuncts) operators.\n", + "\n", + "In this example, we are checking for documents with a rating between 3 & 4 and dated between 2016-12-31 & 2017-01-02." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "dd0fe7f1-aa40-4c6f-889b-99ad5efcd88b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(Document(page_content='He will never extinguish their love of freedom. He will never weaken the resolve of the free world. \\n\\nWe meet tonight in an America that has lived through two of the hardest years this nation has ever faced. \\n\\nThe pandemic has been punishing. \\n\\nAnd so many families are living paycheck to paycheck, struggling to keep up with the rising cost of food, gas, housing, and so much more. 
\\n\\nI understand.', metadata={'author': 'Jane Doe', 'date': '2017-01-01', 'rating': 3, 'source': '../../modules/state_of_the_union.txt'}), 1.3598770370389914)\n" + ] + } + ], + "source": [ + "query = \"Any mention about independence?\"\n", + "results = vector_store.similarity_search_with_score(\n", + " query,\n", + " search_options={\n", + " \"query\": {\n", + " \"conjuncts\": [\n", + " {\"min\": 3, \"max\": 4, \"inclusive_max\": True, \"field\": \"metadata.rating\"},\n", + " {\"start\": \"2016-12-31\", \"end\": \"2017-01-02\", \"field\": \"metadata.date\"},\n", + " ]\n", + " }\n", + " },\n", + ")\n", + "print(results[0])" + ] + }, + { + "cell_type": "markdown", + "id": "39258571-3233-45c3-a6ad-5c3c90ea2b1c", + "metadata": {}, + "source": [ + "### Other Queries\n", + "Similarly, you can use any of the supported Query methods like Geo Distance, Polygon Search, Wildcard, Regular Expressions, etc in the `search_options` parameter. Please refer to the documentation for more details on the available query methods and their syntax.\n", + "\n", + "- [Couchbase Capella](https://docs.couchbase.com/cloud/search/search-request-params.html#query-object)\n", + "- [Couchbase Server](https://docs.couchbase.com/server/current/search/search-request-params.html#query-object)" + ] + }, + { + "cell_type": "markdown", + "id": "80958c2b-6a67-45e6-b7f0-fd2461d75e0f", + "metadata": {}, + "source": [ + "# Frequently Asked Questions" + ] + }, + { + "cell_type": "markdown", + "id": "4f7f9838-cc20-44bc-a72d-06f2cb6c3fca", + "metadata": {}, + "source": [ + "## Question: Should I create the Search index before creating the CouchbaseVectorStore object?\n", + "Yes, currently you need to create the Search index before creating the `CouchbaseVectorStore` object.\n" + ] + }, + { + "cell_type": "markdown", + "id": "3f0dbc1b-9e82-4ec3-9330-6b54de00661e", + "metadata": {}, + "source": [ + "## Question: I am not seeing all the fields that I specified in my search results. 
\n", + "\n", + "In Couchbase, we can only return the fields stored in the Search index. Please ensure that the field that you are trying to access in the search results is part of the Search index.\n", + "\n", + "One way to handle this is to index and store a document's fields dynamically in the index. \n", + "\n", + "- In Capella, you need to go to \"Advanced Mode\" then under the chevron \"General Settings\" you can check \"[X] Store Dynamic Fields\" or \"[X] Index Dynamic Fields\"\n", + "- In Couchbase Server, in the Index Editor (not Quick Editor) under the chevron \"Advanced\" you can check \"[X] Store Dynamic Fields\" or \"[X] Index Dynamic Fields\"\n", + "\n", + "Note that these options will increase the size of the index.\n", + "\n", + "For more details on dynamic mappings, please refer to the [documentation](https://docs.couchbase.com/cloud/search/customize-index.html).\n" + ] + }, + { + "cell_type": "markdown", + "id": "3702977a-2e25-48b6-b662-edd5cb94cdec", + "metadata": {}, + "source": [ + "## Question: I am unable to see the metadata object in my search results. \n", + "This is most likely due to the `metadata` field in the document not being indexed and/or stored by the Couchbase Search index. In order to index the `metadata` field in the document, you need to add it to the index as a child mapping. \n", + "\n", + "If you select to map all the fields in the mapping, you will be able to search by all metadata fields. Alternatively, to optimize the index, you can select the specific fields inside `metadata` object to be indexed. 
You can refer to the [docs](https://docs.couchbase.com/cloud/search/customize-index.html) to learn more about indexing child mappings.\n", + "\n", + "Creating Child Mappings\n", + "\n", + "* [Couchbase Capella](https://docs.couchbase.com/cloud/search/create-child-mapping.html)\n", + "* [Couchbase Server](https://docs.couchbase.com/server/current/search/create-child-mapping.html)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/docs/modules/data_connection/indexing.ipynb b/docs/docs/modules/data_connection/indexing.ipynb index 64de9591104ca..2264c8e70afd1 100644 --- a/docs/docs/modules/data_connection/indexing.ipynb +++ b/docs/docs/modules/data_connection/indexing.ipynb @@ -60,7 +60,7 @@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", + "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, 
`PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`.\n", " \n", "## Caution\n", "\n", diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index a32efe7553b80..aa4f4d8980550 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -42,6 +42,7 @@ "Clarifai": "langchain_community.vectorstores.clarifai", "Clickhouse": "langchain_community.vectorstores.clickhouse", "ClickhouseSettings": "langchain_community.vectorstores.clickhouse", + "CouchbaseVectorStore": "langchain_community.vectorstores.couchbase", "DashVector": "langchain_community.vectorstores.dashvector", "DatabricksVectorSearch": "langchain_community.vectorstores.databricks_vector_search", # noqa: E501 "DeepLake": "langchain_community.vectorstores.deeplake", diff --git a/libs/community/langchain_community/vectorstores/couchbase.py b/libs/community/langchain_community/vectorstores/couchbase.py new file mode 100644 index 0000000000000..881d31b70fc02 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/couchbase.py @@ -0,0 +1,617 @@ +from __future__ import annotations + +import uuid +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type + +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore + +if TYPE_CHECKING: + from couchbase.cluster import Cluster + + +class CouchbaseVectorStore(VectorStore): + """`Couchbase Vector Store` vector store. + + To use it, you need + - a recent installation of the `couchbase` library + - a Couchbase database with a pre-defined Search index with support for + vector fields + + Example: + .. 
code-block:: python + + from langchain_community.vectorstores import CouchbaseVectorStore + from langchain_openai import OpenAIEmbeddings + + from couchbase.cluster import Cluster + from couchbase.auth import PasswordAuthenticator + from couchbase.options import ClusterOptions + from datetime import timedelta + + auth = PasswordAuthenticator(username, password) + options = ClusterOptions(auth) + connect_string = "couchbases://localhost" + cluster = Cluster(connect_string, options) + + # Wait until the cluster is ready for use. + cluster.wait_until_ready(timedelta(seconds=5)) + + embeddings = OpenAIEmbeddings() + + vectorstore = CouchbaseVectorStore( + cluster=cluster, + bucket_name="", + scope_name="", + collection_name="", + embedding=embeddings, + index_name="vector-index", + ) + + vectorstore.add_texts(["hello", "world"]) + results = vectorstore.similarity_search("ola", k=1) + """ + + # Default batch size + DEFAULT_BATCH_SIZE = 100 + _metadata_key = "metadata" + _default_text_key = "text" + _default_embedding_key = "embedding" + + def _check_bucket_exists(self) -> bool: + """Check if the bucket exists in the linked Couchbase cluster""" + bucket_manager = self._cluster.buckets() + try: + bucket_manager.get_bucket(self._bucket_name) + return True + except Exception: + return False + + def _check_scope_and_collection_exists(self) -> bool: + """Check if the scope and collection exists in the linked Couchbase bucket + Raises a ValueError if either is not found""" + scope_collection_map: Dict[str, Any] = {} + + # Get a list of all scopes in the bucket + for scope in self._bucket.collections().get_all_scopes(): + scope_collection_map[scope.name] = [] + + # Get a list of all the collections in the scope + for collection in scope.collections: + scope_collection_map[scope.name].append(collection.name) + + # Check if the scope exists + if self._scope_name not in scope_collection_map.keys(): + raise ValueError( + f"Scope {self._scope_name} not found in Couchbase " + 
f"bucket {self._bucket_name}" + ) + + # Check if the collection exists in the scope + if self._collection_name not in scope_collection_map[self._scope_name]: + raise ValueError( + f"Collection {self._collection_name} not found in scope " + f"{self._scope_name} in Couchbase bucket {self._bucket_name}" + ) + + return True + + def _check_index_exists(self) -> bool: + """Check if the Search index exists in the linked Couchbase cluster + Raises a ValueError if the index does not exist""" + if self._scoped_index: + all_indexes = [ + index.name for index in self._scope.search_indexes().get_all_indexes() + ] + if self._index_name not in all_indexes: + raise ValueError( + f"Index {self._index_name} does not exist. " + " Please create the index before searching." + ) + else: + all_indexes = [ + index.name for index in self._cluster.search_indexes().get_all_indexes() + ] + if self._index_name not in all_indexes: + raise ValueError( + f"Index {self._index_name} does not exist. " + " Please create the index before searching." + ) + + return True + + def __init__( + self, + cluster: Cluster, + bucket_name: str, + scope_name: str, + collection_name: str, + embedding: Embeddings, + index_name: str, + *, + text_key: Optional[str] = _default_text_key, + embedding_key: Optional[str] = _default_embedding_key, + scoped_index: bool = True, + ) -> None: + """ + Initialize the Couchbase Vector Store. + + Args: + + cluster (Cluster): couchbase cluster object with active connection. + bucket_name (str): name of bucket to store documents in. + scope_name (str): name of scope in the bucket to store documents in. + collection_name (str): name of collection in the scope to store documents in + embedding (Embeddings): embedding function to use. + index_name (str): name of the Search index to use. + text_key (optional[str]): key in document to use as text. + Set to text by default. + embedding_key (optional[str]): key in document to use for the embeddings. + Set to embedding by default. 
+ scoped_index (optional[bool]): specify whether the index is a scoped index. + Set to True by default. + """ + try: + from couchbase.cluster import Cluster + except ImportError as e: + raise ImportError( + "Could not import couchbase python package. " + "Please install couchbase SDK with `pip install couchbase`." + ) from e + + if not isinstance(cluster, Cluster): + raise ValueError( + f"cluster should be an instance of couchbase.Cluster, " + f"got {type(cluster)}" + ) + + self._cluster = cluster + + if not embedding: + raise ValueError("Embeddings instance must be provided.") + + if not bucket_name: + raise ValueError("bucket_name must be provided.") + + if not scope_name: + raise ValueError("scope_name must be provided.") + + if not collection_name: + raise ValueError("collection_name must be provided.") + + if not index_name: + raise ValueError("index_name must be provided.") + + self._bucket_name = bucket_name + self._scope_name = scope_name + self._collection_name = collection_name + self._embedding_function = embedding + self._text_key = text_key + self._embedding_key = embedding_key + self._index_name = index_name + self._scoped_index = scoped_index + + # Check if the bucket exists + if not self._check_bucket_exists(): + raise ValueError( + f"Bucket {self._bucket_name} does not exist. " + " Please create the bucket before searching." + ) + + try: + self._bucket = self._cluster.bucket(self._bucket_name) + self._scope = self._bucket.scope(self._scope_name) + self._collection = self._scope.collection(self._collection_name) + except Exception as e: + raise ValueError( + "Error connecting to couchbase. " + "Please check the connection and credentials." + ) from e + + # Check if the scope and collection exists. Throws ValueError if they don't + try: + self._check_scope_and_collection_exists() + except Exception as e: + raise e + + # Check if the index exists. 
Throws ValueError if it doesn't + try: + self._check_index_exists() + except Exception as e: + raise e + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[Dict[str, Any]]] = None, + ids: Optional[List[str]] = None, + batch_size: Optional[int] = None, + **kwargs: Any, + ) -> List[str]: + """Run texts through the embeddings and persist in vectorstore. + + If the document IDs are passed, the existing documents (if any) will be + overwritten with the new ones. + + Args: + texts (Iterable[str]): Iterable of strings to add to the vectorstore. + metadatas (Optional[List[Dict]]): Optional list of metadatas associated + with the texts. + ids (Optional[List[str]]): Optional list of ids associated with the texts. + IDs have to be unique strings across the collection. + If it is not specified uuids are generated and used as ids. + batch_size (Optional[int]): Optional batch size for bulk insertions. + Default is 100. + + Returns: + List[str]:List of ids from adding the texts into the vectorstore. 
+ """ + from couchbase.exceptions import DocumentExistsException + + if not batch_size: + batch_size = self.DEFAULT_BATCH_SIZE + doc_ids: List[str] = [] + + if ids is None: + ids = [uuid.uuid4().hex for _ in texts] + + if metadatas is None: + metadatas = [{} for _ in texts] + + embedded_texts = self._embedding_function.embed_documents(list(texts)) + + documents_to_insert = [ + { + id: { + self._text_key: text, + self._embedding_key: vector, + self._metadata_key: metadata, + } + for id, text, vector, metadata in zip( + ids, texts, embedded_texts, metadatas + ) + } + ] + + # Insert in batches + for i in range(0, len(documents_to_insert), batch_size): + batch = documents_to_insert[i : i + batch_size] + try: + result = self._collection.upsert_multi(batch[0]) + if result.all_ok: + doc_ids.extend(batch[0].keys()) + except DocumentExistsException as e: + raise ValueError(f"Document already exists: {e}") + + return doc_ids + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + """Delete documents from the vector store by ids. + + Args: + ids (List[str]): List of IDs of the documents to delete. + batch_size (Optional[int]): Optional batch size for bulk deletions. + + Returns: + bool: True if all the documents were deleted successfully, False otherwise. 
+ + """ + from couchbase.exceptions import DocumentNotFoundException + + if ids is None: + raise ValueError("No document ids provided to delete.") + + batch_size = kwargs.get("batch_size", self.DEFAULT_BATCH_SIZE) + deletion_status = True + + # Delete in batches + for i in range(0, len(ids), batch_size): + batch = ids[i : i + batch_size] + try: + result = self._collection.remove_multi(batch) + except DocumentNotFoundException as e: + deletion_status = False + raise ValueError(f"Document not found: {e}") + + deletion_status &= result.all_ok + + return deletion_status + + @property + def embeddings(self) -> Embeddings: + """Return the query embedding object.""" + return self._embedding_function + + def _format_metadata(self, row_fields: Dict[str, Any]) -> Dict[str, Any]: + """Helper method to format the metadata from the Couchbase Search API. + Args: + row_fields (Dict[str, Any]): The fields to format. + + Returns: + Dict[str, Any]: The formatted metadata. + """ + metadata = {} + for key, value in row_fields.items(): + # Couchbase Search returns the metadata key with a prefix + # `metadata.` We remove it to get the original metadata key + if key.startswith(self._metadata_key): + new_key = key.split(self._metadata_key + ".")[-1] + metadata[new_key] = value + else: + metadata[key] = value + + return metadata + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + search_options: Optional[Dict[str, Any]] = {}, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to embedding vector with their scores. + + Args: + embedding (List[float]): Embedding vector to look up documents similar to. + k (int): Number of Documents to return. + Defaults to 4. + search_options (Optional[Dict[str, Any]]): Optional search options that are + passed to Couchbase search. + Defaults to empty dictionary. + fields (Optional[List[str]]): Optional list of fields to include in the + metadata of results. 
Note that these need to be stored in the index. + If nothing is specified, defaults to all the fields stored in the index. + + Returns: + List of (Document, score) that are the most similar to the query vector. + """ + import couchbase.search as search + from couchbase.options import SearchOptions + from couchbase.vector_search import VectorQuery, VectorSearch + + fields = kwargs.get("fields", ["*"]) + + # Document text field needs to be returned from the search + if fields != ["*"] and self._text_key not in fields: + fields.append(self._text_key) + + search_req = search.SearchRequest.create( + VectorSearch.from_vector_query( + VectorQuery( + self._embedding_key, + embedding, + k, + ) + ) + ) + try: + if self._scoped_index: + search_iter = self._scope.search( + self._index_name, + search_req, + SearchOptions( + limit=k, + fields=fields, + raw=search_options, + ), + ) + + else: + search_iter = self._cluster.search( + index=self._index_name, + request=search_req, + options=SearchOptions(limit=k, fields=fields, raw=search_options), + ) + + docs_with_score = [] + + # Parse the results + for row in search_iter.rows(): + text = row.fields.pop(self._text_key, "") + + # Format the metadata from Couchbase + metadata = self._format_metadata(row.fields) + + score = row.score + doc = Document(page_content=text, metadata=metadata) + docs_with_score.append((doc, score)) + + except Exception as e: + raise ValueError(f"Search failed with error: {e}") + + return docs_with_score + + def similarity_search( + self, + query: str, + k: int = 4, + search_options: Optional[Dict[str, Any]] = {}, + **kwargs: Any, + ) -> List[Document]: + """Return documents most similar to embedding vector with their scores. + + Args: + query (str): Query to look up for similar documents + k (int): Number of Documents to return. + Defaults to 4. + search_options (Optional[Dict[str, Any]]): Optional search options that are + passed to Couchbase search. 
+ Defaults to empty dictionary + fields (Optional[List[str]]): Optional list of fields to include in the + metadata of results. Note that these need to be stored in the index. + If nothing is specified, defaults to all the fields stored in the index. + + Returns: + List of Documents most similar to the query. + """ + query_embedding = self.embeddings.embed_query(query) + docs_with_scores = self.similarity_search_with_score_by_vector( + query_embedding, k, search_options, **kwargs + ) + return [doc for doc, _ in docs_with_scores] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + search_options: Optional[Dict[str, Any]] = {}, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return documents that are most similar to the query with their scores. + + Args: + query (str): Query to look up for similar documents + k (int): Number of Documents to return. + Defaults to 4. + search_options (Optional[Dict[str, Any]]): Optional search options that are + passed to Couchbase search. + Defaults to empty dictionary. + fields (Optional[List[str]]): Optional list of fields to include in the + metadata of results. Note that these need to be stored in the index. + If nothing is specified, defaults to text and metadata fields. + + Returns: + List of (Document, score) that are most similar to the query. + """ + query_embedding = self.embeddings.embed_query(query) + docs_with_score = self.similarity_search_with_score_by_vector( + query_embedding, k, search_options, **kwargs + ) + return docs_with_score + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + search_options: Optional[Dict[str, Any]] = {}, + **kwargs: Any, + ) -> List[Document]: + """Return documents that are most similar to the vector embedding. + + Args: + embedding (List[float]): Embedding to look up documents similar to. + k (int): Number of Documents to return. + Defaults to 4. 
+ search_options (Optional[Dict[str, Any]]): Optional search options that are + passed to Couchbase search. + Defaults to empty dictionary. + fields (Optional[List[str]]): Optional list of fields to include in the + metadata of results. Note that these need to be stored in the index. + If nothing is specified, defaults to document text and metadata fields. + + Returns: + List of Documents most similar to the query. + """ + docs_with_score = self.similarity_search_with_score_by_vector( + embedding, k, search_options, **kwargs + ) + return [doc for doc, _ in docs_with_score] + + @classmethod + def _from_kwargs( + cls: Type[CouchbaseVectorStore], + embedding: Embeddings, + **kwargs: Any, + ) -> CouchbaseVectorStore: + """Initialize the Couchbase vector store from keyword arguments for the + vector store. + + Args: + embedding: Embedding object to use to embed text. + **kwargs: Keyword arguments to initialize the vector store with. + Accepted arguments are: + - cluster + - bucket_name + - scope_name + - collection_name + - index_name + - text_key + - embedding_key + - scoped_index + + """ + cluster = kwargs.get("cluster", None) + bucket_name = kwargs.get("bucket_name", None) + scope_name = kwargs.get("scope_name", None) + collection_name = kwargs.get("collection_name", None) + index_name = kwargs.get("index_name", None) + text_key = kwargs.get("text_key", cls._default_text_key) + embedding_key = kwargs.get("embedding_key", cls._default_embedding_key) + scoped_index = kwargs.get("scoped_index", True) + + return cls( + embedding=embedding, + cluster=cluster, + bucket_name=bucket_name, + scope_name=scope_name, + collection_name=collection_name, + index_name=index_name, + text_key=text_key, + embedding_key=embedding_key, + scoped_index=scoped_index, + ) + + @classmethod + def from_texts( + cls: Type[CouchbaseVectorStore], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[Dict[Any, Any]]] = None, + **kwargs: Any, + ) -> CouchbaseVectorStore: + 
"""Construct a Couchbase vector store from a list of texts. + + Example: + .. code-block:: python + + from langchain_community.vectorstores import CouchbaseVectorStore + from langchain_openai import OpenAIEmbeddings + + from couchbase.cluster import Cluster + from couchbase.auth import PasswordAuthenticator + from couchbase.options import ClusterOptions + from datetime import timedelta + + auth = PasswordAuthenticator(username, password) + options = ClusterOptions(auth) + connect_string = "couchbases://localhost" + cluster = Cluster(connect_string, options) + + # Wait until the cluster is ready for use. + cluster.wait_until_ready(timedelta(seconds=5)) + + embeddings = OpenAIEmbeddings() + + texts = ["hello", "world"] + + vectorstore = CouchbaseVectorStore.from_texts( + texts, + embedding=embeddings, + cluster=cluster, + bucket_name="", + scope_name="", + collection_name="", + index_name="vector-index", + ) + + Args: + texts (List[str]): list of texts to add to the vector store. + embedding (Embeddings): embedding function to use. + metadatas (optional[List[Dict]): list of metadatas to add to documents. + **kwargs: Keyword arguments used to initialize the vector store with and/or + passed to `add_texts` method. Check the constructor and/or `add_texts` + for the list of accepted arguments. + + Returns: + A Couchbase vector store. 
+ + """ + vector_store = cls._from_kwargs(embedding, **kwargs) + batch_size = kwargs.get("batch_size", vector_store.DEFAULT_BATCH_SIZE) + ids = kwargs.get("ids", None) + vector_store.add_texts( + texts, metadatas=metadatas, ids=ids, batch_size=batch_size + ) + + return vector_store diff --git a/libs/community/tests/integration_tests/vectorstores/test_couchbase.py b/libs/community/tests/integration_tests/vectorstores/test_couchbase.py new file mode 100644 index 0000000000000..c6b5ee0a0fa5b --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/test_couchbase.py @@ -0,0 +1,367 @@ +"""Test Couchbase Vector Store functionality""" + +import os +import time +from typing import Any + +import pytest +from langchain_core.documents import Document + +from langchain_community.vectorstores.couchbase import CouchbaseVectorStore +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, +) + +CONNECTION_STRING = os.getenv("COUCHBASE_CONNECTION_STRING", "") +BUCKET_NAME = os.getenv("COUCHBASE_BUCKET_NAME", "") +SCOPE_NAME = os.getenv("COUCHBASE_SCOPE_NAME", "") +COLLECTION_NAME = os.getenv("COUCHBASE_COLLECTION_NAME", "") +USERNAME = os.getenv("COUCHBASE_USERNAME", "") +PASSWORD = os.getenv("COUCHBASE_PASSWORD", "") +INDEX_NAME = os.getenv("COUCHBASE_INDEX_NAME", "") +SLEEP_DURATION = 1 + + +def set_all_env_vars() -> bool: + return all( + [ + CONNECTION_STRING, + BUCKET_NAME, + SCOPE_NAME, + COLLECTION_NAME, + USERNAME, + PASSWORD, + INDEX_NAME, + ] + ) + + +def get_cluster() -> Any: + """Get a couchbase cluster object""" + from datetime import timedelta + + from couchbase.auth import PasswordAuthenticator + from couchbase.cluster import Cluster + from couchbase.options import ClusterOptions + + auth = PasswordAuthenticator(USERNAME, PASSWORD) + options = ClusterOptions(auth) + connect_string = CONNECTION_STRING + cluster = Cluster(connect_string, options) + + # Wait until the cluster is ready for use. 
# Module: libs/community/tests/integration_tests/vectorstores/test_couchbase.py
"""Test Couchbase Vector Store functionality"""

import os
import time
from typing import Any, Dict

import pytest
from langchain_core.documents import Document

from langchain_community.vectorstores.couchbase import CouchbaseVectorStore
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
)

# Connection settings are read from the environment; the whole test class is
# skipped when any of them is missing.
CONNECTION_STRING = os.getenv("COUCHBASE_CONNECTION_STRING", "")
BUCKET_NAME = os.getenv("COUCHBASE_BUCKET_NAME", "")
SCOPE_NAME = os.getenv("COUCHBASE_SCOPE_NAME", "")
COLLECTION_NAME = os.getenv("COUCHBASE_COLLECTION_NAME", "")
USERNAME = os.getenv("COUCHBASE_USERNAME", "")
PASSWORD = os.getenv("COUCHBASE_PASSWORD", "")
INDEX_NAME = os.getenv("COUCHBASE_INDEX_NAME", "")
# Seconds to wait after a write before searching
SLEEP_DURATION = 1


def set_all_env_vars() -> bool:
    """Return True when every Couchbase environment variable is non-empty."""
    return all(
        [
            CONNECTION_STRING,
            BUCKET_NAME,
            SCOPE_NAME,
            COLLECTION_NAME,
            USERNAME,
            PASSWORD,
            INDEX_NAME,
        ]
    )


def get_cluster() -> Any:
    """Get a couchbase cluster object"""
    from datetime import timedelta

    from couchbase.auth import PasswordAuthenticator
    from couchbase.cluster import Cluster
    from couchbase.options import ClusterOptions

    auth = PasswordAuthenticator(USERNAME, PASSWORD)
    options = ClusterOptions(auth)
    connect_string = CONNECTION_STRING
    cluster = Cluster(connect_string, options)

    # Wait until the cluster is ready for use.
    cluster.wait_until_ready(timedelta(seconds=5))

    return cluster


@pytest.fixture()
def cluster() -> Any:
    """Get a couchbase cluster object"""
    return get_cluster()


def delete_documents(
    cluster: Any, bucket_name: str, scope_name: str, collection_name: str
) -> None:
    """Delete all the documents in the collection"""
    query = f"DELETE FROM `{bucket_name}`.`{scope_name}`.`{collection_name}`"
    cluster.query(query).execute()


def _store_kwargs(cluster: Any) -> Dict[str, Any]:
    """Keyword arguments that point a vector store at the test collection."""
    return {
        "cluster": cluster,
        "index_name": INDEX_NAME,
        "bucket_name": BUCKET_NAME,
        "scope_name": SCOPE_NAME,
        "collection_name": COLLECTION_NAME,
    }


def _make_vectorstore(cluster: Any) -> CouchbaseVectorStore:
    """Build a vector store on the test collection with fake embeddings."""
    return CouchbaseVectorStore(
        embedding=ConsistentFakeEmbeddings(), **_store_kwargs(cluster)
    )


@pytest.mark.requires("couchbase")
@pytest.mark.skipif(
    not set_all_env_vars(), reason="Missing Couchbase environment variables"
)
class TestCouchbaseVectorStore:
    """End-to-end tests that run against a live Couchbase cluster."""

    def setup_method(self) -> None:
        """Empty the test collection before every test."""
        cluster = get_cluster()
        # Delete all the documents in the collection
        delete_documents(cluster, BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME)

    def test_from_documents(self, cluster: Any) -> None:
        """Test end to end search using a list of documents."""

        documents = [
            Document(page_content="foo", metadata={"page": 1}),
            Document(page_content="bar", metadata={"page": 2}),
            Document(page_content="baz", metadata={"page": 3}),
        ]

        vectorstore = CouchbaseVectorStore.from_documents(
            documents,
            ConsistentFakeEmbeddings(),
            **_store_kwargs(cluster),
        )

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search("baz", k=1)
        assert output[0].page_content == "baz"
        assert output[0].metadata["page"] == 3

    def test_from_texts(self, cluster: Any) -> None:
        """Test end to end search using a list of texts."""

        texts = ["foo", "bar", "baz"]

        vectorstore = CouchbaseVectorStore.from_texts(
            texts,
            ConsistentFakeEmbeddings(),
            **_store_kwargs(cluster),
        )

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search("foo", k=1)
        assert len(output) == 1
        assert output[0].page_content == "foo"

    def test_from_texts_with_metadatas(self, cluster: Any) -> None:
        """Test end to end search using a list of texts and metadatas."""

        texts = ["foo", "bar", "baz"]
        metadatas = [{"a": 1}, {"b": 2}, {"c": 3}]

        vectorstore = CouchbaseVectorStore.from_texts(
            texts,
            ConsistentFakeEmbeddings(),
            metadatas=metadatas,
            **_store_kwargs(cluster),
        )

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search("baz", k=1)
        assert output[0].page_content == "baz"
        assert output[0].metadata["c"] == 3

    def test_add_texts_with_ids_and_metadatas(self, cluster: Any) -> None:
        """Test end to end search by adding a list of texts, ids and metadatas."""

        texts = ["foo", "bar", "baz"]
        ids = ["a", "b", "c"]
        metadatas = [{"a": 1}, {"b": 2}, {"c": 3}]

        vectorstore = _make_vectorstore(cluster)

        results = vectorstore.add_texts(
            texts,
            ids=ids,
            metadatas=metadatas,
        )
        assert results == ids

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search("foo", k=1)
        assert output[0].page_content == "foo"
        assert output[0].metadata["a"] == 1

    def test_delete_texts_with_ids(self, cluster: Any) -> None:
        """Test deletion of documents by ids."""

        texts = ["foo", "bar", "baz"]
        ids = ["a", "b", "c"]
        metadatas = [{"a": 1}, {"b": 2}, {"c": 3}]

        vectorstore = _make_vectorstore(cluster)

        results = vectorstore.add_texts(
            texts,
            ids=ids,
            metadatas=metadatas,
        )
        assert results == ids
        assert vectorstore.delete(ids)

        # Wait for the deletions to be reflected in the index
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search("foo", k=1)
        assert len(output) == 0

    def test_similarity_search_with_scores(self, cluster: Any) -> None:
        """Test similarity search with scores."""

        texts = ["foo", "bar", "baz"]
        metadatas = [{"a": 1}, {"b": 2}, {"c": 3}]

        vectorstore = _make_vectorstore(cluster)
        vectorstore.add_texts(texts, metadatas=metadatas)

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search_with_score("foo", k=2)

        assert len(output) == 2
        assert output[0][0].page_content == "foo"
        assert output[0][0].metadata["a"] == 1

        # check if the scores are sorted
        assert output[0][1] > output[1][1]

    def test_similarity_search_by_vector(self, cluster: Any) -> None:
        """Test similarity search by vector."""

        texts = ["foo", "bar", "baz"]
        metadatas = [{"a": 1}, {"b": 2}, {"c": 3}]

        vectorstore = _make_vectorstore(cluster)
        vectorstore.add_texts(texts, metadatas=metadatas)

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        vector = ConsistentFakeEmbeddings().embed_query("foo")
        vector_output = vectorstore.similarity_search_by_vector(vector, k=1)

        assert vector_output[0].page_content == "foo"

        similarity_output = vectorstore.similarity_search("foo", k=1)

        assert similarity_output == vector_output

    def test_output_fields(self, cluster: Any) -> None:
        """Test that output fields are set correctly."""

        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": 1, "a": 1}, {"page": 2, "b": 2}, {"page": 3, "c": 3}]

        vectorstore = _make_vectorstore(cluster)

        ids = vectorstore.add_texts(texts, metadatas)
        assert len(ids) == len(texts)

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        output = vectorstore.similarity_search("foo", k=1, fields=["metadata.page"])
        assert output[0].page_content == "foo"
        assert output[0].metadata["page"] == 1
        assert "a" not in output[0].metadata

    def test_hybrid_search(self, cluster: Any) -> None:
        """Test hybrid search."""

        texts = ["foo", "bar", "baz"]
        metadatas = [
            {"section": "index"},
            {"section": "glossary"},
            {"section": "appendix"},
        ]

        vectorstore = _make_vectorstore(cluster)
        vectorstore.add_texts(texts, metadatas=metadatas)

        # Wait for the documents to be indexed
        time.sleep(SLEEP_DURATION)

        result, score = vectorstore.similarity_search_with_score("foo", k=1)[0]

        # Wait for the documents to be indexed for hybrid search
        time.sleep(SLEEP_DURATION)

        hybrid_result, hybrid_score = vectorstore.similarity_search_with_score(
            "foo",
            k=1,
            search_options={"query": {"match": "index", "field": "metadata.section"}},
        )[0]

        assert result == hybrid_result
        assert score <= hybrid_score
a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -55,6 +55,7 @@ def check_compatibility(vector_store: VectorStore) -> bool: "BigQueryVectorSearch", "Cassandra", "Chroma", + "CouchbaseVectorStore", "DashVector", "DatabricksVectorSearch", "TiDBVectorStore", diff --git a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index 400b67658bd7c..ce06b0816129e 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -85,6 +85,7 @@ "VectorStore", "Yellowbrick", "NeuralDBVectorStore", + "CouchbaseVectorStore", ]