diff --git a/docs/docs/integrations/retrievers/elasticsearch_retriever.ipynb b/docs/docs/integrations/retrievers/elasticsearch_retriever.ipynb new file mode 100644 index 0000000000000..9144955c1db46 --- /dev/null +++ b/docs/docs/integrations/retrievers/elasticsearch_retriever.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ab66dd43", + "metadata": {}, + "source": [ + "# ElasticsearchRetriever\n", + "\n", + "[Elasticsearch](https://www.elastic.co/elasticsearch/) is a distributed, RESTful search and analytics engine. It provides a distributed, multitenant-capable full-text search engine with an HTTP web interface and schema-free JSON documents. It supports keyword search, vector search, hybrid search, and complex filtering.\n", + "\n", + "The `ElasticsearchRetriever` is a generic wrapper to enable flexible access to all Elasticsearch features through the [Query DSL](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html). For most use cases the other classes (`ElasticsearchStore`, `ElasticsearchEmbeddings`, etc.) should suffice, but if they don't, you can use `ElasticsearchRetriever`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51b49135-a61a-49e8-869d-7c1d76794cd7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet elasticsearch langchain-elasticsearch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "393ac030", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import Any, Dict, Iterable\n", + "\n", + "from elasticsearch import Elasticsearch\n", + "from elasticsearch.helpers import bulk\n", + "from langchain.embeddings import DeterministicFakeEmbedding\n", + "from langchain_core.documents import Document\n", + "from langchain_core.embeddings import Embeddings\n", + "from langchain_elasticsearch import ElasticsearchRetriever" + ] + }, + { + "cell_type": "markdown", + "id": "24c0d140", + "metadata": {}, + "source": [ + "## Configure\n", + "\n", + "Here we define the connection to Elasticsearch. In this example we use a locally running instance. Alternatively, you can create an account in [Elastic Cloud](https://cloud.elastic.co/) and start a [free trial](https://www.elastic.co/cloud/cloud-trial-overview)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbb2f592", + "metadata": {}, + "outputs": [], + "source": [ + "es_url = \"http://localhost:9200\"\n", + "es_client = Elasticsearch(hosts=[es_url])\n", + "es_client.info()" + ] + }, + { + "cell_type": "markdown", + "id": "60aa7c20", + "metadata": {}, + "source": [ + "For vector search, we are going to use random embeddings just for illustration. For real use cases, pick one of the available LangChain `Embeddings` classes."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8e2997f3", + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = DeterministicFakeEmbedding(size=3)" + ] + }, + { + "cell_type": "markdown", + "id": "b4eea654", + "metadata": {}, + "source": [ + "## Define example data" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "166331fd", + "metadata": {}, + "outputs": [], + "source": [ + "index_name = \"test-langchain-retriever\"\n", + "text_field = \"text\"\n", + "dense_vector_field = \"fake_embedding\"\n", + "num_characters_field = \"num_characters\"\n", + "texts = [\n", + " \"foo\",\n", + " \"bar\",\n", + " \"world\",\n", + " \"hello world\",\n", + " \"hello\",\n", + " \"foo bar\",\n", + " \"bla bla foo\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "1c518c42", + "metadata": {}, + "source": [ + "## Index data\n", + "\n", + "Typically, users make use of `ElasticsearchRetriever` when they already have data in an Elasticsearch index. Here we index some example text documents. If you already created an index, for example using `ElasticsearchStore.from_documents`, that's also fine." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cbc15217", + "metadata": {}, + "outputs": [], + "source": [ + "def create_index(\n", + " es_client: Elasticsearch,\n", + " index_name: str,\n", + " text_field: str,\n", + " dense_vector_field: str,\n", + " num_characters_field: str,\n", + "):\n", + " es_client.indices.create(\n", + " index=index_name,\n", + " mappings={\n", + " \"properties\": {\n", + " text_field: {\"type\": \"text\"},\n", + " dense_vector_field: {\"type\": \"dense_vector\"},\n", + " num_characters_field: {\"type\": \"integer\"},\n", + " }\n", + " },\n", + " )\n", + "\n", + "\n", + "def index_data(\n", + " es_client: Elasticsearch,\n", + " index_name: str,\n", + " text_field: str,\n", + " dense_vector_field: str,\n", + " embeddings: Embeddings,\n", + " texts: Iterable[str],\n", + " refresh: bool = True,\n", + ") -> int:\n", + " create_index(\n", + " es_client, index_name, text_field, dense_vector_field, num_characters_field\n", + " )\n", + "\n", + " vectors = embeddings.embed_documents(list(texts))\n", + " requests = [\n", + " {\n", + " \"_op_type\": \"index\",\n", + " \"_index\": index_name,\n", + " \"_id\": i,\n", + " text_field: text,\n", + " dense_vector_field: vector,\n", + " num_characters_field: len(text),\n", + " }\n", + " for i, (text, vector) in enumerate(zip(texts, vectors))\n", + " ]\n", + "\n", + " bulk(es_client, requests)\n", + "\n", + " if refresh:\n", + " es_client.indices.refresh(index=index_name)\n", + "\n", + " return len(requests)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "0a46bb52", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index_data(es_client, index_name, text_field, dense_vector_field, embeddings, texts)" + ] + }, + { + "cell_type": "markdown", + "id": "08437fa2", + "metadata": {}, + "source": [ + "## Usage examples" + ] + }, + { + "cell_type": "markdown", + "id": "469aa295", + "metadata": {}, + "source": [ + "### Vector search\n", + "\n", + "Dense vector retrieval using fake embeddings in this example."
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9e80ec4b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='foo', metadata={'_index': 'test-langchain-index', '_id': '0', '_score': 1.0, '_source': {'fake_embedding': [-2.336764233933763, 0.27510289545940503, -0.7957597268194339], 'num_characters': 3}}),\n", + " Document(page_content='world', metadata={'_index': 'test-langchain-index', '_id': '2', '_score': 0.6770179, '_source': {'fake_embedding': [-0.7041151202179595, -1.4652961969276497, -0.25786766898672847], 'num_characters': 5}}),\n", + " Document(page_content='hello world', metadata={'_index': 'test-langchain-index', '_id': '3', '_score': 0.4816144, '_source': {'fake_embedding': [0.42728413221815387, -1.1889908285425348, -1.445433230084671], 'num_characters': 11}}),\n", + " Document(page_content='hello', metadata={'_index': 'test-langchain-index', '_id': '4', '_score': 0.46853775, '_source': {'fake_embedding': [-0.28560441330564046, 0.9958894823084921, 1.5489829880195058], 'num_characters': 5}}),\n", + " Document(page_content='foo bar', metadata={'_index': 'test-langchain-index', '_id': '5', '_score': 0.2086992, '_source': {'fake_embedding': [0.2533670476638539, 0.08100381646160418, 0.7763644080870179], 'num_characters': 7}})]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def vector_query(search_query: str) -> Dict:\n", + " vector = embeddings.embed_query(search_query) # same embeddings as for indexing\n", + " return {\n", + " \"knn\": {\n", + " \"field\": dense_vector_field,\n", + " \"query_vector\": vector,\n", + " \"k\": 5,\n", + " \"num_candidates\": 10,\n", + " }\n", + " }\n", + "\n", + "\n", + "vector_retriever = ElasticsearchRetriever.from_es_params(\n", + " index_name=index_name,\n", + " body_func=vector_query,\n", + " content_field=text_field,\n", + " url=es_url,\n", + ")\n", + "\n", + "vector_retriever.get_relevant_documents(\"foo\")" + ] + }, + { + "cell_type": "markdown", + "id": "74bd9256", + "metadata": {}, + "source": [ + "### BM25\n", + "\n", + "Traditional keyword matching." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e2dd95c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='foo', metadata={'_index': 'test-langchain-index', '_id': '0', '_score': 0.9711467, '_source': {'fake_embedding': [-2.336764233933763, 0.27510289545940503, -0.7957597268194339], 'num_characters': 3}}),\n", + " Document(page_content='foo bar', metadata={'_index': 'test-langchain-index', '_id': '5', '_score': 0.7437035, '_source': {'fake_embedding': [0.2533670476638539, 0.08100381646160418, 0.7763644080870179], 'num_characters': 7}}),\n", + " Document(page_content='bla bla foo', metadata={'_index': 'test-langchain-index', '_id': '6', '_score': 0.6025789, '_source': {'fake_embedding': [1.7365927060137358, -0.5230400847844948, 0.7978339724186192], 'num_characters': 11}})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def bm25_query(search_query: str) -> Dict:\n", + " return {\n", + " \"query\": {\n", + " \"match\": {\n", + " text_field: search_query,\n", + " },\n", + " },\n", + " }\n", + "\n", + "\n", + "bm25_retriever = ElasticsearchRetriever.from_es_params(\n", + " index_name=index_name,\n", + " body_func=bm25_query,\n", + " content_field=text_field,\n", + " url=es_url,\n", + ")\n", + "\n", + "bm25_retriever.get_relevant_documents(\"foo\")" + ] + }, + { + "cell_type": "markdown", + "id": "ed19b62c", + "metadata": {}, + "source": [ + "### Hybrid search\n", + "\n", + "The combination of vector search and BM25 search using [Reciprocal Rank Fusion](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html) (RRF) to combine the result sets." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6a672180", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='foo', metadata={'_index': 'test-langchain-index', '_id': '0', '_score': 0.9711467, '_source': {'fake_embedding': [-2.336764233933763, 0.27510289545940503, -0.7957597268194339], 'num_characters': 3}}),\n", + " Document(page_content='foo bar', metadata={'_index': 'test-langchain-index', '_id': '5', '_score': 0.7437035, '_source': {'fake_embedding': [0.2533670476638539, 0.08100381646160418, 0.7763644080870179], 'num_characters': 7}}),\n", + " Document(page_content='bla bla foo', metadata={'_index': 'test-langchain-index', '_id': '6', '_score': 0.6025789, '_source': {'fake_embedding': [1.7365927060137358, -0.5230400847844948, 0.7978339724186192], 'num_characters': 11}})]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def hybrid_query(search_query: str) -> Dict:\n", + " vector = embeddings.embed_query(search_query) # same embeddings as for indexing\n", + " return {\n", + " \"query\": {\n", + " \"match\": {\n", + " text_field: search_query,\n", + " },\n", + " },\n", + " \"knn\": {\n", + " \"field\": dense_vector_field,\n", + " \"query_vector\": vector,\n", + " \"k\": 5,\n", + " \"num_candidates\": 10,\n", + " },\n", + " \"rank\": {\"rrf\": {}},\n", + " }\n", + "\n", + "\n", + "hybrid_retriever = ElasticsearchRetriever.from_es_params(\n", + " index_name=index_name,\n", + " body_func=hybrid_query,\n", + " content_field=text_field,\n", + " url=es_url,\n", + ")\n", + "\n", + "hybrid_retriever.get_relevant_documents(\"foo\")" + ] + }, + { + "cell_type": "markdown", + "id": "766b6da9", + "metadata": {}, + "source": [ + "### Fuzzy matching\n", + "\n", + "Keyword matching with typo
tolerance." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9605b00a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='foo', metadata={'_index': 'test-langchain-index', '_id': '0', '_score': 0.6474311, '_source': {'fake_embedding': [-2.336764233933763, 0.27510289545940503, -0.7957597268194339], 'num_characters': 3}}),\n", + " Document(page_content='foo bar', metadata={'_index': 'test-langchain-index', '_id': '5', '_score': 0.49580228, '_source': {'fake_embedding': [0.2533670476638539, 0.08100381646160418, 0.7763644080870179], 'num_characters': 7}}),\n", + " Document(page_content='bla bla foo', metadata={'_index': 'test-langchain-index', '_id': '6', '_score': 0.40171927, '_source': {'fake_embedding': [1.7365927060137358, -0.5230400847844948, 0.7978339724186192], 'num_characters': 11}})]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def fuzzy_query(search_query: str) -> Dict:\n", + " return {\n", + " \"query\": {\n", + " \"match\": {\n", + " text_field: {\n", + " \"query\": search_query,\n", + " \"fuzziness\": \"AUTO\",\n", + " }\n", + " },\n", + " },\n", + " }\n", + "\n", + "\n", + "fuzzy_retriever = ElasticsearchRetriever.from_es_params(\n", + " index_name=index_name,\n", + " body_func=fuzzy_query,\n", + " content_field=text_field,\n", + " url=es_url,\n", + ")\n", + "\n", + "fuzzy_retriever.get_relevant_documents(\"fox\") # note the typo tolerance" + ] + }, + { + "cell_type": "markdown", + "id": "16949537", + "metadata": {}, + "source": [ + "### Complex filtering\n", + "\n", + "Combination of filters on different fields." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d9e64ce5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='foo bar', metadata={'_index': 'test-langchain-index', '_id': '5', '_score': 1.7437035, '_source': {'fake_embedding': [0.2533670476638539, 0.08100381646160418, 0.7763644080870179], 'num_characters': 7}}),\n", + " Document(page_content='world', metadata={'_index': 'test-langchain-index', '_id': '2', '_score': 1.0, '_source': {'fake_embedding': [-0.7041151202179595, -1.4652961969276497, -0.25786766898672847], 'num_characters': 5}}),\n", + " Document(page_content='hello world', metadata={'_index': 'test-langchain-index', '_id': '3', '_score': 1.0, '_source': {'fake_embedding': [0.42728413221815387, -1.1889908285425348, -1.445433230084671], 'num_characters': 11}}),\n", + " Document(page_content='hello', metadata={'_index': 'test-langchain-index', '_id': '4', '_score': 1.0, '_source': {'fake_embedding': [-0.28560441330564046, 0.9958894823084921, 1.5489829880195058], 'num_characters': 5}})]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_query_func(search_query: str) -> Dict:\n", + " return {\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"must\": [\n", + " {\"range\": {num_characters_field: {\"gte\": 5}}},\n", + " ],\n", + " \"must_not\": [\n", + " {\"prefix\": {text_field: \"bla\"}},\n", + " ],\n", + " \"should\": [\n", + " {\"match\": {text_field: search_query}},\n", + " ],\n", + " }\n", + " }\n", + " }\n", + "\n", + "\n", + "filtering_retriever = ElasticsearchRetriever.from_es_params(\n", + " index_name=index_name,\n", + " body_func=filter_query_func,\n", + " content_field=text_field,\n", + " url=es_url,\n", + ")\n", + "\n", + "filtering_retriever.get_relevant_documents(\"foo\")" + ] + },
+ { + "cell_type": "markdown", + "id": "b415cfc0", + "metadata": {}, + "source": [ + "Note that the query match is on top. The other documents that got passed the filter are also in the result set, but they all have the same score." + ] + }, + { + "cell_type": "markdown", + "id": "c57b7bb1", + "metadata": {}, + "source": [ + "### Custom document mapper\n", + "\n", + "It is possible to cusomize the function tha maps an Elasticsearch result (hit) to a LangChain document." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "df679007", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='This document has 7 characters', metadata={'text_content': 'foo bar'}),\n", + " Document(page_content='This document has 5 characters', metadata={'text_content': 'world'}),\n", + " Document(page_content='This document has 11 characters', metadata={'text_content': 'hello world'}),\n", + " Document(page_content='This document has 5 characters', metadata={'text_content': 'hello'})]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def num_characters_mapper(hit: Dict[str, Any]) -> Document:\n", + " num_chars = hit[\"_source\"][num_characters_field]\n", + " content = hit[\"_source\"][text_field]\n", + " return Document(\n", + " page_content=f\"This document has {num_chars} characters\",\n", + " metadata={\"text_content\": content},\n", + " )\n", + "\n", + "\n", + "custom_mapped_retriever = ElasticsearchRetriever.from_es_params(\n", + " index_name=index_name,\n", + " body_func=filter_query_func,\n", + " document_mapper=num_characters_mapper,\n", + " url=es_url,\n", + ")\n", + "\n", + "custom_mapped_retriever.get_relevant_documents(\"foo\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/partners/elasticsearch/langchain_elasticsearch/retrievers.py b/libs/partners/elasticsearch/langchain_elasticsearch/retrievers.py index e2375a83c901a..1e58e7c1aa4d4 100644 --- a/libs/partners/elasticsearch/langchain_elasticsearch/retrievers.py +++ b/libs/partners/elasticsearch/langchain_elasticsearch/retrievers.py @@ -21,8 +21,9 @@ class ElasticsearchRetriever(BaseRetriever): `from_es_params` method with parameters to initialize the client. index_name: The name of the index to query. body_func: Function to create an Elasticsearch DSL query body from a search - string. All parameters (including for example the `size` parameter to limit - the number of results) must also be set in the body. + string. The returned query body must fit what you would normally send in a + POST request the the _search endpoint. If applicable, it also includes + parameters the `size` parameter etc. content_field: The document field name that contains the page content. document_mapper: Function to map Elasticsearch hits to LangChain Documents. """