Skip to content

Commit

Permalink
partners: add Elasticsearch package (#17467)
Browse files Browse the repository at this point in the history
### Description
This PR moves the Elasticsearch classes to a partners package.

Note that we will not move (and later remove) `ElasticKnnSearch`. It
were previously deprecated.
`ElasticVectorSearch` is going to stay in the community package since it
is used quite a lot still.

Also note that I left the `ElasticsearchTranslator` for self query
untouched because it resides in main `langchain` package.

### Dependencies
There will be another PR that updates the notebooks (potentially pulling
them into the partners package) and templates and removes the classes
from the community package, see
#17468

#### Open question
How to make the transition smooth for users? Do we move the import
aliases and require people to install `langchain-elasticsearch`? Or do
we remove the import aliases from the `langchain` package all together?
What has worked well for other partner packages?

---------

Co-authored-by: Erick Friis <[email protected]>
  • Loading branch information
maxjakob and efriis authored Feb 26, 2024
1 parent a4896da commit 5ab69f9
Show file tree
Hide file tree
Showing 33 changed files with 4,916 additions and 16 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/_integration_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ jobs:
ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }}
ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }}
ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }}
ES_URL: ${{ secrets.ES_URL }}
ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
ES_API_KEY: ${{ secrets.ES_API_KEY }}
run: |
make integration_tests
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ jobs:
ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }}
ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }}
ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }}
ES_URL: ${{ secrets.ES_URL }}
ES_CLOUD_ID: ${{ secrets.ES_CLOUD_ID }}
ES_API_KEY: ${{ secrets.ES_API_KEY }}
run: make integration_tests
working-directory: ${{ inputs.working-directory }}

Expand Down
2 changes: 1 addition & 1 deletion cookbook/self_query_hotel_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1083,7 +1083,7 @@
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.vectorstores import ElasticsearchStore\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_openai import OpenAIEmbeddings\n",
"\n",
"embeddings = OpenAIEmbeddings()"
Expand Down
4 changes: 2 additions & 2 deletions docs/docs/integrations/providers/elasticsearch.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ Elastic Cloud is a managed Elasticsearch service. Signup for a [free trial](http
### Install Client

```bash
pip install elasticsearch
pip install langchain-elasticsearch
```

## Vector Store

The vector store is a simple wrapper around Elasticsearch. It provides a simple interface to store and retrieve vectors.

```python
from langchain_community.vectorstores import ElasticsearchStore
from langchain_elasticsearch import ElasticsearchStore

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@
"import getpass\n",
"import os\n",
"\n",
"from langchain_community.vectorstores import ElasticsearchStore\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_openai import OpenAIEmbeddings\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n",
Expand Down
4 changes: 2 additions & 2 deletions docs/docs/integrations/text_embedding/elasticsearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
},
"outputs": [],
"source": [
"!pip -q install elasticsearch langchain"
"!pip -q install langchain-elasticsearch"
]
},
{
Expand All @@ -36,7 +36,7 @@
},
"outputs": [],
"source": [
"from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings"
"from langchain_elasticsearch import ElasticsearchEmbeddings"
]
},
{
Expand Down
18 changes: 9 additions & 9 deletions docs/docs/integrations/vectorstores/elasticsearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"metadata": {},
"outputs": [],
"source": [
"%pip install --upgrade --quiet elasticsearch langchain-openai tiktoken langchain"
"%pip install --upgrade --quiet langchain-elasticsearch langchain-openai tiktoken langchain"
]
},
{
Expand Down Expand Up @@ -64,7 +64,7 @@
"\n",
"Example:\n",
"```python\n",
" from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
" from langchain_elasticsearch import ElasticsearchStore\n",
" from langchain_openai import OpenAIEmbeddings\n",
"\n",
" embedding = OpenAIEmbeddings()\n",
Expand All @@ -79,7 +79,7 @@
"\n",
"Example:\n",
"```python\n",
" from langchain_community.vectorstores import ElasticsearchStore\n",
" from langchain_elasticsearch import ElasticsearchStore\n",
" from langchain_openai import OpenAIEmbeddings\n",
"\n",
" embedding = OpenAIEmbeddings()\n",
Expand All @@ -97,7 +97,7 @@
"Example:\n",
"```python\n",
" import elasticsearch\n",
" from langchain_community.vectorstores import ElasticsearchStore\n",
" from langchain_elasticsearch import ElasticsearchStore\n",
"\n",
" es_client= elasticsearch.Elasticsearch(\n",
" hosts=[\"http://localhost:9200\"],\n",
Expand Down Expand Up @@ -137,7 +137,7 @@
"\n",
"Example:\n",
"```python\n",
" from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
" from langchain_elasticsearch import ElasticsearchStore\n",
" from langchain_openai import OpenAIEmbeddings\n",
"\n",
" embedding = OpenAIEmbeddings()\n",
Expand Down Expand Up @@ -202,7 +202,7 @@
},
"outputs": [],
"source": [
"from langchain_community.vectorstores import ElasticsearchStore\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_openai import OpenAIEmbeddings"
]
},
Expand Down Expand Up @@ -817,7 +817,7 @@
"source": [
"from typing import Dict\n",
"\n",
"from langchain.docstore.document import Document\n",
"from langchain_core.documents import Document\n",
"\n",
"\n",
"def custom_document_builder(hit: Dict) -> Document:\n",
Expand Down Expand Up @@ -902,7 +902,7 @@
"\n",
"```python\n",
"\n",
"from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"\n",
"db = ElasticsearchStore(\n",
" es_url=\"http://localhost:9200\",\n",
Expand Down Expand Up @@ -936,7 +936,7 @@
"\n",
"```python\n",
"\n",
"from langchain_community.vectorstores.elasticsearch import ElasticsearchStore\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"\n",
"db = ElasticsearchStore(\n",
" es_url=\"http://localhost:9200\",\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/modules/data_connection/indexing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@
"outputs": [],
"source": [
"from langchain.indexes import SQLRecordManager, index\n",
"from langchain_community.vectorstores import ElasticsearchStore\n",
"from langchain_core.documents import Document\n",
"from langchain_elasticsearch import ElasticsearchStore\n",
"from langchain_openai import OpenAIEmbeddings"
]
},
Expand Down
1 change: 1 addition & 0 deletions libs/partners/elasticsearch/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__pycache__
21 changes: 21 additions & 0 deletions libs/partners/elasticsearch/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 LangChain, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
60 changes: 60 additions & 0 deletions libs/partners/elasticsearch/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests

# Default target executed when no arguments are given to make.
all: help

install:
poetry install

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
integration_test integration_tests: TEST_FILE=tests/integration_tests/

test tests integration_test integration_tests:
poetry run pytest $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/elasticsearch --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_elasticsearch
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
poetry run ruff .
poetry run ruff format $(PYTHON_FILES) --diff
poetry run ruff --select I $(PYTHON_FILES)
mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

format format_diff:
poetry run ruff format $(PYTHON_FILES)
poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
poetry run codespell --toml pyproject.toml

spell_fix:
poetry run codespell --toml pyproject.toml -w

check_imports: $(shell find langchain_elasticsearch -name '*.py')
poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
@echo '----'
@echo 'check_imports - check imports'
@echo 'format - run code formatters'
@echo 'lint - run linters'
@echo 'test - run unit tests'
@echo 'tests - run unit tests'
@echo 'test TEST_FILE=<test_file> - run all tests in file'
29 changes: 29 additions & 0 deletions libs/partners/elasticsearch/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# langchain-elasticsearch

This package contains the LangChain integration with Elasticsearch.

## Installation

```bash
pip install -U langchain-elasticsearch
```

TODO document how to get id and key

## Usage

The `ElasticsearchStore` class exposes the connection to the Pinecone vector store.

```python
from langchain_elasticsearch import ElasticsearchStore

embeddings = ... # use a LangChain Embeddings class

vectorstore = ElasticsearchStore(
es_cloud_id="your-cloud-id",
es_api_key="your-api-key",
index_name="your-index-name",
embeddings=embeddings,
)
```

17 changes: 17 additions & 0 deletions libs/partners/elasticsearch/langchain_elasticsearch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from langchain_elasticsearch.chat_history import ElasticsearchChatMessageHistory
from langchain_elasticsearch.embeddings import ElasticsearchEmbeddings
from langchain_elasticsearch.vectorstores import (
ApproxRetrievalStrategy,
ElasticsearchStore,
ExactRetrievalStrategy,
SparseRetrievalStrategy,
)

__all__ = [
"ApproxRetrievalStrategy",
"ElasticsearchChatMessageHistory",
"ElasticsearchEmbeddings",
"ElasticsearchStore",
"ExactRetrievalStrategy",
"SparseRetrievalStrategy",
]
82 changes: 82 additions & 0 deletions libs/partners/elasticsearch/langchain_elasticsearch/_utilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from enum import Enum
from typing import List, Union

import numpy as np

Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]


class DistanceStrategy(str, Enum):
"""Enumerator of the Distance strategies for calculating distances
between vectors."""

EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
MAX_INNER_PRODUCT = "MAX_INNER_PRODUCT"
DOT_PRODUCT = "DOT_PRODUCT"
JACCARD = "JACCARD"
COSINE = "COSINE"


def maximal_marginal_relevance(
query_embedding: np.ndarray,
embedding_list: list,
lambda_mult: float = 0.5,
k: int = 4,
) -> List[int]:
"""Calculate maximal marginal relevance."""
if min(k, len(embedding_list)) <= 0:
return []
if query_embedding.ndim == 1:
query_embedding = np.expand_dims(query_embedding, axis=0)
similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
most_similar = int(np.argmax(similarity_to_query))
idxs = [most_similar]
selected = np.array([embedding_list[most_similar]])
while len(idxs) < min(k, len(embedding_list)):
best_score = -np.inf
idx_to_add = -1
similarity_to_selected = cosine_similarity(embedding_list, selected)
for i, query_score in enumerate(similarity_to_query):
if i in idxs:
continue
redundant_score = max(similarity_to_selected[i])
equation_score = (
lambda_mult * query_score - (1 - lambda_mult) * redundant_score
)
if equation_score > best_score:
best_score = equation_score
idx_to_add = i
idxs.append(idx_to_add)
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
return idxs


def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices."""
if len(X) == 0 or len(Y) == 0:
return np.array([])

X = np.array(X)
Y = np.array(Y)
if X.shape[1] != Y.shape[1]:
raise ValueError(
f"Number of columns in X and Y must be the same. X has shape {X.shape} "
f"and Y has shape {Y.shape}."
)
try:
import simsimd as simd # type: ignore

X = np.array(X, dtype=np.float32)
Y = np.array(Y, dtype=np.float32)
Z = 1 - simd.cdist(X, Y, metric="cosine")
if isinstance(Z, float):
return np.array([Z])
return Z
except ImportError:
X_norm = np.linalg.norm(X, axis=1)
Y_norm = np.linalg.norm(Y, axis=1)
# Ignore divide by zero errors run time warnings as those are handled below.
with np.errstate(divide="ignore", invalid="ignore"):
similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
return similarity
Loading

0 comments on commit 5ab69f9

Please sign in to comment.