forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
community[minor]: Add
DuckDB
as a vectorstore (langchain-ai#18916)
DuckDB has a cosine similarity function along with list and array data types, which can be used as a vector store. - **Description:** The latest version of DuckDB features a cosine similarity function, which can be used with its support for list or array column types. This PR surfaces this functionality to langchain. - **Dependencies:** duckdb 0.10.0 - **Twitter handle:** @igocrite --------- Co-authored-by: Eugene Yurtsev <[email protected]> Co-authored-by: Bagatur <[email protected]>
- Loading branch information
1 parent
25e8868
commit 150204b
Showing
5 changed files
with
533 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# DuckDB\n", | ||
"This notebook shows how to use `DuckDB` as a vector store." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"! pip install duckdb" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import getpass\n", | ||
"import os\n", | ||
"\n", | ||
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.embeddings import OpenAIEmbeddings\n", | ||
"from langchain.vectorstores import DuckDB" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import TextLoader\n", | ||
"from langchain_text_splitters import CharacterTextSplitter\n", | ||
"\n", | ||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n", | ||
"documents = loader.load()\n", | ||
"\n", | ||
"documents = CharacterTextSplitter().split_documents(documents)\n", | ||
"embeddings = OpenAIEmbeddings()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"docsearch = DuckDB.from_documents(documents, embeddings)\n", | ||
"\n", | ||
"query = \"What did the president say about Ketanji Brown Jackson\"\n", | ||
"docs = docsearch.similarity_search(query)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print(docs[0].page_content)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
263 changes: 263 additions & 0 deletions
263
libs/community/langchain_community/vectorstores/duckdb.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
# mypy: disable-error-code=func-returns-value | ||
from __future__ import annotations | ||
|
||
import json | ||
import uuid | ||
from typing import Any, Iterable, List, Optional, Type | ||
|
||
from langchain_core.documents import Document | ||
from langchain_core.embeddings import Embeddings | ||
from langchain_core.vectorstores import VST, VectorStore | ||
|
||
|
||
class DuckDB(VectorStore):
    """`DuckDB` vector store.

    This class provides a vector store interface for adding texts and performing
    similarity searches using DuckDB.

    For more information about DuckDB, see: https://duckdb.org/

    This integration requires the `duckdb` Python package.
    You can install it with `pip install duckdb`.

    *Security Notice*: The default DuckDB configuration is not secure.

        By **default**, DuckDB can interact with files across the entire file system,
        which includes abilities to read, write, and list files and directories.
        It can also access some python variables present in the global namespace.

        When using this DuckDB vectorstore, we suggest that you initialize the
        DuckDB connection with a secure configuration.

        For example, you can set `enable_external_access` to `false` in the connection
        configuration to disable external access to the DuckDB connection.

        You can view the DuckDB configuration options here:

        https://duckdb.org/docs/configuration/overview.html

        Please review other relevant security considerations in the DuckDB
        documentation. (e.g., "autoinstall_known_extensions": "false",
        "autoload_known_extensions": "false")

        See https://python.langchain.com/docs/security for more information.

    Args:
        connection: Optional DuckDB connection. When omitted, an in-memory
            connection with external access disabled is created.
        embedding: The embedding function or model to use for generating embeddings.
        vector_key: The column name for storing vectors. Defaults to `embedding`.
        id_key: The column name for storing unique identifiers. Defaults to `id`.
        text_key: The column name for storing text. Defaults to `text`.
        table_name: The name of the table to use for storing embeddings. Defaults to
            `vectorstore`.

    Example:
        .. code-block:: python

            import duckdb
            conn = duckdb.connect(database=':memory:',
                config={
                    # Sample configuration to restrict some DuckDB capabilities
                    # List is not exhaustive. Please review DuckDB documentation.
                    "enable_external_access": "false",
                    "autoinstall_known_extensions": "false",
                    "autoload_known_extensions": "false"
                }
            )
            embedding_function = ... # Define or import your embedding function here
            # All constructor arguments are keyword-only.
            vector_store = DuckDB(connection=conn, embedding=embedding_function)
            vector_store.add_texts(['text1', 'text2'])
            result = vector_store.similarity_search('text1')
    """

    def __init__(
        self,
        *,
        connection: Optional[Any] = None,
        embedding: Embeddings,
        vector_key: str = "embedding",
        id_key: str = "id",
        text_key: str = "text",
        table_name: str = "vectorstore",
    ):
        """Initialize with DuckDB connection and setup for vector storage.

        Raises:
            ImportError: If the `duckdb` package is not installed.
            ValueError: If `embedding` is None.
        """
        try:
            import duckdb
        except ImportError as e:
            raise ImportError(
                "Could not import duckdb package. "
                "Please install it with `pip install duckdb`."
            ) from e
        self.duckdb = duckdb
        self._embedding = embedding
        self._vector_key = vector_key
        self._id_key = id_key
        self._text_key = text_key
        self._table_name = table_name

        if self._embedding is None:
            raise ValueError("An embedding function or model must be provided.")

        if connection is None:
            import warnings

            warnings.warn(
                "No DuckDB connection provided. A new connection will be created. "
                "This connection is running in memory and no data will be persisted. "
                "To persist data, specify `connection=duckdb.connect(...)` when using "
                "the API. Please review the documentation of the vectorstore for "
                "security recommendations on configuring the connection."
            )
            # Fallback connection: in-memory, with external access disabled.
            connection = self.duckdb.connect(
                database=":memory:", config={"enable_external_access": "false"}
            )

        self._connection = connection
        self._ensure_table()
        self._table = self._connection.table(self._table_name)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Returns the embedding object used by the vector store."""
        return self._embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed the texts and insert one row per text into the table.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
                Texts without a corresponding (or with an empty) entry are
                stored with NULL metadata.
            kwargs: Additional parameters including optional 'ids' to associate
                with the texts.

        Returns:
            List of ids of the added texts.

        Raises:
            ValueError: If fewer ids than texts are supplied.
        """
        # Materialize once: `texts` may be a one-shot iterator (e.g. a generator)
        # and is needed for id generation, embedding and insertion.
        texts = list(texts)

        # Use caller-supplied ids when given, otherwise generate fresh UUIDs.
        ids = kwargs.pop("ids", None) or [str(uuid.uuid4()) for _ in texts]
        if len(ids) < len(texts):
            # Fail before touching the table rather than after a partial insert.
            raise ValueError(
                f"Got {len(ids)} ids for {len(texts)} texts; "
                "each text needs its own id."
            )

        embeddings = self._embedding.embed_documents(texts)
        # Table name is an identifier and cannot be bound as a parameter; the
        # row values themselves are passed as bound parameters.
        insert_sql = f"INSERT INTO {self._table_name} VALUES (?,?,?,?)"
        for idx, text in enumerate(texts):
            # Serialize metadata if present, else default to None
            metadata = (
                json.dumps(metadatas[idx])
                if metadatas and idx < len(metadatas)
                else None
            )
            self._connection.execute(
                insert_sql, [ids[idx], text, embeddings[idx], metadata]
            )
        return ids

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Performs a similarity search for a given query string.

        Rows are ranked by `list_cosine_similarity` between the stored vector
        column and the query embedding, highest similarity first.

        Args:
            query: The query string to search for.
            k: The number of similar texts to return.

        Returns:
            A list of Documents most similar to the query.
        """
        embedding = self._embedding.embed_query(query)  # type: ignore
        list_cosine_similarity = self.duckdb.FunctionExpression(
            "list_cosine_similarity",
            self.duckdb.ColumnExpression(self._vector_key),
            self.duckdb.ConstantExpression(embedding),
        )
        docs = (
            self._table.select(
                self.duckdb.StarExpression(exclude=[]),
                list_cosine_similarity.alias("similarity"),
            )
            .order("similarity desc")
            .limit(k)
            # Drop the helper score and the raw vectors from the result frame.
            .select(
                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
            )
            .fetchdf()
        )
        return [
            Document(
                page_content=text,
                # Only non-empty JSON strings are decoded; NULL/missing metadata
                # becomes an empty dict.
                metadata=json.loads(raw_metadata)
                if isinstance(raw_metadata, str) and raw_metadata
                else {},
            )
            for text, raw_metadata in zip(docs[self._text_key], docs["metadata"])
        ]

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> DuckDB:
        """Creates an instance of DuckDB and populates it with texts and
        their embeddings.

        Note that the `vector_key` and `table_name` defaults used here differ
        from the constructor defaults.

        Args:
            texts: List of strings to add to the vector store.
            embedding: The embedding function or model to use for generating embeddings.
            metadatas: Optional list of metadata dictionaries associated with the texts.
            **kwargs: Additional keyword arguments including:
                - connection: DuckDB connection. If not provided, a new connection will
                  be created.
                - vector_key: The column name for storing vectors. Default "vector".
                - id_key: The column name for storing unique identifiers. Default "id".
                - text_key: The column name for storing text. Defaults to "text".
                - table_name: The name of the table to use for storing embeddings.
                    Defaults to "embeddings".
                - ids: Optional ids to associate with the texts.

        Returns:
            An instance of DuckDB with the provided texts and their embeddings added.
        """
        # Pull the constructor options out of kwargs so that only the remaining
        # options (e.g. `ids`) are forwarded to `add_texts`.
        connection = kwargs.pop("connection", None)
        vector_key = kwargs.pop("vector_key", "vector")
        id_key = kwargs.pop("id_key", "id")
        text_key = kwargs.pop("text_key", "text")
        table_name = kwargs.pop("table_name", "embeddings")

        # Create an instance of DuckDB
        instance = DuckDB(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            table_name=table_name,
        )
        # Add texts and their embeddings to the DuckDB vector store
        instance.add_texts(texts, metadatas=metadatas, **kwargs)

        return instance

    def _ensure_table(self) -> None:
        """Ensures the table for storing embeddings exists."""
        # Identifiers are interpolated directly into the DDL, so the table and
        # column names must come from trusted configuration, not user input.
        create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {self._table_name} (
            {self._id_key} VARCHAR PRIMARY KEY,
            {self._text_key} VARCHAR,
            {self._vector_key} FLOAT[],
            metadata VARCHAR
        )
        """
        self._connection.execute(create_table_sql)
Oops, something went wrong.