Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adopt Secret to pgvector #402

Merged
merged 14 commits into from
Feb 14, 2024
13 changes: 12 additions & 1 deletion integrations/pgvector/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,23 @@ pip install pgvector-haystack

## Testing

TODO
Ensure that you have a PostgreSQL instance running with the `pgvector` extension installed. For a quick setup using Docker, run:
```
docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=postgres ankane/pgvector
```

Then run the tests:

```console
hatch run test
```

To run the coverage report:

```console
hatch run cov
```

## License

`pgvector-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
4 changes: 3 additions & 1 deletion integrations/pgvector/examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# git clone https://github.com/anakin87/neural-search-pills

import glob
import os

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
Expand All @@ -20,9 +21,10 @@
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"

# Initialize PgvectorDocumentStore
document_store = PgvectorDocumentStore(
connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
table_name="haystack_test",
embedding_dimension=768,
vector_function="cosine_similarity",
Expand Down
13 changes: 7 additions & 6 deletions integrations/pgvector/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ ignore = [
"S105", "S106", "S107",
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
# ignore function-call-in-default-argument
"B008",
]
unfixable = [
# Don't touch unused imports
Expand All @@ -156,23 +158,22 @@ ban-relative-imports = "parents"
# examples can contain "print" commands
"examples/**/*" = ["T201"]


[tool.coverage.run]
source_pkgs = ["src", "tests"]
source = ["haystack_integrations"]
branch = true
parallel = true


[tool.coverage.paths]
# Entry names in this table are only labels; this one is named after the pgvector package.
pgvector_haystack = ["src/haystack_integrations", "*/pgvector-haystack/src"]
tests = ["tests", "*/pgvector-haystack/tests"]

[tool.coverage.report]
omit = ["*/tests/*", "*/__init__.py"]
show_missing=true
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]


[[tool.mypy.overrides]]
module = [
"haystack.*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,8 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever":
    """
    Deserialize a retriever from its dictionary representation.

    :param data: Dictionary holding an ``init_parameters`` mapping whose ``document_store`` entry is the
        serialized form of a ``PgvectorDocumentStore``. The mapping is modified in place: the serialized
        store is replaced by the deserialized instance.
    :returns: The deserialized retriever.
    """
    init_params = data["init_parameters"]
    # Deserialize the nested store exactly once, through its own `from_dict`, so that any store-specific
    # handling (such as its `connection_string` entry) is applied before the retriever itself is built.
    init_params["document_store"] = PgvectorDocumentStore.from_dict(init_params["document_store"])
    return default_from_dict(cls, data)

@component.output_types(documents=List[Document])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import logging
from typing import Any, Dict, List, Literal, Optional

from haystack import default_to_dict
from haystack import default_from_dict, default_to_dict
from haystack.dataclasses.document import ByteStream, Document
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils.auth import Secret, deserialize_secrets_inplace
from haystack.utils.filters import convert
from psycopg import Error, IntegrityError, connect
from psycopg.abc import Query
Expand Down Expand Up @@ -69,7 +70,7 @@ class PgvectorDocumentStore:
def __init__(
self,
*,
connection_string: str,
connection_string: Secret = Secret.from_env_var("PG_CONN_STR"),
table_name: str = "haystack_documents",
embedding_dimension: int = 768,
vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity",
Expand All @@ -84,8 +85,8 @@ def __init__(
It is meant to be connected to a PostgreSQL database with the pgvector extension installed.
A specific table to store Haystack documents will be created if it doesn't exist yet.

:param connection_string: The connection string to use to connect to the PostgreSQL database.
e.g. "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
:param connection_string: The connection string to use to connect to the PostgreSQL database, defined as an
environment variable, e.g.: PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME"
:param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents".
:param embedding_dimension: The dimension of the embedding. Defaults to 768.
:param vector_function: The similarity function to use when searching for similar embeddings.
Expand Down Expand Up @@ -130,7 +131,7 @@ def __init__(
self.hnsw_index_creation_kwargs = hnsw_index_creation_kwargs or {}
self.hnsw_ef_search = hnsw_ef_search

connection = connect(connection_string)
connection = connect(self.connection_string.resolve_value())
connection.autocommit = True
self._connection = connection

Expand All @@ -151,7 +152,7 @@ def __init__(
def to_dict(self) -> Dict[str, Any]:
return default_to_dict(
self,
connection_string=self.connection_string,
connection_string=self.connection_string.to_dict(),
table_name=self.table_name,
embedding_dimension=self.embedding_dimension,
vector_function=self.vector_function,
Expand All @@ -162,6 +163,11 @@ def to_dict(self) -> Dict[str, Any]:
hnsw_ef_search=self.hnsw_ef_search,
)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PgvectorDocumentStore":
    """
    Build a document store from its dictionary representation.

    :param data: Dictionary holding an ``init_parameters`` mapping. The mapping is handed to
        ``deserialize_secrets_inplace`` for its ``connection_string`` key, so it is modified in place.
    :returns: The document store created by ``default_from_dict`` from the processed dictionary.
    """
    init_params = data["init_parameters"]
    deserialize_secrets_inplace(init_params, ["connection_string"])
    return default_from_dict(cls, data)

def _execute_sql(
self, sql_query: Query, params: Optional[tuple] = None, error_msg: str = "", cursor: Optional[Cursor] = None
):
Expand Down Expand Up @@ -221,15 +227,15 @@ def _handle_hnsw(self):
)
self._execute_sql(sql_set_hnsw_ef_search, error_msg="Could not set hnsw.ef_search")

index_esists = bool(
index_exists = bool(
self._execute_sql(
"SELECT 1 FROM pg_indexes WHERE tablename = %s AND indexname = %s",
(self.table_name, HNSW_INDEX_NAME),
"Could not check if HNSW index exists",
).fetchone()
)

if index_esists and not self.hnsw_recreate_index_if_exists:
if index_exists and not self.hnsw_recreate_index_if_exists:
logger.warning(
"HNSW index already exists and won't be recreated. "
"If you want to recreate it, pass 'hnsw_recreate_index_if_exists=True' to the "
Expand Down Expand Up @@ -373,7 +379,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D

return written_docs

def _from_haystack_to_pg_documents(self, documents: List[Document]) -> List[Dict[str, Any]]:
@staticmethod
def _from_haystack_to_pg_documents(documents: List[Document]) -> List[Dict[str, Any]]:
"""
Internal method to convert a list of Haystack Documents to a list of dictionaries that can be used to insert
documents into the PgvectorDocumentStore.
Expand All @@ -395,7 +402,8 @@ def _from_haystack_to_pg_documents(self, documents: List[Document]) -> List[Dict

return db_documents

def _from_pg_to_haystack_documents(self, documents: List[Dict[str, Any]]) -> List[Document]:
@staticmethod
def _from_pg_to_haystack_documents(documents: List[Dict[str, Any]]) -> List[Document]:
"""
Internal method to convert a list of dictionaries from pgvector to a list of Haystack Documents.
"""
Expand Down
6 changes: 4 additions & 2 deletions integrations/pgvector/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
import os

import pytest
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore


@pytest.fixture
def document_store(request):
    """
    Yield a PgvectorDocumentStore that uses a table named after the requesting test.

    The table is recreated when the store is built and deleted once the test has finished.
    """
    # No connection string is passed to the store: it is expected to fall back to the
    # PG_CONN_STR environment variable. NOTE: the variable stays set after the fixture ends.
    os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres"
    table_name = f"haystack_{request.node.name}"
    embedding_dimension = 768
    vector_function = "cosine_similarity"
    recreate_table = True
    search_strategy = "exact_nearest_neighbor"

    store = PgvectorDocumentStore(
        table_name=table_name,
        embedding_dimension=embedding_dimension,
        vector_function=vector_function,
        recreate_table=recreate_table,
        search_strategy=search_strategy,
    )

    yield store

    # Teardown: drop the per-test table.
    store.delete_table()
5 changes: 1 addition & 4 deletions integrations/pgvector/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def test_write_dataframe(self, document_store: PgvectorDocumentStore):

def test_init(self):
document_store = PgvectorDocumentStore(
connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
table_name="my_table",
embedding_dimension=512,
vector_function="l2_distance",
Expand All @@ -52,7 +51,6 @@ def test_init(self):
hnsw_ef_search=50,
)

assert document_store.connection_string == "postgresql://postgres:postgres@localhost:5432/postgres"
assert document_store.table_name == "my_table"
assert document_store.embedding_dimension == 512
assert document_store.vector_function == "l2_distance"
Expand All @@ -64,7 +62,6 @@ def test_init(self):

def test_to_dict(self):
document_store = PgvectorDocumentStore(
connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
table_name="my_table",
embedding_dimension=512,
vector_function="l2_distance",
Expand All @@ -78,7 +75,7 @@ def test_to_dict(self):
assert document_store.to_dict() == {
"type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore",
"init_parameters": {
"connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",
"connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"},
"table_name": "my_table",
"embedding_dimension": 512,
"vector_function": "l2_distance",
Expand Down
7 changes: 4 additions & 3 deletions integrations/pgvector/tests/test_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from unittest.mock import Mock

from haystack.dataclasses import Document
from haystack.utils.auth import EnvVarSecret
from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore

Expand Down Expand Up @@ -37,7 +38,7 @@ def test_to_dict(self, document_store: PgvectorDocumentStore):
"document_store": {
"type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore",
"init_parameters": {
"connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",
"connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"},
"table_name": "haystack_test_to_dict",
"embedding_dimension": 768,
"vector_function": "cosine_similarity",
Expand All @@ -62,7 +63,7 @@ def test_from_dict(self):
"document_store": {
"type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore",
"init_parameters": {
"connection_string": "postgresql://postgres:postgres@localhost:5432/postgres",
"connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"},
"table_name": "haystack_test_to_dict",
"embedding_dimension": 768,
"vector_function": "cosine_similarity",
Expand All @@ -83,7 +84,7 @@ def test_from_dict(self):
document_store = retriever.document_store

assert isinstance(document_store, PgvectorDocumentStore)
assert document_store.connection_string == "postgresql://postgres:postgres@localhost:5432/postgres"
assert isinstance(document_store.connection_string, EnvVarSecret)
assert document_store.table_name == "haystack_test_to_dict"
assert document_store.embedding_dimension == 768
assert document_store.vector_function == "cosine_similarity"
Expand Down