From 613e4ecd871eb57ff514f947dd1767cc29ddfaf0 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 14 Feb 2024 10:43:40 +0100 Subject: [PATCH] Adopt `Secret` to pgvector (#402) * initial import * adding Secret support and fixing tests * completing docs * code formating * linting and typing * fixing tests * adding custom from_dict * adding test coverage * use deserialize_secrets_inplace() --- integrations/pgvector/README.md | 13 ++++++++- integrations/pgvector/examples/example.py | 4 ++- integrations/pgvector/pyproject.toml | 13 +++++---- .../pgvector/embedding_retriever.py | 5 ++-- .../pgvector/document_store.py | 28 ++++++++++++------- integrations/pgvector/tests/conftest.py | 6 ++-- .../pgvector/tests/test_document_store.py | 5 +--- integrations/pgvector/tests/test_retriever.py | 7 +++-- 8 files changed, 51 insertions(+), 30 deletions(-) diff --git a/integrations/pgvector/README.md b/integrations/pgvector/README.md index 277c732f4..a2d325c54 100644 --- a/integrations/pgvector/README.md +++ b/integrations/pgvector/README.md @@ -20,12 +20,23 @@ pip install pgvector-haystack ## Testing -TODO +Ensure that you have a PostgreSQL running with the `pgvector` extension. For a quick setup using Docker, run: +``` +docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=postgres ankane/pgvector +``` + +then run the tests: ```console hatch run test ``` +To run the coverage report: + +```console +hatch run cov +``` + ## License `pgvector-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/pgvector/examples/example.py b/integrations/pgvector/examples/example.py index 14c2cba60..764c915d1 100644 --- a/integrations/pgvector/examples/example.py +++ b/integrations/pgvector/examples/example.py @@ -11,6 +11,7 @@ # git clone https://github.com/anakin87/neural-search-pills import glob +import os from haystack import Pipeline from haystack.components.converters import MarkdownToDocument @@ -20,9 +21,10 @@ from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore +os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres" + # Initialize PgvectorDocumentStore document_store = PgvectorDocumentStore( - connection_string="postgresql://postgres:postgres@localhost:5432/postgres", table_name="haystack_test", embedding_dimension=768, vector_function="cosine_similarity", diff --git a/integrations/pgvector/pyproject.toml b/integrations/pgvector/pyproject.toml index 65ded967f..178d9f7e8 100644 --- a/integrations/pgvector/pyproject.toml +++ b/integrations/pgvector/pyproject.toml @@ -138,6 +138,8 @@ ignore = [ "S105", "S106", "S107", # Ignore complexity "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", + # ignore function-call-in-default-argument + "B008", ] unfixable = [ # Don't touch unused imports @@ -156,23 +158,22 @@ ban-relative-imports = "parents" # examples can contain "print" commands "examples/**/*" = ["T201"] + [tool.coverage.run] -source_pkgs = ["src", "tests"] +source = ["haystack_integrations"] branch = true parallel = true - -[tool.coverage.paths] -weaviate_haystack = ["src/haystack_integrations", "*/pgvector-haystack/src"] -tests = ["tests", "*/pgvector-haystack/tests"] - [tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing=true exclude_lines = [ "no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] + [[tool.mypy.overrides]] module = [ "haystack.*", diff --git a/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py b/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py index 26807e9bd..4b8df868b 100644 --- a/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py +++ b/integrations/pgvector/src/haystack_integrations/components/retrievers/pgvector/embedding_retriever.py @@ -68,9 +68,8 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "PgvectorEmbeddingRetriever": - data["init_parameters"]["document_store"] = default_from_dict( - PgvectorDocumentStore, data["init_parameters"]["document_store"] - ) + doc_store_params = data["init_parameters"]["document_store"] + data["init_parameters"]["document_store"] = PgvectorDocumentStore.from_dict(doc_store_params) return default_from_dict(cls, data) @component.output_types(documents=List[Document]) diff --git a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py index 097e86c7e..798c75276 100644 --- a/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py +++ b/integrations/pgvector/src/haystack_integrations/document_stores/pgvector/document_store.py @@ -4,10 +4,11 @@ import logging from typing import Any, Dict, List, Literal, Optional -from haystack import default_to_dict +from haystack import default_from_dict, default_to_dict from haystack.dataclasses.document import ByteStream, Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.utils.auth import Secret, deserialize_secrets_inplace from haystack.utils.filters import convert from psycopg import Error, IntegrityError, connect from psycopg.abc import Query @@ -69,7 +70,7 @@ class PgvectorDocumentStore: def __init__( self, *, - connection_string: str, + connection_string: Secret = Secret.from_env_var("PG_CONN_STR"), table_name: str = "haystack_documents", embedding_dimension: int = 768, vector_function: Literal["cosine_similarity", "inner_product", "l2_distance"] = "cosine_similarity", @@ -84,8 +85,8 @@ def __init__( It is meant to be connected to a PostgreSQL database with the pgvector extension installed. A specific table to store Haystack documents will be created if it doesn't exist yet. - :param connection_string: The connection string to use to connect to the PostgreSQL database. - e.g. "postgresql://USER:PASSWORD@HOST:PORT/DB_NAME" + :param connection_string: The connection string to use to connect to the PostgreSQL database, defined as an + environment variable, e.g.: PG_CONN_STR="postgresql://USER:PASSWORD@HOST:PORT/DB_NAME" :param table_name: The name of the table to use to store Haystack documents. Defaults to "haystack_documents". :param embedding_dimension: The dimension of the embedding. Defaults to 768. :param vector_function: The similarity function to use when searching for similar embeddings. @@ -130,7 +131,7 @@ def __init__( self.hnsw_index_creation_kwargs = hnsw_index_creation_kwargs or {} self.hnsw_ef_search = hnsw_ef_search - connection = connect(connection_string) + connection = connect(self.connection_string.resolve_value()) connection.autocommit = True self._connection = connection @@ -151,7 +152,7 @@ def __init__( def to_dict(self) -> Dict[str, Any]: return default_to_dict( self, - connection_string=self.connection_string, + connection_string=self.connection_string.to_dict(), table_name=self.table_name, embedding_dimension=self.embedding_dimension, vector_function=self.vector_function, @@ -162,6 +163,11 @@ def to_dict(self) -> Dict[str, Any]: hnsw_ef_search=self.hnsw_ef_search, ) + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "PgvectorDocumentStore": + deserialize_secrets_inplace(data["init_parameters"], ["connection_string"]) + return default_from_dict(cls, data) + def _execute_sql( self, sql_query: Query, params: Optional[tuple] = None, error_msg: str = "", cursor: Optional[Cursor] = None ): @@ -221,7 +227,7 @@ def _handle_hnsw(self): ) self._execute_sql(sql_set_hnsw_ef_search, error_msg="Could not set hnsw.ef_search") - index_esists = bool( + index_exists = bool( self._execute_sql( "SELECT 1 FROM pg_indexes WHERE tablename = %s AND indexname = %s", (self.table_name, HNSW_INDEX_NAME), @@ -229,7 +235,7 @@ def _handle_hnsw(self): ).fetchone() ) - if index_esists and not self.hnsw_recreate_index_if_exists: + if index_exists and not self.hnsw_recreate_index_if_exists: logger.warning( "HNSW index already exists and won't be recreated. " "If you want to recreate it, pass 'hnsw_recreate_index_if_exists=True' to the " @@ -373,7 +379,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D return written_docs - def _from_haystack_to_pg_documents(self, documents: List[Document]) -> List[Dict[str, Any]]: + @staticmethod + def _from_haystack_to_pg_documents(documents: List[Document]) -> List[Dict[str, Any]]: """ Internal method to convert a list of Haystack Documents to a list of dictionaries that can be used to insert documents into the PgvectorDocumentStore. @@ -395,7 +402,8 @@ def _from_haystack_to_pg_documents(self, documents: List[Document]) -> List[Dict return db_documents - def _from_pg_to_haystack_documents(self, documents: List[Dict[str, Any]]) -> List[Document]: + @staticmethod + def _from_pg_to_haystack_documents(documents: List[Dict[str, Any]]) -> List[Document]: """ Internal method to convert a list of dictionaries from pgvector to a list of Haystack Documents. """ diff --git a/integrations/pgvector/tests/conftest.py b/integrations/pgvector/tests/conftest.py index 743e8de14..068f2ac54 100644 --- a/integrations/pgvector/tests/conftest.py +++ b/integrations/pgvector/tests/conftest.py @@ -1,10 +1,12 @@ +import os + import pytest from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore @pytest.fixture def document_store(request): - connection_string = "postgresql://postgres:postgres@localhost:5432/postgres" + os.environ["PG_CONN_STR"] = "postgresql://postgres:postgres@localhost:5432/postgres" table_name = f"haystack_{request.node.name}" embedding_dimension = 768 vector_function = "cosine_similarity" @@ -12,13 +14,13 @@ def document_store(request): search_strategy = "exact_nearest_neighbor" store = PgvectorDocumentStore( - connection_string=connection_string, table_name=table_name, embedding_dimension=embedding_dimension, vector_function=vector_function, recreate_table=recreate_table, search_strategy=search_strategy, ) + yield store store.delete_table() diff --git a/integrations/pgvector/tests/test_document_store.py b/integrations/pgvector/tests/test_document_store.py index e8d9107d7..1e158f134 100644 --- a/integrations/pgvector/tests/test_document_store.py +++ b/integrations/pgvector/tests/test_document_store.py @@ -41,7 +41,6 @@ def test_write_dataframe(self, document_store: PgvectorDocumentStore): def test_init(self): document_store = PgvectorDocumentStore( - connection_string="postgresql://postgres:postgres@localhost:5432/postgres", table_name="my_table", embedding_dimension=512, vector_function="l2_distance", @@ -52,7 +51,6 @@ def test_init(self): hnsw_ef_search=50, ) - assert document_store.connection_string == "postgresql://postgres:postgres@localhost:5432/postgres" assert document_store.table_name == "my_table" assert document_store.embedding_dimension == 512 assert document_store.vector_function == "l2_distance" @@ -64,7 +62,6 @@ def test_init(self): def test_to_dict(self): document_store = PgvectorDocumentStore( - connection_string="postgresql://postgres:postgres@localhost:5432/postgres", table_name="my_table", embedding_dimension=512, vector_function="l2_distance", @@ -78,7 +75,7 @@ def test_to_dict(self): assert document_store.to_dict() == { "type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore", "init_parameters": { - "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", + "connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"}, "table_name": "my_table", "embedding_dimension": 512, "vector_function": "l2_distance", diff --git a/integrations/pgvector/tests/test_retriever.py b/integrations/pgvector/tests/test_retriever.py index cca6bbc9f..8eab10de5 100644 --- a/integrations/pgvector/tests/test_retriever.py +++ b/integrations/pgvector/tests/test_retriever.py @@ -4,6 +4,7 @@ from unittest.mock import Mock from haystack.dataclasses import Document +from haystack.utils.auth import EnvVarSecret from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore @@ -37,7 +38,7 @@ def test_to_dict(self, document_store: PgvectorDocumentStore): "document_store": { "type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore", "init_parameters": { - "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", + "connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"}, "table_name": "haystack_test_to_dict", "embedding_dimension": 768, "vector_function": "cosine_similarity", @@ -62,7 +63,7 @@ def test_from_dict(self): "document_store": { "type": "haystack_integrations.document_stores.pgvector.document_store.PgvectorDocumentStore", "init_parameters": { - "connection_string": "postgresql://postgres:postgres@localhost:5432/postgres", + "connection_string": {"env_vars": ["PG_CONN_STR"], "strict": True, "type": "env_var"}, "table_name": "haystack_test_to_dict", "embedding_dimension": 768, "vector_function": "cosine_similarity", @@ -83,7 +84,7 @@ def test_from_dict(self): document_store = retriever.document_store assert isinstance(document_store, PgvectorDocumentStore) - assert document_store.connection_string == "postgresql://postgres:postgres@localhost:5432/postgres" + assert isinstance(document_store.connection_string, EnvVarSecret) assert document_store.table_name == "haystack_test_to_dict" assert document_store.embedding_dimension == 768 assert document_store.vector_function == "cosine_similarity"