From 1581d2b61bf9d096177fc6acf92169eabbc28cd5 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Sat, 23 Dec 2023 00:55:25 +0000 Subject: [PATCH] address comments --- .../vectorstores/bigquery_vector_search.ipynb | 88 +++++++++++++------ .../vectorstores/bigquery_vector_search.py | 6 +- .../test_bigquery_vector_search.py | 12 ++- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb b/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb index 00a9a5ae50b61..04c09564c0698 100644 --- a/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb +++ b/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb @@ -8,10 +8,14 @@ "source": [ "# BigQueryVectorSearch\n", "> **BigQueryVectorSearch**:\n", - "BigQuery vector search lets you use GoogleSQL to do semantic search, using vector indexes for fast but approximate results, or using brute force for exact results.\n", + "BigQuery vector search lets you use GoogleSQL to do semantic\n", + "search, using vector indexes for fast but approximate results,\n", + "or using brute force for exact results.\n", "\n", "\n", - "This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain, and provide scalable semantic search in BigQuery." + "This tutorial illustrates how to work with an end-to-end data\n", + "and embedding management system in LangChain, and provide scalable\n", + "semantic search in BigQuery." ] }, { @@ -45,7 +49,24 @@ }, "outputs": [], "source": [ - "! pip install google-cloud-aiplatform langchain==0.0.316 google-cloud-bigquery pydantic==1.10.8 typing-inspect==0.8.0 typing_extensions==4.5.0 pandas openai==0.28.1 tiktoken datasets google-api-python-client pypdf faiss-cpu transformers config --upgrade --user" + "! 
pip install \\\n", + " google-cloud-aiplatform \\\n", + " langchain==0.0.316 \\\n", + " google-cloud-bigquery \\\n", + " pydantic==1.10.8 \\\n", + " typing-inspect==0.8.0 \\\n", + " typing_extensions==4.5.0 \\\n", + " pandas \\\n", + " openai==0.28.1 \\\n", + " tiktoken \\\n", + " datasets \\\n", + " google-api-python-client \\\n", + " pypdf \\\n", + " faiss-cpu \\\n", + " transformers \\\n", + " config \\\n", + " --upgrade \\\n", + " --user" ] }, { @@ -54,7 +75,10 @@ "id": "v40bB_GMcr9f" }, "source": [ - "**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top." + "**Colab only:** Uncomment the following cell to restart the\n", + "kernel or use the button to restart the kernel. For Vertex\n", + "AI Workbench you can restart the terminal using the button\n", + "on top." ] }, { @@ -65,7 +89,8 @@ }, "outputs": [], "source": [ - "# # Automatically restart kernel after installs so that your environment can access the new packages\n", + "# # Automatically restart kernel after installs so that your\n", + "# # environment can access the new packages\n", "# import IPython\n", "\n", "# app = IPython.Application.instance()\n", @@ -88,7 +113,8 @@ "If you don't know your project ID, try the following:\n", "* Run `gcloud config list`.\n", "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + "* See the support page:\n", + "[Locate the project ID](https://support.google.com/googleapi/answer/7014113)." ] }, { @@ -109,7 +135,10 @@ "source": [ "#### Set the region\n", "\n", - "You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." 
+ "You can also change the `REGION` variable\n", + "used by BigQuery.\n", + "Learn more about\n", + "[BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." ] }, { @@ -127,8 +156,11 @@ "source": [ "### Authenticating your notebook environment\n", "\n", - "- If you are using **Colab** to run this notebook, uncomment the cell below and continue.\n", - "- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)." + "- If you are using **Colab** to run this notebook, uncomment the cell\n", + "below and continue.\n", + "- If you are using **Vertex AI Workbench**, check out the setup\n", + "instructions\n", + "[here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)." ] }, { @@ -138,6 +170,7 @@ "outputs": [], "source": [ "from google.colab import auth as google_auth\n", + "\n", "google_auth.authenticate_user()" ] }, @@ -219,11 +252,11 @@ "]\n", "\n", "store = BigQueryVectorSearch(\n", - " embedding,\n", - " project_id=PROJECT_ID,\n", - " dataset_name=\"\",\n", - " table_name=\"\",\n", - " location=REGION)" + " embedding,\n", + " project_id=PROJECT_ID,\n", + " dataset_name=\"\",\n", + " table_name=\"\",\n", + " location=REGION)" ] }, { @@ -244,16 +277,16 @@ "DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN_DISTANCE\n", "\n", "bq_vector_search = BigQueryVectorSearch(\n", - " project_id=PROJECT_ID,\n", - " dataset_name=\"your_dataset\",\n", - " table_name=\"\",\n", - " # Column {content_field} must be of STRING type\n", - " content_field=\"\",\n", - " # Column {text_embedding_field} must be of ARRAY type\n", - " text_embedding_field=\"\",\n", - " embedding=embedding,\n", - " distance_strategy=DEFAULT_DISTANCE_STRATEGY,\n", - " location=REGION)" + " project_id=PROJECT_ID,\n", + " dataset_name=\"your_dataset\",\n", + " table_name=\"\",\n", + " # Column {content_field} must be of STRING type\n", + " 
content_field=\"\",\n", + " # Column {text_embedding_field} must be of ARRAY type\n", + " text_embedding_field=\"\",\n", + " embedding=embedding,\n", + " distance_strategy=DEFAULT_DISTANCE_STRATEGY,\n", + " location=REGION)" ] }, { @@ -311,7 +344,7 @@ "outputs": [], "source": [ "query_vector = embedding.embed_query(query)\n", - "docs = store.similarity_search_by_vector(query_vector, k=2)" + "docs = store.similarity_search_by_vector(query_vector,k=2)" ] }, { @@ -327,7 +360,10 @@ "metadata": {}, "outputs": [], "source": [ - "docs = store.similarity_search_by_vector(query_vector, filter={\"float_t\": 1.23})" + "docs = store.similarity_search_by_vector(\n", + " query_vector,\n", + " filter={\"float_t\": 1.23}\n", + ")" ] } ], diff --git a/libs/community/langchain_community/vectorstores/bigquery_vector_search.py b/libs/community/langchain_community/vectorstores/bigquery_vector_search.py index 2a320f7b82ba9..f6fc02aa10cd1 100644 --- a/libs/community/langchain_community/vectorstores/bigquery_vector_search.py +++ b/libs/community/langchain_community/vectorstores/bigquery_vector_search.py @@ -12,10 +12,10 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type import numpy as np -from langchain.docstore.document import Document -from langchain.schema.embeddings import Embeddings -from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore from langchain_community.vectorstores.utils import DistanceStrategy diff --git a/libs/community/tests/integration_tests/vectorstores/test_bigquery_vector_search.py b/libs/community/tests/integration_tests/vectorstores/test_bigquery_vector_search.py index 7419ab348b74f..bffc541b9d994 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_bigquery_vector_search.py +++ 
b/libs/community/tests/integration_tests/vectorstores/test_bigquery_vector_search.py @@ -8,16 +8,20 @@ import uuid import pytest -from google.cloud import bigquery from langchain_community.vectorstores.bigquery_vector_search import BigQueryVectorSearch +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings +TEST_TABLE_NAME = "langchain_test_table" @pytest.fixture(scope="class") def store(request: pytest.FixtureRequest) -> BigQueryVectorSearch: """BigQueryVectorStore tests context.""" + from google.cloud import bigquery TestBigQueryVectorStore.store = BigQueryVectorSearch( - dataset_name=TestBigQueryVectorStore.dataset_name + embedding=FakeEmbeddings, + dataset_name=TestBigQueryVectorStore.dataset_name, + table_name=TEST_TABLE_NAME ) TestBigQueryVectorStore.store.add_texts( TestBigQueryVectorStore.texts, TestBigQueryVectorStore.metadatas @@ -70,7 +74,7 @@ def test_semantic_search(self, store: BigQueryVectorSearch) -> None: def test_semantic_search_filter_fruits(self, store: BigQueryVectorSearch) -> None: """Test on semantic similarity with metadata filter.""" docs = store.similarity_search( - "food", metadata_filter="JSON_VALUE(metadata,'$.kind') = 'fruit'" + "food", filter={"kind":"fruit"} ) kinds = [d.metadata["kind"] for d in docs] assert "fruit" in kinds @@ -80,7 +84,7 @@ def test_semantic_search_filter_fruits(self, store: BigQueryVectorSearch) -> Non def test_get_doc_by_filter(self, store: BigQueryVectorSearch) -> None: """Test on document retrieval with metadata filter.""" docs = store.get_documents( - metadata_filter="JSON_VALUE(metadata,'$.kind') = 'fruit'" + filter={"kind":"fruit"} ) kinds = [d.metadata["kind"] for d in docs] assert "fruit" in kinds