Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ashleyxuu committed Dec 23, 2023
1 parent 8019aa5 commit 1581d2b
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 33 deletions.
88 changes: 62 additions & 26 deletions docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
"source": [
"# BigQueryVectorSearch\n",
"> **BigQueryVectorSearch**:\n",
"BigQuery vector search lets you use GoogleSQL to do semantic search, using vector indexes for fast but approximate results, or using brute force for exact results.\n",
"BigQuery vector search lets you use GoogleSQL to do semantic\n",
"search, using vector indexes for fast but approximate results,\n",
"or using brute force for exact results.\n",
"\n",
"\n",
"This tutorial illustrates how to work with an end-to-end data and embedding management system in LangChain, and provide scalable semantic search in BigQuery."
"This tutorial illustrates how to work with an end-to-end data\n",
"and embedding management system in LangChain, and how to provide\n",
"scalable semantic search in BigQuery."
]
},
{
Expand Down Expand Up @@ -45,7 +49,24 @@
},
"outputs": [],
"source": [
"! pip install google-cloud-aiplatform langchain==0.0.316 google-cloud-bigquery pydantic==1.10.8 typing-inspect==0.8.0 typing_extensions==4.5.0 pandas openai==0.28.1 tiktoken datasets google-api-python-client pypdf faiss-cpu transformers config --upgrade --user"
"! pip install \\\n",
" google-cloud-aiplatform \\\n",
" langchain==0.0.316 \\\n",
" google-cloud-bigquery \\\n",
" pydantic==1.10.8 \\\n",
" typing-inspect==0.8.0 \\\n",
" typing_extensions==4.5.0 \\\n",
" pandas \\\n",
" openai==0.28.1 \\\n",
" tiktoken \\\n",
" datasets \\\n",
" google-api-python-client \\\n",
" pypdf \\\n",
" faiss-cpu \\\n",
" transformers \\\n",
" config \\\n",
" --upgrade \\\n",
" --user"
]
},
{
Expand All @@ -54,7 +75,10 @@
"id": "v40bB_GMcr9f"
},
"source": [
"**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top."
"**Colab only:** Uncomment the following cell to restart the\n",
"kernel or use the button to restart the kernel. For Vertex\n",
"AI Workbench you can restart the terminal using the button\n",
"on top."
]
},
{
Expand All @@ -65,7 +89,8 @@
},
"outputs": [],
"source": [
"# # Automatically restart kernel after installs so that your environment can access the new packages\n",
"# # Automatically restart kernel after installs so that your\n",
"# # environment can access the new packages\n",
"# import IPython\n",
"\n",
"# app = IPython.Application.instance()\n",
Expand All @@ -88,7 +113,8 @@
"If you don't know your project ID, try the following:\n",
"* Run `gcloud config list`.\n",
"* Run `gcloud projects list`.\n",
"* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)."
"* See the support page:\n",
"[Locate the project ID](https://support.google.com/googleapi/answer/7014113)."
]
},
{
Expand All @@ -109,7 +135,10 @@
"source": [
"#### Set the region\n",
"\n",
"You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)."
"You can also change the `REGION` variable used by BigQuery.\n",
"Learn more about \n",
"[BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)."
]
},
{
Expand All @@ -127,8 +156,11 @@
"source": [
"### Authenticating your notebook environment\n",
"\n",
"- If you are using **Colab** to run this notebook, uncomment the cell below and continue.\n",
"- If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)."
"- If you are using **Colab** to run this notebook, uncomment the cell\n",
"below and continue.\n",
"- If you are using **Vertex AI Workbench**, check out the setup\n",
"instructions\n",
"[here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)."
]
},
{
Expand All @@ -138,6 +170,7 @@
"outputs": [],
"source": [
"from google.colab import auth as google_auth\n",
"\n",
"google_auth.authenticate_user()"
]
},
Expand Down Expand Up @@ -219,11 +252,11 @@
"]\n",
"\n",
"store = BigQueryVectorSearch(\n",
" embedding,\n",
" project_id=PROJECT_ID,\n",
" dataset_name=\"<your_dataset>\",\n",
" table_name=\"<your_table>\",\n",
" location=REGION)"
" embedding,\n",
" project_id=PROJECT_ID,\n",
" dataset_name=\"<your_dataset>\",\n",
" table_name=\"<your_table>\",\n",
" location=REGION)"
]
},
{
Expand All @@ -244,16 +277,16 @@
"DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN_DISTANCE\n",
"\n",
"bq_vector_search = BigQueryVectorSearch(\n",
" project_id=PROJECT_ID,\n",
" dataset_name=\"your_dataset\",\n",
" table_name=\"<your_table>\",\n",
" # Column {content_field} must be of STRING type\n",
" content_field=\"<your_content>\",\n",
" # Column {text_embedding_field} must be of ARRAY<FLOAT64> type\n",
" text_embedding_field=\"<your_embedding>\",\n",
" embedding=embedding,\n",
" distance_strategy=DEFAULT_DISTANCE_STRATEGY,\n",
" location=REGION)"
" project_id=PROJECT_ID,\n",
" dataset_name=\"your_dataset\",\n",
" table_name=\"<your_table>\",\n",
" # Column {content_field} must be of STRING type\n",
" content_field=\"<your_content>\",\n",
" # Column {text_embedding_field} must be of ARRAY<FLOAT64> type\n",
" text_embedding_field=\"<your_embedding>\",\n",
" embedding=embedding,\n",
" distance_strategy=DEFAULT_DISTANCE_STRATEGY,\n",
" location=REGION)"
]
},
{
Expand Down Expand Up @@ -311,7 +344,7 @@
"outputs": [],
"source": [
"query_vector = embedding.embed_query(query)\n",
"docs = store.similarity_search_by_vector(query_vector, k=2)"
"docs = store.similarity_search_by_vector(query_vector,k=2)"
]
},
{
Expand All @@ -327,7 +360,10 @@
"metadata": {},
"outputs": [],
"source": [
"docs = store.similarity_search_by_vector(query_vector, filter={\"float_t\": 1.23})"
"docs = store.similarity_search_by_vector(\n",
" query_vector,\n",
" filter={\"float_t\": 1.23}\n",
")"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Type

import numpy as np
from langchain.docstore.document import Document
from langchain.schema.embeddings import Embeddings
from langchain.schema.vectorstore import VectorStore
from langchain.vectorstores.utils import maximal_marginal_relevance
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from langchain_community.vectorstores.utils import DistanceStrategy

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,20 @@
import uuid

import pytest
from google.cloud import bigquery

from langchain_community.vectorstores.bigquery_vector_search import BigQueryVectorSearch
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

TEST_TABLE_NAME = "langchain_test_table"

@pytest.fixture(scope="class")
def store(request: pytest.FixtureRequest) -> BigQueryVectorSearch:
"""BigQueryVectorStore tests context."""
from google.cloud import bigquery
TestBigQueryVectorStore.store = BigQueryVectorSearch(
dataset_name=TestBigQueryVectorStore.dataset_name
embedding=FakeEmbeddings,
dataset_name=TestBigQueryVectorStore.dataset_name,
table_name=TEST_TABLE_NAME
)
TestBigQueryVectorStore.store.add_texts(
TestBigQueryVectorStore.texts, TestBigQueryVectorStore.metadatas
Expand Down Expand Up @@ -70,7 +74,7 @@ def test_semantic_search(self, store: BigQueryVectorSearch) -> None:
def test_semantic_search_filter_fruits(self, store: BigQueryVectorSearch) -> None:
"""Test on semantic similarity with metadata filter."""
docs = store.similarity_search(
"food", metadata_filter="JSON_VALUE(metadata,'$.kind') = 'fruit'"
"food", filter={"kind":"fruit"}
)
kinds = [d.metadata["kind"] for d in docs]
assert "fruit" in kinds
Expand All @@ -80,7 +84,7 @@ def test_semantic_search_filter_fruits(self, store: BigQueryVectorSearch) -> Non
def test_get_doc_by_filter(self, store: BigQueryVectorSearch) -> None:
"""Test on document retrieval with metadata filter."""
docs = store.get_documents(
metadata_filter="JSON_VALUE(metadata,'$.kind') = 'fruit'"
filter={"kind":"fruit"}
)
kinds = [d.metadata["kind"] for d in docs]
assert "fruit" in kinds
Expand Down

0 comments on commit 1581d2b

Please sign in to comment.