Skip to content

Commit

Permalink
Merge branch 'main' into vi-test
Browse files Browse the repository at this point in the history
  • Loading branch information
vishah02 authored Nov 28, 2024
2 parents e26010b + 7f704aa commit 4043df5
Show file tree
Hide file tree
Showing 49 changed files with 2,421 additions and 1,716 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_compile_integration_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:
strategy:
matrix:
python-version:
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"
name: "poetry run pytest -m compile tests/integration_tests #${{ matrix.python-version }}"
steps:
- uses: actions/checkout@v4
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/_lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ jobs:
# Starting new jobs is also relatively slow,
# so linting on fewer versions makes CI faster.
python-version:
- "3.8"
- "3.11"
- "3.9"
- "3.12"
steps:
- uses: actions/checkout@v4

Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ jobs:
packages-dir: ${{ inputs.working-directory }}/dist/
verbose: true
print-hash: true
# Temp workaround since attestations are on by default as of gh-action-pypi-publish v1.11.0
attestations: false

mark-release:
needs:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ jobs:
strategy:
matrix:
python-version:
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"
name: "make test #${{ matrix.python-version }}"
steps:
- uses: actions/checkout@v4
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/_test_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,5 @@ jobs:
# This is *only for CI use* and is *extremely dangerous* otherwise!
# https://github.com/pypa/gh-action-pypi-publish#tolerating-release-package-file-duplicates
skip-existing: true
# Temp workaround since attestations are on by default as of gh-action-pypi-publish v1.11.0
attestations: false
Original file line number Diff line number Diff line change
Expand Up @@ -333,9 +333,9 @@ def _create_search_query(
full_query = f"""{embeddings_query}
{select_clause}
FROM VECTOR_SEARCH(
TABLE `{self.full_table_id}`,
(SELECT * FROM `{self.full_table_id}` WHERE {where_filter_expr}),
"{self.embedding_field}",
(SELECT row_num, {self.embedding_field} from embeddings),
(SELECT row_num, {self.embedding_field} FROM embeddings),
distance_type => "{self.distance_type}",
top_k => {k}
)
Expand All @@ -346,7 +346,6 @@ def _create_search_query(
FROM (
{full_query}
) AS result
WHERE {where_filter_expr}
ORDER BY row_num, score
"""
return full_query_wrapper
Expand Down
7 changes: 7 additions & 0 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""Path to the credentials file."""
token_path: Path = Path.home() / ".credentials" / "token.json"
"""Path to the token file."""
credentials: Any = None
"""Your own google credentials created via your own mechanism"""
folder_id: Optional[str] = None
"""The folder id to load from."""
document_ids: Optional[List[str]] = None
Expand Down Expand Up @@ -276,6 +278,11 @@ def _load_credentials(self) -> Any:
if self.token_path.exists():
creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

if self.credentials:
# use whatever was passed to us
creds = self.credentials
return creds

if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
Expand Down
61 changes: 60 additions & 1 deletion libs/community/langchain_google_community/vertex_ai_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
export PROJECT_ID=... - set to your Google Cloud project ID
export DATA_STORE_ID=... - the ID of the search engine to use for the test
"""

from __future__ import annotations

import json
Expand All @@ -16,6 +15,7 @@
from google.protobuf.json_format import MessageToDict
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.load import Serializable, load
from langchain_core.retrievers import BaseRetriever
from langchain_core.tools import BaseTool
Expand Down Expand Up @@ -279,6 +279,23 @@ class VertexAISearchRetriever(BaseRetriever, _BaseVertexAISearchRetriever):
https://cloud.google.com/generative-ai-app-builder/docs/boost-search-results
https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1beta/BoostSpec
"""
custom_embedding: Optional[Embeddings] = None
"""Custom embedding model for the retriever. (Bring your own embedding)
It needs to match the embedding model that was used to embed docs in the datastore.
It needs to be a LangChain embedding model, e.g.
VertexAIEmbeddings(project="{PROJECT}").
If you provide an embedding model, you also need to provide a
custom_embedding_field_path; the ranking expression is generated for you from
custom_embedding_ratio.
https://cloud.google.com/generative-ai-app-builder/docs/bring-embeddings
"""
custom_embedding_field_path: Optional[str] = None
""" The field path for the custom embedding used in the Vertex AI datastore schema.
"""
custom_embedding_ratio: Optional[float] = 0.0
"""Controls the ranking of results. Value should be between 0 and 1.
It will generate the ranking_expression in the following manner:
"{custom_embedding_ratio} * dotProduct({custom_embedding_field_path}) +
{1 - custom_embedding_ratio} * relevance_score"
"""

_client: SearchServiceClient = PrivateAttr()
_serving_config: str = PrivateAttr()
Expand Down Expand Up @@ -384,6 +401,46 @@ def _create_search_request(self, query: str) -> SearchRequest:
else:
content_search_spec = None

if (
self.custom_embedding is not None
or self.custom_embedding_field_path is not None
):
if self.custom_embedding is None:
raise ValueError(
"Please provide a custom embedding model if you provide a "
"custom_embedding_field_path."
)
if self.custom_embedding_field_path is None:
raise ValueError(
"Please provide a custom_embedding_field_path if you provide a "
"custom embedding model."
)
if self.custom_embedding_ratio is None:
raise ValueError(
"Please provide a custom_embedding_ratio if you provide a "
"custom embedding model or a custom_embedding_field_path."
)
if not 0 <= self.custom_embedding_ratio <= 1:
raise ValueError(
"Custom embedding ratio must be between 0 and 1 "
f"when using custom embeddings. Got {self.custom_embedding_ratio}"
)
embedding_vector = SearchRequest.EmbeddingSpec.EmbeddingVector(
field_path=self.custom_embedding_field_path,
vector=self.custom_embedding.embed_query(query),
)
embedding_spec = SearchRequest.EmbeddingSpec(
embedding_vectors=[embedding_vector]
)
ranking_expression = (
f"{self.custom_embedding_ratio} * "
f"dotProduct({self.custom_embedding_field_path}) + "
f"{1 - self.custom_embedding_ratio} * relevance_score"
)
else:
embedding_spec = None
ranking_expression = None

return SearchRequest(
query=query,
filter=self.filter,
Expand All @@ -397,6 +454,8 @@ def _create_search_request(self, query: str) -> SearchRequest:
boost_spec=SearchRequest.BoostSpec(**self.boost_spec)
if self.boost_spec
else None,
embedding_spec=embedding_spec,
ranking_expression=ranking_expression,
)

def _get_relevant_documents(
Expand Down
2 changes: 1 addition & 1 deletion libs/community/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-google-community"
version = "2.0.1"
version = "2.0.3"
description = "An integration package connecting miscellaneous Google's products and LangChain"
authors = []
readme = "README.md"
Expand Down
102 changes: 102 additions & 0 deletions libs/community/tests/unit_tests/test_vertex_ai_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from google.auth import credentials as ga_credentials
from google.cloud.discoveryengine_v1beta import Document as DiscoveryEngineDocument
from google.cloud.discoveryengine_v1beta.types import SearchRequest, SearchResponse
from langchain_core.embeddings import FakeEmbeddings

from langchain_google_community.vertex_ai_search import VertexAISearchRetriever

Expand Down Expand Up @@ -313,3 +314,104 @@ def test_convert_unstructured_search_response_extractive_answers(
assert "relevance_score" not in documents[1].metadata
assert "previous_segments" not in documents[1].metadata
assert "next_segments" not in documents[1].metadata


def test_custom_embedding_with_valid_values() -> None:
    """
    Check that a complete custom-embedding setup is reflected in the request.

    A retriever configured with an embedding model, a field path and a ratio of
    0.5 must produce a search request carrying an embedding spec and the
    ranking expression derived from that ratio.
    """
    expected_ranking = "0.5 * dotProduct(embedding_field) + 0.5 * relevance_score"

    # Patch SearchServiceClient so that no real network calls are made.
    with mock.patch(
        "google.cloud.discoveryengine_v1beta.SearchServiceClient"
    ) as client_cls:
        client = client_cls.return_value
        client.serving_config_path.return_value = "serving_config_value"
        fake_embeddings = FakeEmbeddings(size=100)

        retriever = VertexAISearchRetriever(
            project_id="project_id_value",
            data_store_id="data_store_id_value",
            location_id="location_id_value",
            serving_config_id="serving_config_id_value",
            credentials=ga_credentials.AnonymousCredentials(),
            filter="filter_value",
            order_by="title desc",
            canonical_filter="true",
            custom_embedding=fake_embeddings,
            custom_embedding_field_path="embedding_field",
            custom_embedding_ratio=0.5,
        )

        # The serving config path must be resolved exactly once, from the
        # identifiers given to the retriever.
        client.serving_config_path.assert_called_once_with(
            project="project_id_value",
            location="location_id_value",
            data_store="data_store_id_value",
            serving_config="serving_config_id_value",
        )

        request = retriever._create_search_request(query="query_value")
        assert request.embedding_spec is not None
        assert request.ranking_expression == expected_ranking


def test_custom_embedding_with_invalid_ratio() -> None:
    """
    Test that a custom embedding ratio outside [0, 1] is rejected.

    The retriever is built with a ratio of 1.5; creating the search request
    must raise the range-check ``ValueError``. The error is matched by message
    so that an unrelated ``ValueError`` cannot satisfy the test.
    """
    # Mock the SearchServiceClient to avoid real network calls
    with mock.patch(
        "google.cloud.discoveryengine_v1beta.SearchServiceClient"
    ) as mock_client_class:
        mock_client = mock_client_class.return_value
        mock_client.serving_config_path.return_value = "serving_config_value"
        embeddings = FakeEmbeddings(size=100)
        retriever = VertexAISearchRetriever(
            project_id="mock-project",
            data_store_id="mock-data-store",
            custom_embedding=embeddings,
            custom_embedding_field_path="embedding_field",
            custom_embedding_ratio=1.5,  # Invalid ratio
        )
        with pytest.raises(
            ValueError, match="Custom embedding ratio must be between 0 and 1"
        ):
            retriever._create_search_request(query="query_value")


def test_custom_embedding_with_missing_field_path() -> None:
    """
    Test that an embedding model without a field path is rejected.

    The retriever gets a custom embedding model and a valid ratio but no
    ``custom_embedding_field_path``; creating the search request must raise the
    ``ValueError`` that asks for the field path. The error is matched by
    message so that an unrelated ``ValueError`` cannot satisfy the test.
    """
    # Mock the SearchServiceClient to avoid real network calls
    with mock.patch(
        "google.cloud.discoveryengine_v1beta.SearchServiceClient"
    ) as mock_client_class:
        mock_client = mock_client_class.return_value
        mock_client.serving_config_path.return_value = "serving_config_value"
        embeddings = FakeEmbeddings(size=100)
        retriever = VertexAISearchRetriever(
            project_id="mock-project",
            data_store_id="mock-data-store",
            custom_embedding=embeddings,
            custom_embedding_ratio=0.5,  # Valid ratio; the field path is missing
        )
        with pytest.raises(
            ValueError, match="Please provide a custom_embedding_field_path"
        ):
            retriever._create_search_request(query="query_value")


def test_custom_embedding_with_missing_model() -> None:
    """
    Test that a field path without an embedding model is rejected.

    The retriever gets a ``custom_embedding_field_path`` and a valid ratio but
    no custom embedding model; creating the search request must raise the
    ``ValueError`` that asks for the model. The error is matched by message so
    that an unrelated ``ValueError`` cannot satisfy the test.
    """
    # Mock the SearchServiceClient to avoid real network calls
    with mock.patch(
        "google.cloud.discoveryengine_v1beta.SearchServiceClient"
    ) as mock_client_class:
        mock_client = mock_client_class.return_value
        mock_client.serving_config_path.return_value = "serving_config_value"
        retriever = VertexAISearchRetriever(
            project_id="mock-project",
            data_store_id="mock-data-store",
            custom_embedding_field_path="embedding_field",
            custom_embedding_ratio=0.5,  # Valid ratio; the model is missing
        )
        with pytest.raises(
            ValueError, match="Please provide a custom embedding model"
        ):
            retriever._create_search_request(query="query_value")
8 changes: 0 additions & 8 deletions libs/genai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@ This package contains the LangChain integrations for Gemini through their genera
pip install -U langchain-google-genai
```

### Image utilities
To use image utility methods, like loading images from GCS urls, install with extras group 'images':

```bash
pip install -e "langchain-google-genai[images]"
```

## Chat Models

Expand Down Expand Up @@ -60,9 +54,7 @@ The value of `image_url` can be any of the following:

- A public image URL
- An accessible gcs file (e.g., "gcs://path/to/file.png")
- A local file path
- A base64 encoded image (e.g., `data:image/png;base64,abcd124`)
- A PIL image



Expand Down
Loading

0 comments on commit 4043df5

Please sign in to comment.