Commit
Merge branch 'main' into vi-test
vishah02 authored Nov 28, 2024
2 parents e26010b + 7f704aa commit 4043df5
Showing 49 changed files with 2,421 additions and 1,716 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/_compile_integration_test.yml
@@ -20,10 +20,10 @@ jobs:
strategy:
matrix:
python-version:
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"
name: "poetry run pytest -m compile tests/integration_tests #${{ matrix.python-version }}"
steps:
- uses: actions/checkout@v4

4 changes: 2 additions & 2 deletions .github/workflows/_lint.yml
@@ -33,8 +33,8 @@ jobs:
# Starting new jobs is also relatively slow,
# so linting on fewer versions makes CI faster.
python-version:
- "3.8"
- "3.11"
- "3.9"
- "3.12"
steps:
- uses: actions/checkout@v4

2 changes: 2 additions & 0 deletions .github/workflows/_release.yml
@@ -236,6 +236,8 @@ jobs:
packages-dir: ${{ inputs.working-directory }}/dist/
verbose: true
print-hash: true
# Temp workaround since attestations are on by default as of gh-action-pypi-publish v1.11.0
attestations: false

mark-release:
needs:

2 changes: 1 addition & 1 deletion .github/workflows/_test.yml
@@ -24,10 +24,10 @@ jobs:
strategy:
matrix:
python-version:
- "3.8"
- "3.9"
- "3.10"
- "3.11"
- "3.12"
name: "make test #${{ matrix.python-version }}"
steps:
- uses: actions/checkout@v4

2 changes: 2 additions & 0 deletions .github/workflows/_test_release.yml
@@ -98,3 +98,5 @@ jobs:
# This is *only for CI use* and is *extremely dangerous* otherwise!
# https://github.com/pypa/gh-action-pypi-publish#tolerating-release-package-file-duplicates
skip-existing: true
# Temp workaround since attestations are on by default as of gh-action-pypi-publish v1.11.0
attestations: false

@@ -333,9 +333,9 @@ def _create_search_query(
full_query = f"""{embeddings_query}
{select_clause}
FROM VECTOR_SEARCH(
-TABLE `{self.full_table_id}`,
+(SELECT * FROM `{self.full_table_id}` WHERE {where_filter_expr}),
"{self.embedding_field}",
-(SELECT row_num, {self.embedding_field} from embeddings),
+(SELECT row_num, {self.embedding_field} FROM embeddings),
distance_type => "{self.distance_type}",
top_k => {k}
)
@@ -346,7 +346,6 @@
FROM (
{full_query}
) AS result
-WHERE {where_filter_expr}
ORDER BY row_num, score
"""
return full_query_wrapper

7 changes: 7 additions & 0 deletions libs/community/langchain_google_community/drive.py
@@ -27,6 +27,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""Path to the credentials file."""
token_path: Path = Path.home() / ".credentials" / "token.json"
"""Path to the token file."""
credentials: Any = None
"""Your own google credentials created via your own mechanism"""
folder_id: Optional[str] = None
"""The folder id to load from."""
document_ids: Optional[List[str]] = None
@@ -276,6 +278,11 @@ def _load_credentials(self) -> Any:
if self.token_path.exists():
creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

if self.credentials:
# use whatever was passed to us
creds = self.credentials
return creds

if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
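A minimal usage sketch of the `credentials` field added above — the folder ID, service-account file, and scope below are placeholders, not values from this commit:

```python
# Hedged sketch: pass pre-built Google credentials directly to the loader
# instead of relying on credentials.json / token.json on disk.
from google.oauth2 import service_account

from langchain_google_community import GoogleDriveLoader

creds = service_account.Credentials.from_service_account_file(
    "service-account.json",  # placeholder path
    scopes=["https://www.googleapis.com/auth/drive.readonly"],  # assumed scope
)

loader = GoogleDriveLoader(
    folder_id="your-folder-id",  # placeholder
    credentials=creds,  # new field: _load_credentials() returns these as-is
)
docs = loader.load()
```
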
61 changes: 60 additions & 1 deletion libs/community/langchain_google_community/vertex_ai_search.py
@@ -4,7 +4,6 @@
export PROJECT_ID=... - set to your Google Cloud project ID
export DATA_STORE_ID=... - the ID of the search engine to use for the test
"""

from __future__ import annotations

import json
@@ -16,6 +15,7 @@
from google.protobuf.json_format import MessageToDict
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.load import Serializable, load
from langchain_core.retrievers import BaseRetriever
from langchain_core.tools import BaseTool
@@ -279,6 +279,23 @@ class VertexAISearchRetriever(BaseRetriever, _BaseVertexAISearchRetriever):
https://cloud.google.com/generative-ai-app-builder/docs/boost-search-results
https://cloud.google.com/generative-ai-app-builder/docs/reference/rest/v1beta/BoostSpec
"""
custom_embedding: Optional[Embeddings] = None
"""Custom embedding model for the retriever. (Bring your own embedding)
It needs to match the embedding model that was used to embed docs in the datastore.
It needs to be a langchain embedding VertexAIEmbeddings(project="{PROJECT}")
If you provide an embedding model, you also need to provide a ranking_expression and
a custom_embedding_field_path.
https://cloud.google.com/generative-ai-app-builder/docs/bring-embeddings
"""
custom_embedding_field_path: Optional[str] = None
""" The field path for the custom embedding used in the Vertex AI datastore schema.
"""
custom_embedding_ratio: Optional[float] = 0.0
"""Controls the ranking of results. Value should be between 0 and 1.
It will generate the ranking_expression in the following manner:
"{custom_embedding_ratio} * dotProduct({custom_embedding_field_path}) +
{1 - custom_embedding_ratio} * relevance_score"
"""

_client: SearchServiceClient = PrivateAttr()
_serving_config: str = PrivateAttr()
@@ -384,6 +401,46 @@ def _create_search_request(self, query: str) -> SearchRequest:
else:
content_search_spec = None

if (
self.custom_embedding is not None
or self.custom_embedding_field_path is not None
):
if self.custom_embedding is None:
raise ValueError(
"Please provide a custom embedding model if you provide a "
"custom_embedding_field_path."
)
if self.custom_embedding_field_path is None:
raise ValueError(
"Please provide a custom_embedding_field_path if you provide a "
"custom embedding model."
)
if self.custom_embedding_ratio is None:
raise ValueError(
"Please provide a custom_embedding_ratio if you provide a "
"custom embedding model or a custom_embedding_field_path."
)
if not 0 <= self.custom_embedding_ratio <= 1:
raise ValueError(
"Custom embedding ratio must be between 0 and 1 "
f"when using custom embeddings. Got {self.custom_embedding_ratio}"
)
embedding_vector = SearchRequest.EmbeddingSpec.EmbeddingVector(
field_path=self.custom_embedding_field_path,
vector=self.custom_embedding.embed_query(query),
)
embedding_spec = SearchRequest.EmbeddingSpec(
embedding_vectors=[embedding_vector]
)
ranking_expression = (
f"{self.custom_embedding_ratio} * "
f"dotProduct({self.custom_embedding_field_path}) + "
f"{1 - self.custom_embedding_ratio} * relevance_score"
)
else:
embedding_spec = None
ranking_expression = None

return SearchRequest(
query=query,
filter=self.filter,
@@ -397,6 +454,8 @@ def _create_search_request(self, query: str) -> SearchRequest:
boost_spec=SearchRequest.BoostSpec(**self.boost_spec)
if self.boost_spec
else None,
embedding_spec=embedding_spec,
ranking_expression=ranking_expression,
)

def _get_relevant_documents(
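A hedged sketch of the bring-your-own-embedding parameters introduced above. The project, data store, model name, and field path are placeholders, and `VertexAIEmbeddings` (from langchain-google-vertexai) is one example of a compatible LangChain embedding, not a requirement of this diff:

```python
# Sketch only: all IDs and names below are placeholders. The embedding model
# must match the one used to embed documents into custom_embedding_field_path
# in the Vertex AI Search datastore.
from langchain_google_community import VertexAISearchRetriever
from langchain_google_vertexai import VertexAIEmbeddings  # assumed dependency

embeddings = VertexAIEmbeddings(model_name="text-embedding-004", project="my-project")

retriever = VertexAISearchRetriever(
    project_id="my-project",
    data_store_id="my-data-store",
    custom_embedding=embeddings,
    custom_embedding_field_path="embedding_field",
    custom_embedding_ratio=0.5,
)
# With ratio 0.5 the generated ranking expression is
# "0.5 * dotProduct(embedding_field) + 0.5 * relevance_score",
# matching the assertion in the new unit tests later in this diff.
docs = retriever.invoke("my query")
```
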
2 changes: 1 addition & 1 deletion libs/community/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-google-community"
version = "2.0.1"
version = "2.0.3"
description = "An integration package connecting miscellaneous Google's products and LangChain"
authors = []
readme = "README.md"

102 changes: 102 additions & 0 deletions libs/community/tests/unit_tests/test_vertex_ai_search.py
@@ -5,6 +5,7 @@
from google.auth import credentials as ga_credentials
from google.cloud.discoveryengine_v1beta import Document as DiscoveryEngineDocument
from google.cloud.discoveryengine_v1beta.types import SearchRequest, SearchResponse
from langchain_core.embeddings import FakeEmbeddings

from langchain_google_community.vertex_ai_search import VertexAISearchRetriever

@@ -313,3 +314,104 @@ def test_convert_unstructured_search_response_extractive_answers(
assert "relevance_score" not in documents[1].metadata
assert "previous_segments" not in documents[1].metadata
assert "next_segments" not in documents[1].metadata


def test_custom_embedding_with_valid_values() -> None:
"""
Test with a valid custom embedding model and field path.
"""
# Mock the SearchServiceClient to avoid real network calls
with mock.patch(
"google.cloud.discoveryengine_v1beta.SearchServiceClient"
) as mock_client_class:
mock_client = mock_client_class.return_value
mock_client.serving_config_path.return_value = "serving_config_value"
embeddings = FakeEmbeddings(size=100)

retriever = VertexAISearchRetriever(
project_id="project_id_value",
data_store_id="data_store_id_value",
location_id="location_id_value",
serving_config_id="serving_config_id_value",
credentials=ga_credentials.AnonymousCredentials(),
filter="filter_value",
order_by="title desc",
canonical_filter="true",
custom_embedding=embeddings,
custom_embedding_field_path="embedding_field",
custom_embedding_ratio=0.5,
)

# Assert that serving_config_path was called with the correct arguments
mock_client.serving_config_path.assert_called_once_with(
project="project_id_value",
location="location_id_value",
data_store="data_store_id_value",
serving_config="serving_config_id_value",
)

search_request = retriever._create_search_request(query="query_value")
assert search_request.embedding_spec is not None
assert search_request.ranking_expression == (
"0.5 * dotProduct(embedding_field) + 0.5 * relevance_score"
)


def test_custom_embedding_with_invalid_ratio() -> None:
"""
Test with an invalid custom embedding ratio.
"""
with mock.patch(
"google.cloud.discoveryengine_v1beta.SearchServiceClient"
) as mock_client_class:
mock_client = mock_client_class.return_value
mock_client.serving_config_path.return_value = "serving_config_value"
embeddings = FakeEmbeddings(size=100)
retriever = VertexAISearchRetriever(
project_id="mock-project",
data_store_id="mock-data-store",
custom_embedding=embeddings,
custom_embedding_field_path="embedding_field",
custom_embedding_ratio=1.5, # Invalid ratio
)
with pytest.raises(ValueError):
retriever._create_search_request(query="query_value")


def test_custom_embedding_with_missing_field_path() -> None:
"""
Test with a missing custom embedding field path.
"""
with mock.patch(
"google.cloud.discoveryengine_v1beta.SearchServiceClient"
) as mock_client_class:
mock_client = mock_client_class.return_value
mock_client.serving_config_path.return_value = "serving_config_value"
embeddings = FakeEmbeddings(size=100)
retriever = VertexAISearchRetriever(
project_id="mock-project",
data_store_id="mock-data-store",
custom_embedding=embeddings,
custom_embedding_ratio=0.5,  # custom_embedding_field_path intentionally omitted
)
with pytest.raises(ValueError):
retriever._create_search_request(query="query_value")


def test_custom_embedding_with_missing_model() -> None:
"""
Test with a missing custom embedding model.
"""
with mock.patch(
"google.cloud.discoveryengine_v1beta.SearchServiceClient"
) as mock_client_class:
mock_client = mock_client_class.return_value
mock_client.serving_config_path.return_value = "serving_config_value"
retriever = VertexAISearchRetriever(
project_id="mock-project",
data_store_id="mock-data-store",
custom_embedding_field_path="embedding_field",
custom_embedding_ratio=0.5,  # custom_embedding intentionally omitted
)
with pytest.raises(ValueError):
retriever._create_search_request(query="query_value")

8 changes: 0 additions & 8 deletions libs/genai/README.md
@@ -8,12 +8,6 @@ This package contains the LangChain integrations for Gemini through their genera
pip install -U langchain-google-genai
```

### Image utilities
To use image utility methods, like loading images from GCS urls, install with extras group 'images':

```bash
pip install -e "langchain-google-genai[images]"
```

## Chat Models

@@ -60,9 +54,7 @@ The value of `image_url` can be any of the following:

- A public image URL
- An accessible gcs file (e.g., "gcs://path/to/file.png")
- A local file path
- A base64 encoded image (e.g., `data:image/png;base64,abcd124`)
- A PIL image



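For context, a hedged sketch of the multimodal usage that the remaining bullet points describe — the model name and image URL are placeholders, not part of this diff:

```python
# Sketch only: model name and image URL are placeholders. After this change,
# local file paths and PIL images are no longer listed as supported inputs.
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
message = HumanMessage(
    content=[
        {"type": "text", "text": "Describe this image."},
        {"type": "image_url", "image_url": "https://example.com/image.png"},
    ]
)
print(llm.invoke([message]).content)
```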