Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
lkuligin authored Dec 2, 2024
2 parents 81a2fdc + b33d7b6 commit 1a1d787
Show file tree
Hide file tree
Showing 22 changed files with 1,276 additions and 741 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/_lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ jobs:
# It doesn't matter how you change it, any change will cause a cache-bust.
working-directory: ${{ inputs.working-directory }}
run: |
poetry install --with lint,typing
poetry install --with lint,typing --all-extras
- name: Install langchain editable
working-directory: ${{ inputs.working-directory }}
Expand All @@ -88,7 +88,6 @@ jobs:
${{ env.WORKDIR }}/.mypy_cache
key: mypy-lint-${{ runner.os }}-${{ runner.arch }}-py${{ matrix.python-version }}-${{ inputs.working-directory }}-${{ hashFiles(format('{0}/poetry.lock', inputs.working-directory)) }}


- name: Analysing the code with our lint
working-directory: ${{ inputs.working-directory }}
run: |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class DocumentAIWarehouseRetriever(BaseRetriever):
If nothing is provided, all documents in the project will be searched."""
qa_size_limit: int = 5
"""The limit on the number of documents returned."""
client: "DocumentServiceClient" = None #: :meta private:
client: "DocumentServiceClient" = None # type:ignore[assignment] #: :meta private:

@model_validator(mode="before")
@classmethod
Expand Down
7 changes: 7 additions & 0 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""Path to the credentials file."""
token_path: Path = Path.home() / ".credentials" / "token.json"
"""Path to the token file."""
credentials: Any = None
"""Your own google credentials created via your own mechanism"""
folder_id: Optional[str] = None
"""The folder id to load from."""
document_ids: Optional[List[str]] = None
Expand Down Expand Up @@ -276,6 +278,11 @@ def _load_credentials(self) -> Any:
if self.token_path.exists():
creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

if self.credentials:
# use whatever was passed to us
creds = self.credentials
return creds

if not creds or not creds.valid:
if creds and creds.expired and creds.refresh_token:
creds.refresh(Request())
Expand Down
2 changes: 1 addition & 1 deletion libs/community/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-google-community"
version = "2.0.2"
version = "2.0.3"
description = "An integration package connecting miscellaneous Google's products and LangChain"
authors = []
readme = "README.md"
Expand Down
7 changes: 7 additions & 0 deletions libs/genai/langchain_google_genai/_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import Any, Dict
from urllib.parse import urlparse

import filetype # type: ignore[import]
import requests
from google.ai.generativelanguage_v1beta.types import Part

Expand Down Expand Up @@ -87,7 +88,13 @@ def load_part(self, image_string: str) -> Part:
raise ValueError(msg)

inline_data: Dict[str, Any] = {"data": bytes_}

mime_type, _ = mimetypes.guess_type(image_string)
if not mime_type:
kind = filetype.guess(bytes_)
if kind:
mime_type = kind.mime

if mime_type:
inline_data["mime_type"] = mime_type

Expand Down
13 changes: 12 additions & 1 deletion libs/genai/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion libs/genai/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-google-genai"
version = "2.0.5"
version = "2.0.6"
description = "An integration package connecting Google's genai package and LangChain"
authors = []
readme = "README.md"
Expand All @@ -15,6 +15,7 @@ python = ">=3.9,<4.0"
langchain-core = ">=0.3.15,<0.4"
google-generativeai = "^0.8.0"
pydantic = ">=2,<3"
filetype = "^1.2.0"

[tool.poetry.group.test]
optional = true
Expand Down
27 changes: 27 additions & 0 deletions libs/genai/tests/integration_tests/test_chat_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,33 @@ def test_chat_google_genai_invoke_multimodal() -> None:
assert len(chunk.content.strip()) > 0


def test_chat_google_genai_invoke_multimodal_by_url() -> None:
    """Multimodal invoke/stream where the image part is supplied as a URL."""
    prompt_text = "Guess what's in this picture! You have 3 guesses."
    picture_url = "https://picsum.photos/seed/picsum/200/300"
    messages: list = [
        HumanMessage(
            content=[
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": picture_url},
            ]
        ),
    ]
    llm = ChatGoogleGenerativeAI(model=_VISION_MODEL)

    # Non-streaming invocation should produce a non-empty string answer.
    response = llm.invoke(messages)
    assert isinstance(response.content, str)
    assert len(response.content.strip()) > 0

    # Streaming should also yield non-empty string chunks.
    for chunk in llm.stream(messages):
        print(chunk)  # noqa: T201
        assert isinstance(chunk.content, str)
        assert len(chunk.content.strip()) > 0


def test_chat_google_genai_invoke_multimodal_multiple_messages() -> None:
messages: list = [
HumanMessage(content="Hi there"),
Expand Down
175 changes: 85 additions & 90 deletions libs/vertexai/langchain_google_vertexai/_anthropic_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
from pydantic import BaseModel

if TYPE_CHECKING:
from anthropic.types import RawMessageStreamEvent # type: ignore
from anthropic.types import (
RawMessageStreamEvent, # type: ignore[unused-ignore, import-not-found]
)

_message_type_lookups = {
"human": "user",
Expand All @@ -57,6 +59,74 @@ def _format_image(image_url: str) -> Dict:
}


def _format_message_anthropic(
    message: Union[HumanMessage, AIMessage],
) -> Optional[Dict[str, Any]]:
    """Convert a LangChain human/AI message into an Anthropic message dict.

    Args:
        message: The message to convert; its role is derived from
            ``message.type`` via ``_message_type_lookups``.

    Returns:
        A ``{"role": ..., "content": [...]}`` dict, or ``None`` when the
        message carries no usable content (an empty/whitespace-only string),
        because the Anthropic API rejects empty text blocks.

    Raises:
        ValueError: If the content is neither ``str`` nor ``list``, or a dict
            content block lacks a ``"type"`` key.
    """
    role = _message_type_lookups[message.type]
    content: List[Dict[str, Any]] = []

    if isinstance(message.content, str):
        if not message.content.strip():
            # Whole message is empty text — drop it entirely.
            return None
        content.append({"type": "text", "text": message.content})
    elif isinstance(message.content, list):
        for block in message.content:
            if isinstance(block, str):
                # Only add non-empty strings for now as empty ones are not
                # accepted.
                # https://github.com/anthropics/anthropic-sdk-python/issues/461
                if not block.strip():
                    continue
                content.append({"type": "text", "text": block})

            if isinstance(block, dict):
                if "type" not in block:
                    raise ValueError("Dict content block must have a type key")

                # Rebuild the block, carrying over only the attributes that
                # apply to every Anthropic block type.
                new_block = {}

                for copy_attr in ["type", "cache_control"]:
                    if copy_attr in block:
                        new_block[copy_attr] = block[copy_attr]

                if block["type"] == "text":
                    text: str = block.get("text", "")
                    # Only add non-empty strings for now as empty ones are not
                    # accepted.
                    # https://github.com/anthropics/anthropic-sdk-python/issues/461
                    if text.strip():
                        new_block["text"] = text
                        content.append(new_block)
                    continue

                if block["type"] == "image_url":
                    # convert format
                    new_block["source"] = _format_image(block["image_url"]["url"])
                    content.append(new_block)
                    continue

                if block["type"] == "tool_use":
                    # If a tool_call with the same id as a tool_use content block
                    # exists, the tool_call is preferred.
                    if isinstance(message, AIMessage) and message.tool_calls:
                        is_unique = block["id"] not in [
                            tc["id"] for tc in message.tool_calls
                        ]
                        if not is_unique:
                            continue

                # all other block types
                content.append(block)
    else:
        raise ValueError("Message should be a str, list of str or list of dicts")

    # adding all tool calls
    if isinstance(message, AIMessage) and message.tool_calls:
        for tc in message.tool_calls:
            tu = cast(Dict[str, Any], _lc_tool_call_to_anthropic_tool_use_block(tc))
            content.append(tu)

    return {"role": role, "content": content}


def _format_messages_anthropic(
messages: List[BaseMessage],
) -> Tuple[Optional[str], List[Dict]]:
Expand All @@ -77,81 +147,11 @@ def _format_messages_anthropic(
system_message = message.content
continue

role = _message_type_lookups[message.type]
content: Union[str, List]

if not isinstance(message.content, str):
# parse as dict
assert isinstance(
message.content, list
), "Anthropic message content must be str or list of dicts"

# populate content
content = []
for item in message.content:
if isinstance(item, str):
content.append(
{
"type": "text",
"text": item,
}
)
elif isinstance(item, dict):
if "type" not in item:
raise ValueError("Dict content item must have a type key")
elif item["type"] == "image_url":
# convert format
source = _format_image(item["image_url"]["url"])
content.append(
{
"type": "image",
"source": source,
}
)
elif item["type"] == "tool_use":
# If a tool_call with the same id as a tool_use content block
# exists, the tool_call is preferred.
if isinstance(message, AIMessage) and item["id"] in [
tc["id"] for tc in message.tool_calls
]:
overlapping = [
tc
for tc in message.tool_calls
if tc["id"] == item["id"]
]
content.extend(
_lc_tool_calls_to_anthropic_tool_use_blocks(overlapping)
)
else:
item.pop("text", None)
content.append(item)
elif item["type"] == "text":
text = item.get("text", "")
# Only add non-empty strings for now as empty ones are not
# accepted.
# https://github.com/anthropics/anthropic-sdk-python/issues/461
if text.strip():
content.append({"type": "text", "text": text})
else:
content.append(item)
else:
raise ValueError(
f"Content items must be str or dict, instead was: {type(item)}"
)
elif isinstance(message, AIMessage) and message.tool_calls:
content = (
[]
if not message.content
else [{"type": "text", "text": message.content}]
)
# Note: Anthropic can't have invalid tool calls as presently defined,
# since the model already returns dicts args not JSON strings, and invalid
# tool calls are those with invalid JSON for args.
content += _lc_tool_calls_to_anthropic_tool_use_blocks(message.tool_calls)
else:
content = message.content
fm = _format_message_anthropic(message)
if not fm:
continue
formatted_messages.append(fm)

formatted_messages.append({"role": role, "content": content})
return system_message, formatted_messages


Expand Down Expand Up @@ -184,7 +184,7 @@ def _merge_messages(
"""Merge runs of human/tool messages into single human messages with content blocks.""" # noqa: E501
merged: list = []
for curr in messages:
curr = curr.copy(deep=True)
curr = curr.model_copy(deep=True)
if isinstance(curr, ToolMessage):
if isinstance(curr.content, list) and all(
isinstance(block, dict) and block.get("type") == "tool_result"
Expand Down Expand Up @@ -224,20 +224,15 @@ class _AnthropicToolUse(TypedDict):
id: str


def _lc_tool_calls_to_anthropic_tool_use_blocks(
tool_calls: List[ToolCall],
) -> List[_AnthropicToolUse]:
blocks = []
for tool_call in tool_calls:
blocks.append(
_AnthropicToolUse(
type="tool_use",
name=tool_call["name"],
input=tool_call["args"],
id=cast(str, tool_call["id"]),
)
)
return blocks
def _lc_tool_call_to_anthropic_tool_use_block(
    tool_call: ToolCall,
) -> _AnthropicToolUse:
    """Convert a single LangChain tool call into an Anthropic tool_use block."""
    block: _AnthropicToolUse = {
        "type": "tool_use",
        "name": tool_call["name"],
        "input": tool_call["args"],
        "id": cast(str, tool_call["id"]),
    }
    return block


def _make_message_chunk_from_anthropic_event(
Expand Down
7 changes: 6 additions & 1 deletion libs/vertexai/langchain_google_vertexai/_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import re
from enum import Enum
from functools import cached_property
from typing import Dict, Optional, Union
from urllib.parse import urlparse

Expand Down Expand Up @@ -43,6 +44,10 @@ def __init__(
"""
self._project = project

    @cached_property
    def _storage_client(self) -> storage.Client:
        """Memoized GCS client bound to this instance's project.

        ``cached_property`` ensures the ``storage.Client`` is constructed at
        most once per instance instead of on every blob access.
        """
        return storage.Client(project=self._project)

def load_bytes(self, image_string: str) -> bytes:
"""Routes to the correct loader based on the image_string.
Expand Down Expand Up @@ -198,7 +203,7 @@ def _blob_from_gcs(self, gcs_uri: str) -> storage.Blob:
storage.Blob
"""

gcs_client = storage.Client(project=self._project)
gcs_client = self._storage_client
blob = storage.Blob.from_string(gcs_uri, gcs_client)
blob.reload(client=gcs_client)
return blob
Expand Down
2 changes: 1 addition & 1 deletion libs/vertexai/langchain_google_vertexai/model_garden.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def __init__(self, **kwargs: Any) -> None:

@model_validator(mode="after")
def validate_environment(self) -> Self:
from anthropic import ( # type: ignore
from anthropic import ( # type: ignore[unused-ignore, import-not-found]
AnthropicVertex,
AsyncAnthropicVertex,
)
Expand Down
Loading

0 comments on commit 1a1d787

Please sign in to comment.