diff --git a/libs/vertexai/langchain_google_vertexai/__init__.py b/libs/vertexai/langchain_google_vertexai/__init__.py index be365bde..9f4b90ee 100644 --- a/libs/vertexai/langchain_google_vertexai/__init__.py +++ b/libs/vertexai/langchain_google_vertexai/__init__.py @@ -3,10 +3,21 @@ from langchain_google_vertexai.chat_models import ChatVertexAI from langchain_google_vertexai.embeddings import VertexAIEmbeddings from langchain_google_vertexai.functions_utils import PydanticFunctionsOutputParser -from langchain_google_vertexai.llms import VertexAI, VertexAIModelGarden +from langchain_google_vertexai.gemma import ( + GemmaChatLocalKaggle, + GemmaChatVertexAIModelGarden, + GemmaLocalKaggle, + GemmaVertexAIModelGarden, +) +from langchain_google_vertexai.llms import VertexAI +from langchain_google_vertexai.model_garden import VertexAIModelGarden __all__ = [ "ChatVertexAI", + "GemmaVertexAIModelGarden", + "GemmaChatVertexAIModelGarden", + "GemmaLocalKaggle", + "GemmaChatLocalKaggle", "VertexAIEmbeddings", "VertexAI", "VertexAIModelGarden", diff --git a/libs/vertexai/langchain_google_vertexai/_base.py b/libs/vertexai/langchain_google_vertexai/_base.py new file mode 100644 index 00000000..1b2a9846 --- /dev/null +++ b/libs/vertexai/langchain_google_vertexai/_base.py @@ -0,0 +1,287 @@ +from __future__ import annotations + +from concurrent.futures import Executor +from typing import Any, ClassVar, Dict, List, Optional + +import vertexai # type: ignore[import-untyped] +from google.api_core.client_options import ClientOptions +from google.cloud.aiplatform.gapic import ( + PredictionServiceAsyncClient, + PredictionServiceClient, +) +from google.cloud.aiplatform.models import Prediction +from google.protobuf import json_format +from google.protobuf.struct_pb2 import Value +from langchain_core.outputs import Generation, LLMResult +from langchain_core.pydantic_v1 import BaseModel, Field, root_validator +from vertexai.language_models import ( # type: ignore[import-untyped] + TextGenerationModel, +) +from vertexai.preview.language_models import ( # type: ignore[import-untyped] + ChatModel as PreviewChatModel, +) +from vertexai.preview.language_models import ( + CodeChatModel as PreviewCodeChatModel, +) + +from langchain_google_vertexai._enums import HarmBlockThreshold, HarmCategory +from langchain_google_vertexai._utils import ( + get_client_info, + is_codey_model, + is_gemini_model, +) + +_PALM_DEFAULT_MAX_OUTPUT_TOKENS = TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS +_PALM_DEFAULT_TEMPERATURE = 0.0 +_PALM_DEFAULT_TOP_P = 0.95 +_PALM_DEFAULT_TOP_K = 40 + + +class _VertexAIBase(BaseModel): + client: Any = None #: :meta private: + project: Optional[str] = None + "The default GCP project to use when making Vertex API calls." + location: str = "us-central1" + "The default location to use when making API calls." + request_parallelism: int = 5 + "The amount of parallelism allowed for requests issued to VertexAI models. " + "Default is 5." + max_retries: int = 6 + """The maximum number of retries to make when generating.""" + task_executor: ClassVar[Optional[Executor]] = Field(default=None, exclude=True) + stop: Optional[List[str]] = None + "Optional list of stop words to use when generating." + model_name: Optional[str] = None + "Underlying model name." + + +class _VertexAICommon(_VertexAIBase): + client_preview: Any = None #: :meta private: + model_name: str + "Underlying model name." + temperature: Optional[float] = None + "Sampling temperature, it controls the degree of randomness in token selection." 
+ max_output_tokens: Optional[int] = None + "Token limit determines the maximum amount of text output from one prompt." + top_p: Optional[float] = None + "Tokens are selected from most probable to least until the sum of their " + "probabilities equals the top-p value. Top-p is ignored for Codey models." + top_k: Optional[int] = None + "How the model selects tokens for output, the next token is selected from " + "among the top-k most probable tokens. Top-k is ignored for Codey models." + credentials: Any = Field(default=None, exclude=True) + "The default custom credentials (google.auth.credentials.Credentials) to use " + "when making API calls. If not provided, credentials will be ascertained from " + "the environment." + n: int = 1 + """How many completions to generate for each prompt.""" + streaming: bool = False + """Whether to stream the results or not.""" + safety_settings: Optional[Dict[HarmCategory, HarmBlockThreshold]] = None + """The default safety settings to use for all generations. + + For example: + + from langchain_google_vertexai import HarmBlockThreshold, HarmCategory + + safety_settings = { + HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE, + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_LOW_AND_ABOVE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + } + """ # noqa: E501 + + @property + def _llm_type(self) -> str: + return "vertexai" + + @property + def is_codey_model(self) -> bool: + return is_codey_model(self.model_name) + + @property + def _is_gemini_model(self) -> bool: + return is_gemini_model(self.model_name) + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Gets the identifying parameters.""" + return {**{"model_name": self.model_name}, **self._default_params} + + @property + def _default_params(self) -> Dict[str, Any]: + if self._is_gemini_model: + default_params = {} + else: + default_params = { + "temperature": _PALM_DEFAULT_TEMPERATURE, + "max_output_tokens": _PALM_DEFAULT_MAX_OUTPUT_TOKENS, + "top_p": _PALM_DEFAULT_TOP_P, + "top_k": _PALM_DEFAULT_TOP_K, + } + params = { + "temperature": self.temperature, + "max_output_tokens": self.max_output_tokens, + "candidate_count": self.n, + } + if not self.is_codey_model: + params.update( + { + "top_k": self.top_k, + "top_p": self.top_p, + } + ) + updated_params = {} + for param_name, param_value in params.items(): + default_value = default_params.get(param_name) + if param_value or default_value: + updated_params[param_name] = ( + param_value if param_value else default_value + ) + return updated_params + + @classmethod + def _init_vertexai(cls, values: Dict) -> None: + vertexai.init( + project=values.get("project"), + location=values.get("location"), + credentials=values.get("credentials"), + ) + return None + + def _prepare_params( + self, + stop: Optional[List[str]] = None, + stream: bool = False, + **kwargs: Any, + ) -> dict: + stop_sequences = stop or self.stop + params_mapping = {"n": "candidate_count"} + params = {params_mapping.get(k, k): v for k, v in kwargs.items()} + params = {**self._default_params, "stop_sequences": stop_sequences, **params} + if stream or self.streaming: + params.pop("candidate_count") + return params + + def get_num_tokens(self, text: str) -> int: + """Get the number of tokens present in the text. 
+
+        Useful for checking if an input will fit in a model's context window.
+
+        Args:
+            text: The string input to tokenize.
+
+        Returns:
+            The integer number of tokens in the text.
+        """
+        is_palm_chat_model = isinstance(
+            self.client_preview, PreviewChatModel
+        ) or isinstance(self.client_preview, PreviewCodeChatModel)
+        if is_palm_chat_model:
+            result = self.client_preview.start_chat().count_tokens(text)
+        else:
+            result = self.client_preview.count_tokens([text])
+
+        return result.total_tokens
+
+
+class _BaseVertexAIModelGarden(_VertexAIBase):
+    """Large language models served from Vertex AI Model Garden."""
+
+    async_client: Any = None  #: :meta private:
+    endpoint_id: str
+    "A name of an endpoint where the model has been deployed."
+    allowed_model_args: Optional[List[str]] = None
+    "Allowed optional args to be passed to the model."
+    prompt_arg: str = "prompt"
+    result_arg: Optional[str] = "generated_text"
+    "Set result_arg to None if output of the model is expected to be a string."
+    "Otherwise, if it's a dict, provide an argument that contains the result."
+    single_example_per_request: bool = True
+    "LLM endpoint currently serves only the first example in the request."
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that the python package exists in environment."""
+
+        if not values["project"]:
+            raise ValueError(
+                "A GCP project should be provided to run inference on Model Garden!"
+            )
+
+        client_options = ClientOptions(
+            api_endpoint=f"{values['location']}-aiplatform.googleapis.com"
+        )
+        client_info = get_client_info(module="vertex-ai-model-garden")
+        values["client"] = PredictionServiceClient(
+            client_options=client_options, client_info=client_info
+        )
+        values["async_client"] = PredictionServiceAsyncClient(
+            client_options=client_options, client_info=client_info
+        )
+        return values
+
+    @property
+    def endpoint_path(self) -> str:
+        return self.client.endpoint_path(
+            project=self.project, location=self.location, endpoint=self.endpoint_id
+        )
+
+    @property
+    def _llm_type(self) -> str:
+        return "vertexai_model_garden"
+
+    def _prepare_request(self, prompts: List[str], **kwargs: Any) -> List["Value"]:
+        instances = []
+        for prompt in prompts:
+            if self.allowed_model_args:
+                instance = {
+                    k: v for k, v in kwargs.items() if k in self.allowed_model_args
+                }
+            else:
+                instance = {}
+            instance[self.prompt_arg] = prompt
+            instances.append(instance)
+
+        predict_instances = [
+            json_format.ParseDict(instance_dict, Value()) for instance_dict in instances
+        ]
+        return predict_instances
+
+    def _parse_response(self, predictions: "Prediction") -> LLMResult:
+        generations: List[List[Generation]] = []
+        for result in predictions.predictions:
+            if isinstance(result, str):
+                generations.append([Generation(text=self._parse_prediction(result))])
+            else:
+                generations.append(
+                    [
+                        Generation(text=self._parse_prediction(prediction))
+                        for prediction in result
+                    ]
+                )
+        return LLMResult(generations=generations)
+
+    def _parse_prediction(self, prediction: Any) -> str:
+        if isinstance(prediction, str):
+            return prediction
+
+        if self.result_arg:
+            try:
+                return prediction[self.result_arg]
+            except KeyError:
+                if isinstance(prediction, str):
+                    error_desc = (
+                        "Provided non-None `result_arg` (result_arg="
+                        f"{self.result_arg}). But got prediction of type "
+                        f"{type(prediction)} instead of dict. Most probably, you "
+                        "need to set `result_arg=None` during VertexAIModelGarden "
+                        "initialization."
+                    )
+                    raise ValueError(error_desc)
+                else:
+                    raise ValueError(f"{self.result_arg} key not found in prediction!")
+
+        return prediction
diff --git a/libs/vertexai/langchain_google_vertexai/chat_models.py b/libs/vertexai/langchain_google_vertexai/chat_models.py
index 3f7f42d8..c7129a6d 100644
--- a/libs/vertexai/langchain_google_vertexai/chat_models.py
+++ b/libs/vertexai/langchain_google_vertexai/chat_models.py
@@ -49,6 +49,9 @@
     CodeChatModel as PreviewCodeChatModel,
 )
 
+from langchain_google_vertexai._base import (
+    _VertexAICommon,
+)
 from langchain_google_vertexai._image_utils import ImageBytesLoader
 from langchain_google_vertexai._utils import (
     get_generation_info,
@@ -58,9 +61,6 @@ from langchain_google_vertexai.functions_utils import (
     _format_tools_to_vertex_tool,
 )
-from langchain_google_vertexai.llms import (
-    _VertexAICommon,
-)
 
 logger = logging.getLogger(__name__)
diff --git a/libs/vertexai/langchain_google_vertexai/embeddings.py b/libs/vertexai/langchain_google_vertexai/embeddings.py
index 041c21fb..f7a9b97f 100644
--- a/libs/vertexai/langchain_google_vertexai/embeddings.py
+++ b/libs/vertexai/langchain_google_vertexai/embeddings.py
@@ -20,7 +20,7 @@
     TextEmbeddingModel,
 )
 
-from langchain_google_vertexai.llms import _VertexAICommon
+from langchain_google_vertexai._base import _VertexAICommon
 
 logger = logging.getLogger(__name__)
diff --git a/libs/vertexai/langchain_google_vertexai/gemma.py b/libs/vertexai/langchain_google_vertexai/gemma.py
new file mode 100644
index 00000000..5b90d0bc
--- /dev/null
+++ b/libs/vertexai/langchain_google_vertexai/gemma.py
@@ -0,0 +1,217 @@
+import os
+from typing import Any, Dict, List, Optional, cast
+
+from langchain_core.callbacks import (
+    AsyncCallbackManagerForLLMRun,
+    CallbackManagerForLLMRun,
+)
+from langchain_core.language_models.chat_models import (
+    BaseChatModel,
+)
+from langchain_core.language_models.llms import BaseLLM
+from langchain_core.messages import (
+    AIMessage,
+    BaseMessage,
+    HumanMessage,
+    SystemMessage,
+)
+from langchain_core.outputs import (
+    ChatGeneration,
+    ChatResult,
+    Generation,
+    LLMResult,
+)
+from langchain_core.pydantic_v1 import BaseModel, root_validator
+
+from langchain_google_vertexai._base import _BaseVertexAIModelGarden
+from langchain_google_vertexai.model_garden import VertexAIModelGarden
+
+USER_CHAT_TEMPLATE = "<start_of_turn>user\n{prompt}<end_of_turn>\n"
+MODEL_CHAT_TEMPLATE = "<start_of_turn>model\n{prompt}<end_of_turn>\n"
+
+
+def gemma_messages_to_prompt(history: List[BaseMessage]) -> str:
+    """Converts a list of messages to a chat prompt for Gemma."""
+    messages: List[str] = []
+    if len(history) == 1:
+        content = cast(str, history[0].content)
+        if isinstance(history[0], SystemMessage):
+            raise ValueError("Gemma currently doesn't support system message!")
+        return content
+    for message in history:
+        content = cast(str, message.content)
+        if isinstance(message, SystemMessage):
+            raise ValueError("Gemma currently doesn't support system message!")
+        elif isinstance(message, AIMessage):
+            messages.append(MODEL_CHAT_TEMPLATE.format(prompt=content))
+        elif isinstance(message, HumanMessage):
+            messages.append(USER_CHAT_TEMPLATE.format(prompt=content))
+        else:
+            raise ValueError(f"Unexpected message with type {type(message)}")
+    messages.append("<start_of_turn>model\n")
+    return "".join(messages)
+
+
+class _GemmaBase(BaseModel):
+    max_tokens: Optional[int] = None
+    """The maximum number of tokens to generate."""
+    temperature: Optional[float] = None
+    """The temperature to use for sampling."""
+    top_p: Optional[float] = None
+    """The top-p value to use for sampling."""
+    top_k: Optional[int] = None
+    """The top-k value to use for sampling."""
+
+    @property
+    def _default_params(self) -> Dict[str, Any]:
+        """Get the default parameters for calling gemma."""
+        params = {
+            "max_tokens": self.max_tokens,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k,
+        }
+        return {k: v for k, v in params.items() if v is not None}
+
+    def _get_params(self, **kwargs) -> Dict[str, Any]:
+        return {k: kwargs.get(k, v) for k, v in self._default_params.items()}
+
+
+class GemmaVertexAIModelGarden(VertexAIModelGarden):
+    allowed_model_args: Optional[List[str]] = [
+        "temperature",
+        "top_p",
+        "top_k",
+        "max_tokens",
+    ]
+
+    @property
+    def _llm_type(self) -> str:
+        return "gemma_vertexai_model_garden"
+
+
+class GemmaChatVertexAIModelGarden(_GemmaBase, _BaseVertexAIModelGarden, BaseChatModel):
+    allowed_model_args: Optional[List[str]] = [
+        "temperature",
+        "top_p",
+        "top_k",
+        "max_tokens",
+    ]
+
+    @property
+    def _llm_type(self) -> str:
+        return "gemma_vertexai_model_garden"
+
+    @property
+    def _default_params(self) -> Dict[str, Any]:
+        """Get the default parameters for calling gemma."""
+        params = {"max_length": self.max_tokens}
+        return {k: v for k, v in params.items() if v is not None}
+
+    def _generate(
+        self,
+        messages: List[BaseMessage],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        request = self._get_params(**kwargs)
+        request["prompt"] = gemma_messages_to_prompt(messages)
+        output = self.client.predict(endpoint=self.endpoint_path, instances=[request])
+        generations = [
+            ChatGeneration(
+                message=AIMessage(content=output.predictions[0]),
+            )
+        ]
+        return ChatResult(generations=generations)
+
+    async def _agenerate(
+        self,
+        messages: List[BaseMessage],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        """Top Level call"""
+        request = self._get_params(**kwargs)
+        request["prompt"] = gemma_messages_to_prompt(messages)
+        output = await self.async_client.predict(
+            endpoint=self.endpoint_path, instances=[request]
+        )
+        generations = [
+            ChatGeneration(
+                message=AIMessage(content=output.predictions[0]),
+            )
+        ]
+        return ChatResult(generations=generations)
+
+
+class _GemmaLocalKaggleBase(_GemmaBase):
+    """Local gemma model."""
+
+    client: Any = None  #: :meta private:
+    keras_backend: str = "jax"
+    model_name: str = "gemma_2b_en"
+    """Gemma model name."""
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that keras-nlp is installed and load the Gemma model."""
+        try:
+            os.environ["KERAS_BACKEND"] = values["keras_backend"]
+            from keras_nlp.models import GemmaCausalLM  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "Could not import GemmaCausalLM from keras_nlp. "
" + "Please install the GemmaCausalLM library to " + "use this model: pip install keras-nlp keras>=3 kaggle" + ) + + values["client"] = GemmaCausalLM.from_preset(values["model_name"]) + return values + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters for calling gemma.""" + params = {"max_length": self.max_tokens} + return {k: v for k, v in params.items() if v is not None} + + +class GemmaLocalKaggle(_GemmaLocalKaggleBase, BaseLLM): + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> LLMResult: + """Run the LLM on the given prompt and input.""" + params = {"max_length": self.max_tokens} if self.max_tokens else {} + results = self.client.generate(prompts, **params) + results = results if isinstance(results, str) else [results] + return LLMResult(generations=[[Generation(text=result)] for result in results]) + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "gemma_local_kaggle" + + +class GemmaChatLocalKaggle(_GemmaLocalKaggleBase, BaseChatModel): + def _generate( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> ChatResult: + params = {"max_length": self.max_tokens} if self.max_tokens else {} + prompt = gemma_messages_to_prompt(messages) + output = self.client.generate(prompt, **params) + generation = ChatGeneration(message=AIMessage(content=output)) + return ChatResult(generations=[generation]) + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "gemma_local_chat_kaggle" diff --git a/libs/vertexai/langchain_google_vertexai/llms.py b/libs/vertexai/langchain_google_vertexai/llms.py index c236ab3b..9257269c 100644 --- a/libs/vertexai/langchain_google_vertexai/llms.py +++ b/libs/vertexai/langchain_google_vertexai/llms.py @@ -44,6 +44,12 @@ TextGenerationModel as PreviewTextGenerationModel, ) +from langchain_google_vertexai._base import ( + _PALM_DEFAULT_MAX_OUTPUT_TOKENS, + _PALM_DEFAULT_TEMPERATURE, + _PALM_DEFAULT_TOP_K, + _PALM_DEFAULT_TOP_P, +) from langchain_google_vertexai._enums import HarmBlockThreshold, HarmCategory from langchain_google_vertexai._utils import ( create_retry_decorator, @@ -53,11 +59,6 @@ is_gemini_model, ) -_PALM_DEFAULT_MAX_OUTPUT_TOKENS = TextGenerationModel._DEFAULT_MAX_OUTPUT_TOKENS -_PALM_DEFAULT_TEMPERATURE = 0.0 -_PALM_DEFAULT_TOP_P = 0.95 -_PALM_DEFAULT_TOP_K = 40 - def _completion_with_retry( llm: VertexAI, diff --git a/libs/vertexai/langchain_google_vertexai/model_garden.py b/libs/vertexai/langchain_google_vertexai/model_garden.py new file mode 100644 index 00000000..6469fb42 --- /dev/null +++ b/libs/vertexai/langchain_google_vertexai/model_garden.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import asyncio +from typing import Any, List, Optional + +from langchain_core.callbacks.manager import ( + AsyncCallbackManagerForLLMRun, + CallbackManagerForLLMRun, +) +from langchain_core.language_models.llms import BaseLLM +from langchain_core.outputs import Generation, LLMResult + +from langchain_google_vertexai._base import _BaseVertexAIModelGarden + + +class VertexAIModelGarden(_BaseVertexAIModelGarden, BaseLLM): + """Large language models served from Vertex AI Model Garden.""" + + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, 
+    ) -> LLMResult:
+        """Run the LLM on the given prompt and input."""
+        instances = self._prepare_request(prompts, **kwargs)
+
+        if self.single_example_per_request and len(instances) > 1:
+            results = []
+            for instance in instances:
+                response = self.client.predict(
+                    endpoint=self.endpoint_path, instances=[instance]
+                )
+                results.append(self._parse_prediction(response.predictions[0]))
+            return LLMResult(
+                generations=[[Generation(text=result)] for result in results]
+            )
+
+        response = self.client.predict(endpoint=self.endpoint_path, instances=instances)
+        return self._parse_response(response)
+
+    async def _agenerate(
+        self,
+        prompts: List[str],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> LLMResult:
+        """Run the LLM on the given prompt and input."""
+        instances = self._prepare_request(prompts, **kwargs)
+        if self.single_example_per_request and len(instances) > 1:
+            responses = []
+            for instance in instances:
+                responses.append(
+                    self.async_client.predict(
+                        endpoint=self.endpoint_path, instances=[instance]
+                    )
+                )
+
+            responses = await asyncio.gather(*responses)
+            return LLMResult(
+                generations=[
+                    [Generation(text=self._parse_prediction(response.predictions[0]))]
+                    for response in responses
+                ]
+            )
+
+        response = await self.async_client.predict(
+            endpoint=self.endpoint_path, instances=instances
+        )
+        return self._parse_response(response)
diff --git a/libs/vertexai/tests/integration_tests/test_gemma.py b/libs/vertexai/tests/integration_tests/test_gemma.py
new file mode 100644
index 00000000..de2d27ec
--- /dev/null
+++ b/libs/vertexai/tests/integration_tests/test_gemma.py
@@ -0,0 +1,93 @@
+import os
+
+import pytest
+from langchain_core.messages import (
+    AIMessage,
+    HumanMessage,
+)
+
+from langchain_google_vertexai import (
+    GemmaChatLocalKaggle,
+    GemmaChatVertexAIModelGarden,
+    GemmaLocalKaggle,
+    GemmaVertexAIModelGarden,
+)
+
+
+@pytest.mark.skip("CI testing not set up")
+def test_gemma_model_garden() -> None:
+    """In order to run this test, you should provide endpoint names.
+
+    Example:
+    export GEMMA_ENDPOINT_ID=...
+    export PROJECT=...
+    """
+    endpoint_id = os.environ["GEMMA_ENDPOINT_ID"]
+    project = os.environ["PROJECT"]
+    location = "us-central1"
+    llm = GemmaVertexAIModelGarden(
+        endpoint_id=endpoint_id,
+        project=project,
+        location=location,
+    )
+    output = llm.invoke("What is the meaning of life?")
+    assert isinstance(output, str)
+    assert len(output) > 2
+    assert llm._llm_type == "gemma_vertexai_model_garden"
+
+
+@pytest.mark.skip("CI testing not set up")
+def test_gemma_chat_model_garden() -> None:
+    """In order to run this test, you should provide endpoint names.
+
+    Example:
+    export GEMMA_ENDPOINT_ID=...
+    export PROJECT=...
+    """
+    endpoint_id = os.environ["GEMMA_ENDPOINT_ID"]
+    project = os.environ["PROJECT"]
+    location = "us-central1"
+    llm = GemmaChatVertexAIModelGarden(
+        endpoint_id=endpoint_id,
+        project=project,
+        location=location,
+    )
+    assert llm._llm_type == "gemma_vertexai_model_garden"
+
+    text_question1, text_answer1 = "How much is 2+2?", "4"
+    text_question2 = "How much is 3+3?"
+ message1 = HumanMessage(content=text_question1) + message2 = AIMessage(content=text_answer1) + message3 = HumanMessage(content=text_question2) + output = llm.invoke([message1]) + assert isinstance(output, AIMessage) + assert len(output.content) > 2 + output = llm.invoke([message1, message2, message3]) + assert isinstance(output, AIMessage) + assert len(output.content) > 2 + + +@pytest.mark.skip("CI testing not set up") +def test_gemma_kaggle() -> None: + llm = GemmaLocalKaggle(model_name="gemma_2b_en") + output = llm.invoke("What is the meaning of life?") + assert isinstance(output, str) + print(output) + assert len(output) > 2 + + +@pytest.mark.skip("CI testing not set up") +def test_gemma_chat_kaggle() -> None: + llm = GemmaChatLocalKaggle(model_name="gemma_2b_en") + text_question1, text_answer1 = "How much is 2+2?", "4" + text_question2 = "How much is 3+3?" + message1 = HumanMessage(content=text_question1) + message2 = AIMessage(content=text_answer1) + message3 = HumanMessage(content=text_question2) + output = llm.invoke([message1]) + assert isinstance(output, AIMessage) + assert len(output.content) > 2 + output = llm.invoke([message1, message2, message3]) + assert isinstance(output, AIMessage) + assert len(output.content) > 2 diff --git a/libs/vertexai/tests/integration_tests/test_llms.py b/libs/vertexai/tests/integration_tests/test_llms.py index 6b252f42..f9525c8b 100644 --- a/libs/vertexai/tests/integration_tests/test_llms.py +++ b/libs/vertexai/tests/integration_tests/test_llms.py @@ -3,13 +3,11 @@ Your end-user credentials would be used to make the calls (make sure you've run `gcloud auth login` first). """ -import os -from typing import Optional import pytest from langchain_core.outputs import LLMResult -from langchain_google_vertexai.llms import VertexAI, VertexAIModelGarden +from langchain_google_vertexai.llms import VertexAI model_names_to_test = ["text-bison@001", "gemini-pro"] model_names_to_test_with_default = [None] + model_names_to_test @@ -119,87 +117,6 @@ async def test_astream() -> None: assert isinstance(token, str) -@pytest.mark.skip("CI testing not set up") -@pytest.mark.parametrize( - "endpoint_os_variable_name,result_arg", - [("FALCON_ENDPOINT_ID", "generated_text"), ("LLAMA_ENDPOINT_ID", None)], -) -def test_model_garden( - endpoint_os_variable_name: str, result_arg: Optional[str] -) -> None: - """In order to run this test, you should provide endpoint names. - - Example: - export FALCON_ENDPOINT_ID=... - export LLAMA_ENDPOINT_ID=... - export PROJECT=... - """ - endpoint_id = os.environ[endpoint_os_variable_name] - project = os.environ["PROJECT"] - location = "europe-west4" - llm = VertexAIModelGarden( - endpoint_id=endpoint_id, - project=project, - result_arg=result_arg, - location=location, - ) - output = llm("What is the meaning of life?") - assert isinstance(output, str) - assert llm._llm_type == "vertexai_model_garden" - - -@pytest.mark.skip("CI testing not set up") -@pytest.mark.parametrize( - "endpoint_os_variable_name,result_arg", - [("FALCON_ENDPOINT_ID", "generated_text"), ("LLAMA_ENDPOINT_ID", None)], -) -def test_model_garden_generate( - endpoint_os_variable_name: str, result_arg: Optional[str] -) -> None: - """In order to run this test, you should provide endpoint names. - - Example: - export FALCON_ENDPOINT_ID=... - export LLAMA_ENDPOINT_ID=... - export PROJECT=... 
- """ - endpoint_id = os.environ[endpoint_os_variable_name] - project = os.environ["PROJECT"] - location = "europe-west4" - llm = VertexAIModelGarden( - endpoint_id=endpoint_id, - project=project, - result_arg=result_arg, - location=location, - ) - output = llm.generate(["What is the meaning of life?", "How much is 2+2"]) - assert isinstance(output, LLMResult) - assert len(output.generations) == 2 - - -@pytest.mark.skip("CI testing not set up") -@pytest.mark.asyncio -@pytest.mark.parametrize( - "endpoint_os_variable_name,result_arg", - [("FALCON_ENDPOINT_ID", "generated_text"), ("LLAMA_ENDPOINT_ID", None)], -) -async def test_model_garden_agenerate( - endpoint_os_variable_name: str, result_arg: Optional[str] -) -> None: - endpoint_id = os.environ[endpoint_os_variable_name] - project = os.environ["PROJECT"] - location = "europe-west4" - llm = VertexAIModelGarden( - endpoint_id=endpoint_id, - project=project, - result_arg=result_arg, - location=location, - ) - output = await llm.agenerate(["What is the meaning of life?", "How much is 2+2"]) - assert isinstance(output, LLMResult) - assert len(output.generations) == 2 - - @pytest.mark.parametrize( "model_name", model_names_to_test, diff --git a/libs/vertexai/tests/integration_tests/test_model_garden.py b/libs/vertexai/tests/integration_tests/test_model_garden.py new file mode 100644 index 00000000..5500692e --- /dev/null +++ b/libs/vertexai/tests/integration_tests/test_model_garden.py @@ -0,0 +1,89 @@ +import os +from typing import Optional + +import pytest +from langchain_core.outputs import LLMResult + +from langchain_google_vertexai import VertexAIModelGarden + + +@pytest.mark.skip("CI testing not set up") +@pytest.mark.parametrize( + "endpoint_os_variable_name,result_arg", + [("FALCON_ENDPOINT_ID", "generated_text"), ("LLAMA_ENDPOINT_ID", None)], +) +def test_model_garden( + endpoint_os_variable_name: str, result_arg: Optional[str] +) -> None: + """In order to run this test, you should provide endpoint names. + + Example: + export FALCON_ENDPOINT_ID=... + export LLAMA_ENDPOINT_ID=... + export PROJECT=... + """ + endpoint_id = os.environ[endpoint_os_variable_name] + project = os.environ["PROJECT"] + location = "europe-west4" + llm = VertexAIModelGarden( + endpoint_id=endpoint_id, + project=project, + result_arg=result_arg, + location=location, + ) + output = llm("What is the meaning of life?") + assert isinstance(output, str) + print(output) + assert llm._llm_type == "vertexai_model_garden" + + +@pytest.mark.skip("CI testing not set up") +@pytest.mark.parametrize( + "endpoint_os_variable_name,result_arg", + [("FALCON_ENDPOINT_ID", "generated_text"), ("LLAMA_ENDPOINT_ID", None)], +) +def test_model_garden_generate( + endpoint_os_variable_name: str, result_arg: Optional[str] +) -> None: + """In order to run this test, you should provide endpoint names. + + Example: + export FALCON_ENDPOINT_ID=... + export LLAMA_ENDPOINT_ID=... + export PROJECT=... 
+ """ + endpoint_id = os.environ[endpoint_os_variable_name] + project = os.environ["PROJECT"] + location = "europe-west4" + llm = VertexAIModelGarden( + endpoint_id=endpoint_id, + project=project, + result_arg=result_arg, + location=location, + ) + output = llm.generate(["What is the meaning of life?", "How much is 2+2"]) + assert isinstance(output, LLMResult) + assert len(output.generations) == 2 + + +@pytest.mark.skip("CI testing not set up") +@pytest.mark.asyncio +@pytest.mark.parametrize( + "endpoint_os_variable_name,result_arg", + [("FALCON_ENDPOINT_ID", "generated_text"), ("LLAMA_ENDPOINT_ID", None)], +) +async def test_model_garden_agenerate( + endpoint_os_variable_name: str, result_arg: Optional[str] +) -> None: + endpoint_id = os.environ[endpoint_os_variable_name] + project = os.environ["PROJECT"] + location = "europe-west4" + llm = VertexAIModelGarden( + endpoint_id=endpoint_id, + project=project, + result_arg=result_arg, + location=location, + ) + output = await llm.agenerate(["What is the meaning of life?", "How much is 2+2"]) + assert isinstance(output, LLMResult) + assert len(output.generations) == 2 diff --git a/libs/vertexai/tests/unit_tests/test_imports.py b/libs/vertexai/tests/unit_tests/test_imports.py index 7afa74f1..e207e7e6 100644 --- a/libs/vertexai/tests/unit_tests/test_imports.py +++ b/libs/vertexai/tests/unit_tests/test_imports.py @@ -2,6 +2,10 @@ EXPECTED_ALL = [ "ChatVertexAI", + "GemmaVertexAIModelGarden", + "GemmaChatVertexAIModelGarden", + "GemmaLocalKaggle", + "GemmaChatLocalKaggle", "VertexAIEmbeddings", "VertexAI", "VertexAIModelGarden",
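Usage sketch (not part of the diff): a minimal example of the classes introduced above, mirroring the integration tests. This is only a sketch; the endpoint ID and project below are placeholders, and the local path assumes keras-nlp, keras>=3 and kaggle are installed with Kaggle credentials configured.

# Sketch only; values below are placeholders, not part of the PR.
from langchain_core.messages import HumanMessage

from langchain_google_vertexai import GemmaChatVertexAIModelGarden, GemmaLocalKaggle

# Chat with a Gemma model deployed to a Vertex AI Model Garden endpoint.
chat = GemmaChatVertexAIModelGarden(
    endpoint_id="your-gemma-endpoint-id",  # placeholder
    project="your-gcp-project",  # placeholder
    location="us-central1",
)
answer = chat.invoke([HumanMessage(content="How much is 2+2?")])
print(answer.content)

# Run Gemma locally through keras-nlp weights downloaded from Kaggle.
llm = GemmaLocalKaggle(model_name="gemma_2b_en", keras_backend="jax", max_tokens=64)
print(llm.invoke("What is the meaning of life?"))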