From 023ff4b74120ddfccbddaea510a3cbd4272ce177 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Tue, 20 Aug 2024 14:29:12 -0700 Subject: [PATCH 1/5] update colab examples The `from optimum.tpu` version imports models that are specifically optimized for inference. --- examples/language-modeling/gemma_tuning.ipynb | 2 +- examples/language-modeling/llama_tuning.md | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 1e810613..537c248f 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -249,7 +249,7 @@ "metadata": {}, "outputs": [], "source": [ - "from optimum.tpu import AutoModelForCausalLM\n", + "from transformers import AutoModelForCausalLM\n", "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False)" ] }, diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index 00b629be..13bd484e 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -47,8 +47,7 @@ Then, the tokenizer and model need to be loaded. We will choose [`meta-llama/Met ```python import torch -from transformers import AutoTokenizer -from optimum.tpu import AutoModelForCausalLM +from transformers import AutoTokenizer, AutoModelForCausalLM model_id = "meta-llama/Meta-Llama-3-8B" tokenizer = AutoTokenizer.from_pretrained(model_id) From 8df30be4f4e44d1482b6161bae7f7a0c7c10cae0 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Tue, 20 Aug 2024 14:58:17 -0700 Subject: [PATCH 2/5] udpate exmaples --- examples/language-modeling/gemma_tuning.ipynb | 6 +++++- examples/language-modeling/llama_tuning.md | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 537c248f..5b749d87 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -297,7 +297,11 @@ "from transformers import TrainingArguments\n", "\n", "# Set up the FSDP arguments\n", - "fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n", + "cls_to_wrap = \"GemmaDecoderLayer\"\n", + "fsdp_training_args = {\n", + " \"fsdp\": \"full_shard\",\n", + " \"fsdp_config\": fsdp_v2.get_fsdp_config(cls_to_wrap),\n", + "}\n", "\n", "# Set up the trainer\n", "trainer = SFTTrainer(\n", diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index 13bd484e..fb9e6115 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -68,7 +68,11 @@ data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) You then need to specify the FSDP training arguments to enable the sharding feature,the function will deduce the classes that should be sharded: ```python -fsdp_training_args = fsdp_v2.get_fsdp_training_args(model) +cls_to_wrap = "LlamaDecoderLayer" +fsdp_training_args = { + "fsdp": "full_shard", + "fsdp_config": fsdp_v2.get_fsdp_config(cls_to_wrap), +} ``` Now training can be done as simply as using the standard `Trainer` class: From f40451df2fcf396e710d579c8c9c959dc979d178 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Wed, 21 Aug 2024 08:59:31 -0700 Subject: [PATCH 3/5] update fsdp_v2. 
get_fsdp_training_args --- examples/language-modeling/gemma_tuning.ipynb | 6 +----- examples/language-modeling/llama_tuning.md | 6 +----- optimum/tpu/fsdp_v2.py | 6 ++++-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 5b749d87..537c248f 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -297,11 +297,7 @@ "from transformers import TrainingArguments\n", "\n", "# Set up the FSDP arguments\n", - "cls_to_wrap = \"GemmaDecoderLayer\"\n", - "fsdp_training_args = {\n", - " \"fsdp\": \"full_shard\",\n", - " \"fsdp_config\": fsdp_v2.get_fsdp_config(cls_to_wrap),\n", - "}\n", + "fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n", "\n", "# Set up the trainer\n", "trainer = SFTTrainer(\n", diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index fb9e6115..13bd484e 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -68,11 +68,7 @@ data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) You then need to specify the FSDP training arguments to enable the sharding feature,the function will deduce the classes that should be sharded: ```python -cls_to_wrap = "LlamaDecoderLayer" -fsdp_training_args = { - "fsdp": "full_shard", - "fsdp_config": fsdp_v2.get_fsdp_config(cls_to_wrap), -} +fsdp_training_args = fsdp_v2.get_fsdp_training_args(model) ``` Now training can be done as simply as using the standard `Trainer` class: diff --git a/optimum/tpu/fsdp_v2.py b/optimum/tpu/fsdp_v2.py index 5d1d61cb..2b2bcbe9 100644 --- a/optimum/tpu/fsdp_v2.py +++ b/optimum/tpu/fsdp_v2.py @@ -83,8 +83,9 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: matched_model = False if model_type == "gemma": from .modeling_gemma import GemmaForCausalLM + from transformers import GemmaForCausalLM as HFGemmaForCausalLLM - if isinstance(model, GemmaForCausalLM): + if isinstance(model, GemmaForCausalLM) or isinstance(model, HFGemmaForCausalLLM): logger = logging.get_logger(__name__) from torch_xla import __version__ as xla_version if xla_version == "2.3.0": @@ -95,8 +96,9 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: matched_model = True elif model_type == "llama": from .modeling_llama import LlamaForCausalLM + from transformers import LlamaForCausalLM as HFLlamaForCausalLLM - if isinstance(model, LlamaForCausalLM): + if isinstance(model, LlamaForCausalLM) or isinstance(model, HFLlamaForCausalLLM): cls_to_wrap = "LlamaDecoderLayer" matched_model = True From 2cb9068a98ca85e749d499c4f18be3a695b512b1 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Wed, 21 Aug 2024 14:22:12 -0700 Subject: [PATCH 4/5] load model in bf16 --- examples/language-modeling/gemma_tuning.ipynb | 3 ++- examples/language-modeling/llama_tuning.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 537c248f..9fbcaf79 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -250,7 +250,8 @@ "outputs": [], "source": [ "from transformers import AutoModelForCausalLM\n", - "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False)" + "import torch\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=torch.bfloat16)" ] 
}, { diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index 13bd484e..9d38130a 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -53,7 +53,7 @@ model_id = "meta-llama/Meta-Llama-3-8B" tokenizer = AutoTokenizer.from_pretrained(model_id) # Add custom token for padding Llama tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) -model = AutoModelForCausalLM.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ``` To tune the model with the [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes) dataset, you can load it and obtain the `quote` column: From 1b83380fe11a1184fdf2e476357044816c2537c7 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Thu, 22 Aug 2024 08:06:10 -0700 Subject: [PATCH 5/5] make style --- examples/language-modeling/gemma_tuning.ipynb | 14 ++++++++++++-- optimum/tpu/fsdp_v2.py | 6 ++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 9fbcaf79..fc7f4717 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -118,6 +118,8 @@ "outputs": [], "source": [ "from optimum.tpu import fsdp_v2\n", + "\n", + "\n", "fsdp_v2.use_fsdp_v2()" ] }, @@ -141,6 +143,8 @@ "outputs": [], "source": [ "from datasets import load_dataset\n", + "\n", + "\n", "dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")" ] }, @@ -199,6 +203,8 @@ "outputs": [], "source": [ "from transformers import AutoTokenizer\n", + "\n", + "\n", "model_id = \"google/gemma-2b\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", @@ -249,8 +255,10 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoModelForCausalLM\n", "import torch\n", + "from transformers import AutoModelForCausalLM\n", + "\n", + "\n", "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=torch.bfloat16)" ] }, @@ -271,6 +279,7 @@ "source": [ "from peft import LoraConfig\n", "\n", + "\n", "# Set up PEFT LoRA for fine-tuning.\n", "lora_config = LoraConfig(\n", " r=8,\n", @@ -294,8 +303,9 @@ "metadata": {}, "outputs": [], "source": [ - "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", + "from trl import SFTTrainer\n", + "\n", "\n", "# Set up the FSDP arguments\n", "fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n", diff --git a/optimum/tpu/fsdp_v2.py b/optimum/tpu/fsdp_v2.py index 2b2bcbe9..8a138793 100644 --- a/optimum/tpu/fsdp_v2.py +++ b/optimum/tpu/fsdp_v2.py @@ -82,9 +82,10 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: model_type = model.config.model_type matched_model = False if model_type == "gemma": - from .modeling_gemma import GemmaForCausalLM from transformers import GemmaForCausalLM as HFGemmaForCausalLLM + from .modeling_gemma import GemmaForCausalLM + if isinstance(model, GemmaForCausalLM) or isinstance(model, HFGemmaForCausalLLM): logger = logging.get_logger(__name__) from torch_xla import __version__ as xla_version @@ -95,9 +96,10 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: cls_to_wrap = "GemmaDecoderLayer" matched_model = True elif model_type == "llama": - from .modeling_llama import LlamaForCausalLM from transformers import LlamaForCausalLM as HFLlamaForCausalLLM + from .modeling_llama import 
LlamaForCausalLM + if isinstance(model, LlamaForCausalLM) or isinstance(model, HFLlamaForCausalLLM): cls_to_wrap = "LlamaDecoderLayer" matched_model = True
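
For reference, here is a minimal sketch of how the updated Llama fine-tuning flow from this series fits together end to end. The parts these patches touch are loading the stock `transformers` model in `torch.bfloat16` and calling `fsdp_v2.get_fsdp_training_args(model)` (which now also accepts the plain `transformers` Llama/Gemma classes); the dataset preprocessing and the `Trainer`/`TrainingArguments` values below are illustrative placeholders taken from the surrounding example docs, not part of the diff itself.

```python
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from optimum.tpu import fsdp_v2


# Enable FSDPv2 (SPMD) support before the model is instantiated.
fsdp_v2.use_fsdp_v2()

model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Llama has no padding token by default; reuse EOS for padding.
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Plain transformers model, loaded directly in bfloat16 as in the updated examples.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Example dataset from the docs; tokenize the `quote` column.
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# Returns the `fsdp`/`fsdp_config` entries for TrainingArguments, deducing the
# decoder-layer class to wrap from the model instance.
fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)

# Illustrative training setup; batch size, epochs, and output_dir are placeholders.
trainer = Trainer(
    model=model,
    train_dataset=data["train"],
    args=TrainingArguments(
        per_device_train_batch_size=8,
        num_train_epochs=1,
        output_dir="./output",
        optim="adafactor",
        dataloader_drop_last=True,  # required for SPMD sharding
        **fsdp_training_args,
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
```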