huggingface · tengomucho · Aug 22, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb
@@ -249,8 +249,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from optimum.tpu import AutoModelForCausalLM\n",
-    "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False)"
+    "from transformers import AutoModelForCausalLM\n",
+    "import torch\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=torch.bfloat16)"
    ]
   },
   {

diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md
@@ -47,14 +47,13 @@ Then, the tokenizer and model need to be loaded. We will choose [`meta-llama/Met
 
 ```python
 import torch
-from transformers import AutoTokenizer
-from optimum.tpu import AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 
 model_id = "meta-llama/Meta-Llama-3-8B"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 # Add custom token for padding Llama
 tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
-model = AutoModelForCausalLM.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
 ```
 
 To tune the model with the [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes) dataset, you can load it and obtain the `quote` column:

diff --git a/optimum/tpu/fsdp_v2.py b/optimum/tpu/fsdp_v2.py
@@ -83,8 +83,9 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict:
     matched_model = False
     if model_type == "gemma":
         from .modeling_gemma import GemmaForCausalLM
+        from transformers import GemmaForCausalLM as HFGemmaForCausalLLM
 
-        if isinstance(model, GemmaForCausalLM):
+        if isinstance(model, GemmaForCausalLM) or isinstance(model, HFGemmaForCausalLLM):
             logger = logging.get_logger(__name__)
             from torch_xla import __version__ as xla_version
             if xla_version == "2.3.0":
@@ -95,8 +96,9 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict:
             matched_model = True
     elif model_type == "llama":
         from .modeling_llama import LlamaForCausalLM
+        from transformers import LlamaForCausalLM as HFLlamaForCausalLLM
 
-        if isinstance(model, LlamaForCausalLM):
+        if isinstance(model, LlamaForCausalLM) or isinstance(model, HFLlamaForCausalLLM):
             cls_to_wrap = "LlamaDecoderLayer"
             matched_model = True