Update colab examples #86

Merged
merged 7 commits on Aug 22, 2024
Changes from 4 commits
8 changes: 6 additions & 2 deletions examples/language-modeling/gemma_tuning.ipynb
@@ -249,7 +249,7 @@
"metadata": {},
"outputs": [],
"source": [
"from optimum.tpu import AutoModelForCausalLM\n",
"from transformers import AutoModelForCausalLM\n",
"model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False)"
]
},
@@ -297,7 +297,11 @@
"from transformers import TrainingArguments\n",
"\n",
"# Set up the FSDP arguments\n",
"fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n",
"cls_to_wrap = \"GemmaDecoderLayer\"\n",
"fsdp_training_args = {\n",
" \"fsdp\": \"full_shard\",\n",
" \"fsdp_config\": fsdp_v2.get_fsdp_config(cls_to_wrap),\n",
"}\n",
Collaborator: Well, that was the point of using get_fsdp_training_args: you do not need to know which classes to wrap on supported models. I would revert this bit.

Contributor Author: Thanks for the quick review! get_fsdp_training_args accepts only the optimum.tpu model class, not the transformers one. I updated the get_fsdp_training_args function, so it should now work.

"\n",
"# Set up the trainer\n",
"trainer = SFTTrainer(\n",
9 changes: 6 additions & 3 deletions examples/language-modeling/llama_tuning.md
@@ -47,8 +47,7 @@ Then, the tokenizer and model need to be loaded. We will choose [`meta-llama/Met

```python
import torch
-from transformers import AutoTokenizer
-from optimum.tpu import AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -69,7 +68,11 @@ data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
You then need to specify the FSDP training arguments to enable the sharding feature; the function will deduce the classes that should be sharded:

```python
-fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)
+cls_to_wrap = "LlamaDecoderLayer"
+fsdp_training_args = {
+    "fsdp": "full_shard",
+    "fsdp_config": fsdp_v2.get_fsdp_config(cls_to_wrap),
+}
wenxindongwork marked this conversation as resolved.
```

Now training can be done as simply as using the standard `Trainer` class:
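
The cell that follows is collapsed in this diff, so as a hedged sketch only, standard Trainer usage with the FSDP arguments defined above could look like the following; the hyperparameters, output path, and "train" split name are placeholder assumptions, not values from the tutorial.

```python
# Hedged sketch of a standard Trainer setup reusing fsdp_training_args from above;
# the concrete values below are placeholders, not the tutorial's actual settings.
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers ship without a pad token (assumption)

trainer = Trainer(
    model=model,
    train_dataset=data["train"],          # assuming data is a DatasetDict with a "train" split
    args=TrainingArguments(
        per_device_train_batch_size=8,    # placeholder value
        num_train_epochs=1,               # placeholder value
        output_dir="./llama-tuned",       # placeholder path
        optim="adafactor",
        dataloader_drop_last=True,        # keeps batch shapes static for XLA sharding (assumption)
        **fsdp_training_args,             # fsdp="full_shard" + fsdp_config from above
    ),
    # mlm=False makes the collator copy input_ids into labels for causal LM training
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
```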