From 023ff4b74120ddfccbddaea510a3cbd4272ce177 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Tue, 20 Aug 2024 14:29:12 -0700 Subject: [PATCH 1/5] update colab examples The `from optimum.tpu` version imports models that are specifically optimized for inference. --- examples/language-modeling/gemma_tuning.ipynb | 2 +- examples/language-modeling/llama_tuning.md | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 1e810613..537c248f 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -249,7 +249,7 @@ "metadata": {}, "outputs": [], "source": [ - "from optimum.tpu import AutoModelForCausalLM\n", + "from transformers import AutoModelForCausalLM\n", "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False)" ] }, diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index 00b629be..13bd484e 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -47,8 +47,7 @@ Then, the tokenizer and model need to be loaded. We will choose [`meta-llama/Met ```python import torch -from transformers import AutoTokenizer -from optimum.tpu import AutoModelForCausalLM +from transformers import AutoTokenizer, AutoModelForCausalLM model_id = "meta-llama/Meta-Llama-3-8B" tokenizer = AutoTokenizer.from_pretrained(model_id) From 8df30be4f4e44d1482b6161bae7f7a0c7c10cae0 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Tue, 20 Aug 2024 14:58:17 -0700 Subject: [PATCH 2/5] udpate exmaples --- examples/language-modeling/gemma_tuning.ipynb | 6 +++++- examples/language-modeling/llama_tuning.md | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 537c248f..5b749d87 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -297,7 +297,11 @@ "from transformers import TrainingArguments\n", "\n", "# Set up the FSDP arguments\n", - "fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n", + "cls_to_wrap = \"GemmaDecoderLayer\"\n", + "fsdp_training_args = {\n", + " \"fsdp\": \"full_shard\",\n", + " \"fsdp_config\": fsdp_v2.get_fsdp_config(cls_to_wrap),\n", + "}\n", "\n", "# Set up the trainer\n", "trainer = SFTTrainer(\n", diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index 13bd484e..fb9e6115 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -68,7 +68,11 @@ data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) You then need to specify the FSDP training arguments to enable the sharding feature,the function will deduce the classes that should be sharded: ```python -fsdp_training_args = fsdp_v2.get_fsdp_training_args(model) +cls_to_wrap = "LlamaDecoderLayer" +fsdp_training_args = { + "fsdp": "full_shard", + "fsdp_config": fsdp_v2.get_fsdp_config(cls_to_wrap), +} ``` Now training can be done as simply as using the standard `Trainer` class: From f40451df2fcf396e710d579c8c9c959dc979d178 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Wed, 21 Aug 2024 08:59:31 -0700 Subject: [PATCH 3/5] update fsdp_v2. 
get_fsdp_training_args --- examples/language-modeling/gemma_tuning.ipynb | 6 +----- examples/language-modeling/llama_tuning.md | 6 +----- optimum/tpu/fsdp_v2.py | 6 ++++-- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 5b749d87..537c248f 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -297,11 +297,7 @@ "from transformers import TrainingArguments\n", "\n", "# Set up the FSDP arguments\n", - "cls_to_wrap = \"GemmaDecoderLayer\"\n", - "fsdp_training_args = {\n", - " \"fsdp\": \"full_shard\",\n", - " \"fsdp_config\": fsdp_v2.get_fsdp_config(cls_to_wrap),\n", - "}\n", + "fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n", "\n", "# Set up the trainer\n", "trainer = SFTTrainer(\n", diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index fb9e6115..13bd484e 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -68,11 +68,7 @@ data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) You then need to specify the FSDP training arguments to enable the sharding feature,the function will deduce the classes that should be sharded: ```python -cls_to_wrap = "LlamaDecoderLayer" -fsdp_training_args = { - "fsdp": "full_shard", - "fsdp_config": fsdp_v2.get_fsdp_config(cls_to_wrap), -} +fsdp_training_args = fsdp_v2.get_fsdp_training_args(model) ``` Now training can be done as simply as using the standard `Trainer` class: diff --git a/optimum/tpu/fsdp_v2.py b/optimum/tpu/fsdp_v2.py index 5d1d61cb..2b2bcbe9 100644 --- a/optimum/tpu/fsdp_v2.py +++ b/optimum/tpu/fsdp_v2.py @@ -83,8 +83,9 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: matched_model = False if model_type == "gemma": from .modeling_gemma import GemmaForCausalLM + from transformers import GemmaForCausalLM as HFGemmaForCausalLLM - if isinstance(model, GemmaForCausalLM): + if isinstance(model, GemmaForCausalLM) or isinstance(model, HFGemmaForCausalLLM): logger = logging.get_logger(__name__) from torch_xla import __version__ as xla_version if xla_version == "2.3.0": @@ -95,8 +96,9 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: matched_model = True elif model_type == "llama": from .modeling_llama import LlamaForCausalLM + from transformers import LlamaForCausalLM as HFLlamaForCausalLLM - if isinstance(model, LlamaForCausalLM): + if isinstance(model, LlamaForCausalLM) or isinstance(model, HFLlamaForCausalLLM): cls_to_wrap = "LlamaDecoderLayer" matched_model = True From 2cb9068a98ca85e749d499c4f18be3a695b512b1 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Wed, 21 Aug 2024 14:22:12 -0700 Subject: [PATCH 4/5] load model in bf16 --- examples/language-modeling/gemma_tuning.ipynb | 3 ++- examples/language-modeling/llama_tuning.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 537c248f..9fbcaf79 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -250,7 +250,8 @@ "outputs": [], "source": [ "from transformers import AutoModelForCausalLM\n", - "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False)" + "import torch\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=torch.bfloat16)" ] 
}, { diff --git a/examples/language-modeling/llama_tuning.md b/examples/language-modeling/llama_tuning.md index 13bd484e..9d38130a 100644 --- a/examples/language-modeling/llama_tuning.md +++ b/examples/language-modeling/llama_tuning.md @@ -53,7 +53,7 @@ model_id = "meta-llama/Meta-Llama-3-8B" tokenizer = AutoTokenizer.from_pretrained(model_id) # Add custom token for padding Llama tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) -model = AutoModelForCausalLM.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ``` To tune the model with the [Abirate/english_quotes](https://huggingface.co/datasets/Abirate/english_quotes) dataset, you can load it and obtain the `quote` column: From 1b83380fe11a1184fdf2e476357044816c2537c7 Mon Sep 17 00:00:00 2001 From: wenxindongwork Date: Thu, 22 Aug 2024 08:06:10 -0700 Subject: [PATCH 5/5] make style --- examples/language-modeling/gemma_tuning.ipynb | 14 ++++++++++++-- optimum/tpu/fsdp_v2.py | 6 ++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/examples/language-modeling/gemma_tuning.ipynb b/examples/language-modeling/gemma_tuning.ipynb index 9fbcaf79..fc7f4717 100644 --- a/examples/language-modeling/gemma_tuning.ipynb +++ b/examples/language-modeling/gemma_tuning.ipynb @@ -118,6 +118,8 @@ "outputs": [], "source": [ "from optimum.tpu import fsdp_v2\n", + "\n", + "\n", "fsdp_v2.use_fsdp_v2()" ] }, @@ -141,6 +143,8 @@ "outputs": [], "source": [ "from datasets import load_dataset\n", + "\n", + "\n", "dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")" ] }, @@ -199,6 +203,8 @@ "outputs": [], "source": [ "from transformers import AutoTokenizer\n", + "\n", + "\n", "model_id = \"google/gemma-2b\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", @@ -249,8 +255,10 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoModelForCausalLM\n", "import torch\n", + "from transformers import AutoModelForCausalLM\n", + "\n", + "\n", "model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=torch.bfloat16)" ] }, @@ -271,6 +279,7 @@ "source": [ "from peft import LoraConfig\n", "\n", + "\n", "# Set up PEFT LoRA for fine-tuning.\n", "lora_config = LoraConfig(\n", " r=8,\n", @@ -294,8 +303,9 @@ "metadata": {}, "outputs": [], "source": [ - "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", + "from trl import SFTTrainer\n", + "\n", "\n", "# Set up the FSDP arguments\n", "fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)\n", diff --git a/optimum/tpu/fsdp_v2.py b/optimum/tpu/fsdp_v2.py index 2b2bcbe9..8a138793 100644 --- a/optimum/tpu/fsdp_v2.py +++ b/optimum/tpu/fsdp_v2.py @@ -82,9 +82,10 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: model_type = model.config.model_type matched_model = False if model_type == "gemma": - from .modeling_gemma import GemmaForCausalLM from transformers import GemmaForCausalLM as HFGemmaForCausalLLM + from .modeling_gemma import GemmaForCausalLM + if isinstance(model, GemmaForCausalLM) or isinstance(model, HFGemmaForCausalLLM): logger = logging.get_logger(__name__) from torch_xla import __version__ as xla_version @@ -95,9 +96,10 @@ def get_fsdp_training_args(model: PreTrainedModel) -> Dict: cls_to_wrap = "GemmaDecoderLayer" matched_model = True elif model_type == "llama": - from .modeling_llama import LlamaForCausalLM from transformers import LlamaForCausalLM as HFLlamaForCausalLLM + from .modeling_llama import 
LlamaForCausalLM + if isinstance(model, LlamaForCausalLM) or isinstance(model, HFLlamaForCausalLLM): cls_to_wrap = "LlamaDecoderLayer" matched_model = True
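
For reference, here is a minimal sketch of how the updated Llama fine-tuning flow from this series fits together end to end. The parts these patches touch are loading the stock `transformers` model in `torch.bfloat16` and calling `fsdp_v2.get_fsdp_training_args(model)` (which now also accepts the plain `transformers` Llama/Gemma classes); the dataset preprocessing and the `Trainer`/`TrainingArguments` values below are illustrative placeholders taken from the surrounding example docs, not part of the diff itself.

```python
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from optimum.tpu import fsdp_v2


# Enable FSDPv2 (SPMD) support before the model is instantiated.
fsdp_v2.use_fsdp_v2()

model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Llama has no padding token by default; reuse EOS for padding.
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

# Plain transformers model, loaded directly in bfloat16 as in the updated examples.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# Example dataset from the docs; tokenize the `quote` column.
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

# Returns the `fsdp`/`fsdp_config` entries for TrainingArguments, deducing the
# decoder-layer class to wrap from the model instance.
fsdp_training_args = fsdp_v2.get_fsdp_training_args(model)

# Illustrative training setup; batch size, epochs, and output_dir are placeholders.
trainer = Trainer(
    model=model,
    train_dataset=data["train"],
    args=TrainingArguments(
        per_device_train_batch_size=8,
        num_train_epochs=1,
        output_dir="./output",
        optim="adafactor",
        dataloader_drop_last=True,  # required for SPMD sharding
        **fsdp_training_args,
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
```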