From 2326aafde45b7448033ffd47b8b3811ef20f1aef Mon Sep 17 00:00:00 2001
From: Nino Risteski
Date: Tue, 12 Nov 2024 18:51:34 +0100
Subject: [PATCH] adding basic examples

---
 examples/__init__.py                     |  0
 examples/local_gpu_trainer.py            | 43 ++++++++++++++++
 examples/sagemaker_train_compiler.py     | 64 ++++++++++++++++++++++++
 examples/train_gpt2.py                   | 51 +++++++++++++++++++
 examples/train_gpt2_lora.py              | 57 +++++++++++++++++++++
 examples/train_quantized.py              | 56 +++++++++++++++++++++
 src/shallowflow/utils/sagemaker_utils.py | 45 +++++++++++++++++
 7 files changed, 316 insertions(+)
 create mode 100644 examples/__init__.py
 create mode 100644 examples/local_gpu_trainer.py
 create mode 100644 examples/sagemaker_train_compiler.py
 create mode 100644 examples/train_gpt2.py
 create mode 100644 examples/train_gpt2_lora.py
 create mode 100644 examples/train_quantized.py
 create mode 100644 src/shallowflow/utils/sagemaker_utils.py

diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/local_gpu_trainer.py b/examples/local_gpu_trainer.py
new file mode 100644
index 0000000..66b8434
--- /dev/null
+++ b/examples/local_gpu_trainer.py
@@ -0,0 +1,43 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from shallowflow.trainer import LocalGPUTrainer, GTX1660Config
+from datasets import load_dataset
+
+def main():
+    # Configure for GTX 1660
+    config = GTX1660Config(
+        batch_size=8,
+        mixed_precision=True,
+        gradient_checkpointing=True
+    )
+
+    # Load model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained("gpt2")
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+    # Initialize trainer with wandb tracking
+    trainer = LocalGPUTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        config=config,
+        project_name="shallowflow-local",
+        entity="your-wandb-username"  # Optional
+    )
+
+    # Load tiny shakespeare dataset
+    dataset = load_dataset("tiny_shakespeare")
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["validation"]
+
+    try:
+        # Train with monitoring
+        trainer.train(
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            num_epochs=3
+        )
+    finally:
+        # Ensure wandb tracking is properly closed
+        trainer.finish()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/sagemaker_train_compiler.py b/examples/sagemaker_train_compiler.py
new file mode 100644
index 0000000..fa0500f
--- /dev/null
+++ b/examples/sagemaker_train_compiler.py
@@ -0,0 +1,64 @@
+import os
+import argparse
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments
+)
+from datasets import load_dataset
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str)
+    parser.add_argument("--epochs", type=int, default=3)
+    parser.add_argument("--learning_rate", type=float, default=3e-4)
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+
+    # SageMaker environment variables
+    training_dir = os.environ["SM_CHANNEL_TRAINING"]
+    model_dir = os.environ["SM_MODEL_DIR"]
+    num_gpus = os.environ["SM_NUM_GPUS"]
+
+    # Load model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+
+    # Load Tiny Shakespeare dataset
+    dataset = load_dataset("tiny_shakespeare", split="train")
+
+    # Training arguments optimized for compiler
+    training_args = TrainingArguments(
+        output_dir=model_dir,
+        num_train_epochs=args.epochs,
+        learning_rate=args.learning_rate,
+        per_device_train_batch_size=16,
+        optim="adamw_torch_xla",  # Optimized for Training Compiler
+        dataloader_num_workers=4,
+        logging_steps=100
+    )
+
+    # Initialize trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset,
+        tokenizer=tokenizer
+    )
+
+    # Train
+    trainer.train()
+
+    # Save model
+    trainer.save_model(model_dir)
+
+# Required for distributed training
+def _mp_fn(index):
+    main()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/train_gpt2.py b/examples/train_gpt2.py
new file mode 100644
index 0000000..433afcf
--- /dev/null
+++ b/examples/train_gpt2.py
@@ -0,0 +1,51 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from shallowflow import LLMTrainer, TrainingConfig
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train GPT-2 with ShallowFlow')
+    parser.add_argument('--model_name', default='gpt2', help='Model name or path')
+    parser.add_argument('--batch_size', type=int, default=16)
+    parser.add_argument('--learning_rate', type=float, default=3e-4)
+    parser.add_argument('--num_epochs', type=int, default=3)
+    parser.add_argument('--output_dir', default='outputs')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+
+    # Initialize config
+    config = TrainingConfig(
+        model_name=args.model_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        num_epochs=args.num_epochs
+    )
+
+    # Load model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+
+    # Load dataset
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
+
+    # Initialize trainer
+    trainer = LLMTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        config=config
+    )
+
+    # Train
+    trainer.train(
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["validation"]
+    )
+
+    # Save model
+    trainer.save_model(args.output_dir)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/train_gpt2_lora.py b/examples/train_gpt2_lora.py
new file mode 100644
index 0000000..9430290
--- /dev/null
+++ b/examples/train_gpt2_lora.py
@@ -0,0 +1,57 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from shallowflow import LLMTrainer, TrainingConfig
+from shallowflow.optimizations import LoRAConfig
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train GPT-2 with LoRA')
+    parser.add_argument('--model_name', default='gpt2')
+    parser.add_argument('--batch_size', type=int, default=16)
+    parser.add_argument('--lora_rank', type=int, default=8)
+    parser.add_argument('--lora_alpha', type=int, default=16)
+    parser.add_argument('--output_dir', default='outputs_lora')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+
+    # Initialize configs
+    training_config = TrainingConfig(
+        model_name=args.model_name,
+        batch_size=args.batch_size,
+        use_lora=True
+    )
+
+    lora_config = LoRAConfig(
+        rank=args.lora_rank,
+        alpha=args.lora_alpha
+    )
+
+    # Load model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+
+    # Initialize trainer with LoRA
+    trainer = LLMTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        config=training_config,
+        lora_config=lora_config
+    )
+
+    # Load and process dataset
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
+
+    # Train
+    trainer.train(
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["validation"]
+    )
+
+    # Save LoRA weights
+    trainer.save_lora_weights(args.output_dir)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/examples/train_quantized.py b/examples/train_quantized.py
new file mode 100644
index 0000000..91cf3e1
--- /dev/null
+++ b/examples/train_quantized.py
@@ -0,0 +1,56 @@
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from shallowflow import LLMTrainer, TrainingConfig
+from shallowflow.optimizations import QuantizationConfig
+import argparse
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train GPT-2 with Quantization')
+    parser.add_argument('--model_name', default='gpt2')
+    parser.add_argument('--batch_size', type=int, default=16)
+    parser.add_argument('--bits', type=int, default=8)
+    parser.add_argument('--output_dir', default='outputs_quantized')
+    return parser.parse_args()
+
+def main():
+    args = parse_args()
+
+    # Initialize configs
+    training_config = TrainingConfig(
+        model_name=args.model_name,
+        batch_size=args.batch_size,
+        use_quantization=True
+    )
+
+    quant_config = QuantizationConfig(
+        bits=args.bits,
+        symmetric=True
+    )
+
+    # Load model and tokenizer
+    model = AutoModelForCausalLM.from_pretrained(args.model_name)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+
+    # Initialize trainer with quantization
+    trainer = LLMTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        config=training_config,
+        quantization_config=quant_config
+    )
+
+    # Load dataset
+    dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
+
+    # Train
+    trainer.train(
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["validation"]
+    )
+
+    # Save quantized model
+    trainer.save_quantized_model(args.output_dir)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/shallowflow/utils/sagemaker_utils.py b/src/shallowflow/utils/sagemaker_utils.py
new file mode 100644
index 0000000..8ff7d2b
--- /dev/null
+++ b/src/shallowflow/utils/sagemaker_utils.py
@@ -0,0 +1,45 @@
+from dataclasses import dataclass
+from typing import Optional
+import sagemaker
+from sagemaker.huggingface import HuggingFace
+from sagemaker.huggingface import TrainingCompilerConfig
+
+@dataclass
+class SageMakerConfig:
+    instance_type: str = "ml.g4dn.xlarge"
+    instance_count: int = 1
+    use_compiler: bool = True
+    max_epochs: int = 3
+    learning_rate: float = 3e-4
+
+class SageMakerManager:
+    def __init__(self, config: SageMakerConfig):
+        self.config = config
+        self.session = sagemaker.Session()
+
+    def setup_compiler_training(
+        self,
+        model_name: str,
+        script_path: str
+    ):
+        # Configure Training Compiler
+        compiler_config = TrainingCompilerConfig(enabled=True)
+
+        # Create HuggingFace Estimator
+        estimator = HuggingFace(
+            entry_point=script_path,
+            instance_type=self.config.instance_type,
+            instance_count=self.config.instance_count,
+            compiler_config=compiler_config,
+            transformers_version="4.26.0",
+            pytorch_version="1.13.1",
+            py_version="py39",
+            role=sagemaker.get_execution_role(),
+            hyperparameters={
+                "epochs": self.config.max_epochs,
+                "learning_rate": self.config.learning_rate,
+                "model_name": model_name
+            }
+        )
+
+        return estimator
\ No newline at end of file
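
Usage sketch (illustrative, not part of the patch): one way the new SageMakerManager helper could launch the compiler-enabled entry script added above. The S3 URI is a placeholder, and the instance type and epoch count simply echo the SageMakerConfig defaults; it assumes the caller runs with SageMaker execution permissions.

    from shallowflow.utils.sagemaker_utils import SageMakerConfig, SageMakerManager

    # Build an estimator around the entry script added in this patch
    config = SageMakerConfig(instance_type="ml.g4dn.xlarge", max_epochs=3)
    manager = SageMakerManager(config)
    estimator = manager.setup_compiler_training(
        model_name="gpt2",
        script_path="examples/sagemaker_train_compiler.py",
    )

    # The "training" channel becomes the SM_CHANNEL_TRAINING directory the
    # entry script reads; replace the S3 URI with your own bucket (placeholder).
    estimator.fit({"training": "s3://your-bucket/tiny-shakespeare"})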