[iluvatar] add iluvatar patches #137

Closed · wants to merge 1 commit
1 change: 1 addition & 0 deletions hardwares/iluvatar/README.md
@@ -0,0 +1 @@
commit-id d7dc60ec3ef6341526fd187281dc289418c17899
5 changes: 5 additions & 0 deletions hardwares/iluvatar/patches/__init__.py
@@ -0,0 +1,5 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-


from .core_models_gpt_gpt_model import print_device_type
18 changes: 18 additions & 0 deletions hardwares/iluvatar/patches/core_models_gpt_gpt_model.py
@@ -0,0 +1,18 @@
import megatron
from megatron import print_rank_0
from flagscale.patches_utils import add_patches_module

# [iluvatar] start of changes
def print_device_type():
    device_type = "iluvatar"
    if device_type:
        print_rank_0("=== Monkey-patching Device Type: {} ===".format(device_type))
    else:
        print_rank_0("=== Monkey-patching Device Type: None ===")

# [iluvatar] end of changes

# This is used for monkey-patching demonstration.
module_path = "megatron.core.models.gpt.gpt_model"
module_dict = {"print_device_type": print_device_type}
add_patches_module(module_path, module_dict)
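
Review note (not part of the diff): `flagscale.patches_utils.add_patches_module` itself is not shown in this PR, so the sketch below is only an assumption about what such a helper could do, namely attach the given name-to-callable mapping onto the target module so the patched symbol becomes reachable there.

```python
import importlib


def add_patches_module(module_path, module_dict):
    """Hypothetical sketch of a patching helper; the real FlagScale code may differ."""
    # Import the target module (e.g. megatron.core.models.gpt.gpt_model) and
    # attach each name -> callable pair as an attribute of that module.
    module = importlib.import_module(module_path)
    for name, obj in module_dict.items():
        setattr(module, name, obj)
```

Under that assumption, importing `hardwares.iluvatar.patches` would make `megatron.core.models.gpt.gpt_model.print_device_type()` callable from the rest of the codebase.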
33 changes: 33 additions & 0 deletions hardwares/iluvatar/scripts/config.yaml
@@ -0,0 +1,33 @@
defaults:
  - train: train_aquila_7b
  - _self_

experiment:
  exp_name: aquila2
  exp_dir: ./outputs
  task:
    type: train
    backend: megatron
    entrypoint: /home/FlagScale/flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 1
    nproc_per_node: 8
  envs:
    PYTORCH_SKIP_COMPILE_CHECK: 1
    MACA_PATH: /opt/maca
    MACA_SMALL_PAGESIZE_ENABLE: 1
    PYTORCH_ENABLE_SAME_RANK_A100: 1
    CUCC_PATH: /opt/maca/tools/cu-bridge
    CUDA_PATH: /opt/maca/tools/cu-bridge
    SET_DEVICE_NUMA_PREFERRED: 1
    MHA_USE_BLAS: ON
    MHA_BWD_NO_ATOMIC_F64: 1
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
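
Review note: `hydra.run.dir` is an OmegaConf interpolation against `experiment.exp_dir`. A minimal sketch of how that interpolation resolves, using OmegaConf directly only to illustrate the syntax (Hydra performs the equivalent step when it composes this config):

```python
from omegaconf import OmegaConf

# Reproduce just the two keys involved in the interpolation from config.yaml.
cfg = OmegaConf.create({
    "experiment": {"exp_dir": "./outputs"},
    "hydra": {"run": {"dir": "${experiment.exp_dir}/hydra"}},
})
print(cfg.hydra.run.dir)  # -> ./outputs/hydra
```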
66 changes: 66 additions & 0 deletions hardwares/iluvatar/scripts/train/train_aquila_7b.yaml
@@ -0,0 +1,66 @@
system:
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  disable_bias_linear: True
  use_flash_attn: True
  use_distributed_optimizer: True
  device_type: iluvatar
  precision:
    fp16: True
    initial_loss_scale: 522893
    min_loss_scale: 1.0
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-aquila-7B"
    wandb_exp_name: "train-test-7B"
  checkpoint:
    save_interval: 2000

model:
  use_mcore_models: true
  num_layers: 32
  hidden_size: 4096
  num_attention_heads: 32
  seq_length: 2048
  max_position_embeddings: 2048
  norm_epsilon: 1e-5
  use_rotary_position_embeddings: true
  no_position_embedding: true
  swiglu: true
  multiple_of: 256
  normalization: RMSNorm
  rotary_interleaved_patch: true
  untie_embeddings_and_output_weights: true
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  weight_decay: 0.1
  clip_grad: 1.0
  train_samples: 100000
  eval_iters: 0
  micro_batch_size: 1
  global_batch_size: 128
  seed: 1234

  optimizer:
    weight_decay: 0.1
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 2.0e-5
      min_lr: 2.0e-6
      lr_warmup_samples: 500
      lr_decay_style: cosine

data:
  data_path: ${data_path:??}
  split: 1
  tokenizer:
    tokenizer_type: AquilaTokenizer
    vocab_file: ./examples/aquila/tokenizer/vocab.json
    merge_file: ./examples/aquila/tokenizer/merges.txt
    special_tokens_file: ./examples/aquila/tokenizer/special_tokens.txt
    vocab_size: 100008
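
Review note: with the runner settings above (1 node × 8 processes) and tensor/pipeline parallel sizes of 4/1, the implied data-parallel size is 2, so `global_batch_size: 128` with `micro_batch_size: 1` corresponds to 64 gradient-accumulation steps per global batch. A quick check of that arithmetic (reviewer math only, not code from the PR):

```python
# Standard Megatron-style relations: world = tp * pp * dp, and
# global_batch = dp * micro_batch * grad_accum_steps.
world_size = 1 * 8           # nnodes * nproc_per_node (config.yaml)
tp, pp = 4, 1                # tensor/pipeline parallel sizes (this file)
dp = world_size // (tp * pp)
micro_batch, global_batch = 1, 128
assert global_batch % (dp * micro_batch) == 0
grad_accum_steps = global_batch // (dp * micro_batch)
print(dp, grad_accum_steps)  # 2 64
```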