diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml
index 89ae03e9b..cc836e182 100644
--- a/examples/llama/conf/config.yaml
+++ b/examples/llama/conf/config.yaml
@@ -49,4 +49,31 @@ action: run
 
 hydra:
   run:
-    dir: ${experiment.exp_dir}/hydra
\ No newline at end of file
+    dir: ${experiment.exp_dir}/hydra
+
+# defaults:
+#   - train: train_llama2_7b_tp_hetero
+#   - _self_
+
+# experiment:
+#   exp_name: llama2_tp_hetero
+#   exp_dir: ./outputs_llama2_tp_hetero
+#   task:
+#     type: train
+#     backend: megatron
+#     entrypoint: ./flagscale/train/hetero/train_llama.py
+#   runner:
+#     backend: torchrun
+#     nnodes: 1
+#     nproc_per_node: 8
+#     hostfile: hostfile
+#   envs:
+#     CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
+#     CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+# action: run
+
+# hydra:
+#   run:
+#     dir: ${experiment.exp_dir}/hydra
+
diff --git a/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml b/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml
new file mode 100644
index 000000000..90995e2d9
--- /dev/null
+++ b/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml
@@ -0,0 +1,67 @@
+system:
+  tensor_model_parallel_size: 4
+  pipeline_model_parallel_size: 3
+  disable_bias_linear: True
+  use_flash_attn: True
+  sequence_parallel: True
+  use_distributed_optimizer: True
+  hetero_mode: pp
+  hetero_device_types: A100
+  hetero_current_device_type: A100
+  hetero_pipeline_stages: [3,16,8,8]
+  process_meshes: [4,1,1,2,1,2]
+  precision:
+    bf16: True
+    initial_loss_scale: 16384
+    min_loss_scale: 1.0
+  logging:
+    log_interval: 1
+  checkpoint:
+    save_interval: 100
+
+model:
+  use_mcore_models: True
+  transformer_impl: transformer_engine
+  num_layers: 32
+  hidden_size: 4096
+  ffn_hidden_size: 11008
+  num_attention_heads: 32
+  seq_length: 4096
+  group_query_attention: False
+  num_query_groups: 8
+  max_position_embeddings: 4096
+  norm_epsilon: 1e-5
+  use_rotary_position_embeddings: True
+  no_position_embedding: True
+  swiglu: True
+  normalization: RMSNorm
+  untie_embeddings_and_output_weights: True
+  init_method_std: 0.02
+  attention_dropout: 0.0
+  hidden_dropout: 0.0
+  weight_decay: 0.1
+  clip_grad: 1.0
+  train_iters: 30
+  eval_iters: 0
+  eval_interval: 2000
+  micro_batch_size: 1
+  global_batch_size: 32
+
+  optimizer:
+    weight_decay: 1e-2
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    lr_scheduler:
+      lr: 0.00015
+      min_lr: 1.0e-5
+      lr_warmup_fraction: .01
+      lr_decay_iters: 1
+      lr_decay_style: cosine
+
+data:
+  data_path: ${data_path:??}
+  split: 1
+  tokenizer:
+    tokenizer_type: Llama2Tokenizer
+    tokenizer_model: examples/llama/tokenizer.model
+    vocab_size: 32000
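
As a quick consistency check on the new heterogeneous settings (not part of the patch), the Python sketch below verifies that the per-stage layer split implied by `hetero_pipeline_stages` lines up with `pipeline_model_parallel_size` and `model.num_layers`. Reading the list as `[num_stages, layers_stage0, layers_stage1, ...]` for the single A100 device type is an assumption based on how the values add up, not something stated in the diff.

```python
# Hypothetical sanity check for train_llama2_7b_tp_hetero.yaml.
# Assumption: hetero_pipeline_stages = [num_stages, layers_stage0, layers_stage1, ...]
# for the single device type (A100).

num_layers = 32                         # model.num_layers
pipeline_model_parallel_size = 3        # system.pipeline_model_parallel_size
hetero_pipeline_stages = [3, 16, 8, 8]  # system.hetero_pipeline_stages

num_stages, *layers_per_stage = hetero_pipeline_stages

assert num_stages == pipeline_model_parallel_size, "stage count should match PP size"
assert len(layers_per_stage) == num_stages, "one layer count per pipeline stage"
assert sum(layers_per_stage) == num_layers, "stage layer counts should cover all layers"

print(f"{num_stages} stages, layers per stage {layers_per_stage}, total {sum(layers_per_stage)}")
```

Under this reading, the 32 transformer layers are split 16/8/8 across the three pipeline stages, which is consistent with the other values in the config.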