
feat(*): Add internlm3 config #403

Merged · 14 commits · Jan 22, 2025
5 changes: 3 additions & 2 deletions README-ja-JP.md
@@ -141,8 +141,9 @@ $ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py -
 </td>
 <td>
 <ul>
-<li><a href="configs/_base_/models/internlm/internlm_7B.py">InternLM</a></li>
-<li><a href="configs/_base_/models/internlm/internlm2_7B.py">InternLM2</a></li>
+<li><a href="configs/7B_isp_sft.py">InternLM</a></li>
+<li><a href="configs/7B_internlm2.py">InternLM2</a></li>
+<li><a href="configs/8B_internlm3.py">InternLM3</a></li>
 <li><a href="configs/7B_llama2.py">Llama2</a></li>
 <li><a href="configs/7B_qwen2.py">Qwen2</a></li>
 <li><a href="configs/7B_baichuan2.py">Baichuan2</a></li>
5 changes: 3 additions & 2 deletions README-zh-Hans.md
@@ -141,8 +141,9 @@ $ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py -
 </td>
 <td>
 <ul>
-<li><a href="configs/_base_/models/internlm/internlm_7B.py">InternLM</a></li>
-<li><a href="configs/_base_/models/internlm/internlm2_7B.py">InternLM2</a></li>
+<li><a href="configs/7B_isp_sft.py">InternLM</a></li>
+<li><a href="configs/7B_internlm2.py">InternLM2</a></li>
+<li><a href="configs/8B_internlm3.py">InternLM3</a></li>
 <li><a href="configs/7B_llama2.py">Llama2</a></li>
 <li><a href="configs/7B_qwen2.py">Qwen2</a></li>
 <li><a href="configs/7B_baichuan2.py">Baichuan2</a></li>
5 changes: 3 additions & 2 deletions README.md
@@ -141,8 +141,9 @@ Please refer to the [System Architecture document](./doc/en/structure.md) for ar
 </td>
 <td>
 <ul>
-<li><a href="configs/_base_/models/internlm/internlm_7B.py">InternLM</a></li>
-<li><a href="configs/_base_/models/internlm/internlm2_7B.py">InternLM2</a></li>
+<li><a href="configs/7B_isp_sft.py">InternLM</a></li>
+<li><a href="configs/7B_internlm2.py">InternLM2</a></li>
+<li><a href="configs/8B_internlm3.py">InternLM3</a></li>
 <li><a href="configs/7B_llama2.py">Llama2</a></li>
 <li><a href="configs/7B_qwen2.py">Qwen2</a></li>
 <li><a href="configs/7B_baichuan2.py">Baichuan2</a></li>
2 changes: 1 addition & 1 deletion configs/7B_internlm2.py
@@ -1,5 +1,5 @@
 JOB_NAME = "7b_internlm2_train"
-model_type = "INTERNLM2_PUBLIC"
+model_type = "INTERNLM2"
 DO_ALERT = False

 VOCAB_SIZE = 92544
2 changes: 1 addition & 1 deletion configs/7B_isp_sft.py
@@ -1,5 +1,5 @@
 JOB_NAME = "7b_train"
-model_type = "INTERNLM2_PUBLIC"
+model_type = "INTERNLM2"
 DO_ALERT = False

 VOCAB_SIZE = 103168
270 changes: 270 additions & 0 deletions configs/8B_internlm3.py
@@ -0,0 +1,270 @@
# Copyright (c) InternLM. All rights reserved.
JOB_NAME = "8b_internlm3_train"
model_type = "INTERNLM3"
DO_ALERT = False

VOCAB_SIZE = 128512
SEQ_LEN = 4096
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
NUM_KV_ATTENTION_HEAD = 2
MLP_RATIO = 2.5
NUM_LAYER = 48
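# Derived sizes, for reference (assuming the usual conventions for these fields):
# - FFN intermediate size is roughly MLP_RATIO * HIDDEN_SIZE = 2.5 * 4096 = 10240
# - grouped-query attention: NUM_ATTENTION_HEAD / NUM_KV_ATTENTION_HEAD = 32 / 2,
#   i.e. 16 query heads share each KV head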


MODEL_ONLY_FOLDER = None # "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = None # "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
enable_save_ckpt=False, # enable ckpt save.
save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt.
# 'load_ckpt_info' setting guide:
# 1. the 'path' indicates the ckpt path,
# 2. the 'content' specifies which states to load; supported: "model", "sampler", "optimizer", "scheduler", "all"
# 3. the 'ckpt_type' specifies the type of checkpoint to load; supported: "internevo", "hf", or other custom-defined
#    load functions such as "llama"
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="hf"),
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' after
# training interruptions/hangs caused by hardware failures, when a scheduling system (such as k8s/slurm)
# with an automatic restart mechanism reboots the training.
# Please be aware that if `auto_resume` is left unset (its default value is True), the checkpoint
# path specified in `load_ckpt_info` will not be loaded by default.
# If you want to initialize your model weights from another model, you must set `auto_resume` to False.
# If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
auto_resume=False,
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload (only works for boto3 ckpt).
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency.
# 'enable_internevo2hf_ckpt' is designed to convert the saved model checkpoint in internevo format to the huggingface format.
enable_internevo2hf_ckpt=False,
)
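# Illustrative 'load_ckpt_info' alternatives (hypothetical paths, following the guide above):
# - resume the full training state from an InternEvo-format checkpoint:
#   load_ckpt_info = dict(path="local:llm_ckpts/49", content=("all",), ckpt_type="internevo")
# - initialize model weights only from HuggingFace-format weights:
#   load_ckpt_info = dict(path="local:/path/to/hf_ckpt", content=("model",), ckpt_type="hf")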

TRAIN_FOLDER = None
VALID_FOLDER = None # "/path/to/dataset"
data = dict(
seq_len=SEQ_LEN,
# micro_num is the number of micro-batches accumulated in one gradient update
micro_num=1,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, which disables evaluation
valid_every=0,
pack_sample_into_one=False,
total_steps=20000,
skip_batches="",
# rampup_batch_size (str): A string with three space-separated integers representing the
# starting batch size, the increment, and the number of steps between
# each increment. For example, "192 24 8" means that the batch size (micro_num)
# starts at 192 and increases by 24 every 8 steps. Defaults to None.
# (IMPORTANT): The interval step size is 'micro_bsz'.
rampup_batch_size="",
# Datasets with fewer than 50 rows will be discarded
min_length=50,
train_folder=TRAIN_FOLDER,
valid_folder=VALID_FOLDER,
empty_cache_and_diag_interval=200,
diag_outlier_ratio=1.1,
# use_packed_dataset=False,
)
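# With the values above, each data-parallel rank consumes micro_num * micro_bsz * SEQ_LEN
# = 1 * 2 * 4096 = 8192 tokens per optimizer step (packed_length = micro_bsz * SEQ_LEN).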

grad_scaler = dict(
fp16=dict(
# the initial loss scale, defaults to 2**16
initial_scale=2**16,
# the minimum loss scale, defaults to None
min_scale=1,
# the number of steps to increase loss scale when no overflow occurs
growth_interval=1000,
),
# the multiplication factor for increasing loss scale, defaults to 2
growth_factor=2,
# the multiplication factor for decreasing loss scale, defaults to 0.5
backoff_factor=0.5,
# the maximum loss scale, defaults to None
max_scale=2**24,
# the number of overflows before decreasing loss scale, defaults to 2
hysteresis=2,
)

hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
clip_grad_norm=1.0,
)


# loss config (dict):
# 1. label_smoothing
# 2. op_type: cross_entropy operator type, we support five types for loss computing,
# including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
# default is "py_vocab_parallel".
# "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
# "apex_naive": cross_entropy from apex
# "py_naive": self-implemented cross_entropy
# "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
# "py_vocab_parallel": self-implemented vocab parallel cross_entropy

# * op_types that end with "naive" only support parallel_output=False;
# * in a no-GPU environment, only "torch_naive" and "py_vocab_parallel" are supported.
loss = dict(label_smoothing=0, op_type="py_vocab_parallel")

adam = dict(
lr=1e-4,
adam_beta1=0.9,
adam_beta2=0.95,
adam_beta2_c=0,
adam_eps=1e-8,
weight_decay=0.01,
)

lr_scheduler = dict(
total_steps=data["total_steps"],
init_steps=0, # optimizer_warmup_step
warmup_ratio=0.01,
eta_min=1e-5,
last_epoch=-1,
)
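# Assuming warmup_ratio is a fraction of total_steps, warmup here lasts
# 0.01 * 20000 = 200 steps before the lr decays from 1e-4 toward eta_min=1e-5.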

beta2_scheduler = dict(
init_beta2=adam["adam_beta2"],
c=adam["adam_beta2_c"],
cur_iter=-1,
)

use_fp32_norm = False
model = dict(
checkpoint=False,
num_chunks=1,
num_attention_heads=NUM_ATTENTION_HEAD,
num_kv_attention_heads=NUM_KV_ATTENTION_HEAD,
embed_split_hidden=True,
vocab_size=VOCAB_SIZE,
embed_grad_scale=1,
parallel_output=True,
hidden_size=HIDDEN_SIZE,
num_layers=NUM_LAYER,
no_bias=True,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.bfloat16",
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
# Whether the odd and even columns of the query and key are interleaved in the normal order.
# If True, the odd and even columns are normally ordered; if False, the model has
# pre-concatenated all odd columns in front and all even columns behind, in order to
# improve RoPE's computational efficiency.
# Example:
# qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...]
# qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...]
qk_interleaved=False,
rope_base=50000000,
enable_qkv_fusion=False,
)
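# Note: rope_base=50000000 (5e7) is a far larger RoPE theta than the 10000 common in
# earlier models, presumably chosen to better support long contexts.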

"""
zero1 parallel (dict):
1. size: int
* if size <= 0, the size of the zero process group is equal to the size of the dp process group,
so parameters will be divided within the range of dp.
* if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
* if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
1. size: int, the size of tensor parallel.
2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'],
defaults to 'mtp', which means pure megatron tensor parallel without sequence parallel.
msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size.
fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size.
isp: custom intern sequence parallel without tensor parallel, can be used with weight parallel.
pipeline parallel (dict):
1. size: int, the size of pipeline parallel.
2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
defaults to False.
weight parallel (dict):
1. size: int, the size of weight parallel.
2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
3. launch_allgather_before: str, before which module to launch the all gather communication to
prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
Must be used with forward_overlap_per 'layer'.
4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
sequence_2D (dict):
1. enable: bool, whether to enable 2D sequence parallel.
2. head_size: int, the parallel degree of head parallelism (DeepSpeed Ulysses).
head_size * context_size should be equal to the tensor parallel size.
3. context_size: int, the parallel degree of context parallelism.
head_size * context_size should be equal to the tensor parallel size.
4. window_size: int, the sliding window size in context parallelism.
5. device_placement_strategy: dict,
head_first: bool, if `True`, ranks of the same head parallel group are
given high priority for colocation on the same node;
if `False`, ranks of the same context parallel group are
given high priority for colocation on the same node;
interleaved: bool, if `head_first` is `False` and `window_size` > 1, this option
interleaves the ranks in the same window to make full use of the NIC.
"""
parallel = dict(
zero1=dict(size=1),
tensor=dict(size=1, mode="isp"),
pipeline=dict(size=1, interleaved_overlap=True),
weight=dict(size=16, overlap=True, launch_allgather_before="wo", forward_overlap_per="module"),
sequence_2D=dict(
enable=False,
head_size=2,
context_size=4,
window_size=1,
device_placement_strategy=dict(head_first=True, interleaved=False),
),
)
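# Note: sequence_2D is disabled here; per the notes above, if it were enabled,
# head_size * context_size (2 * 4 = 8) would have to equal the tensor parallel size.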

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
# feishu alert configs
alert=dict(
enable_feishu_alert=DO_ALERT,
feishu_alert_address=None, # feishu webhook to send alert message
light_monitor_address=None, # light_monitor address to send heartbeat
alert_file_path=f"llm_alter/{JOB_NAME}_alert.log",
),
tensorboard=dict(
queue_max_length=10,
),
)

# metric_dtype can be "fp32" or other strings
# only when set to "fp32" will metrics be computed in fp32
# metric_dtype = "fp32"

generation = dict(
ckpt_folder="/path/to/saved/ckpt",
output_folder="/path/to/save/generation",
batch_size=1,
eos_id=[2, 0],
bos_id=1,
max_length=100,
do_sample=True,
temperature=1.0,
top_k=50,
top_p=1.0,
repetition_penalty=1,
length_penalty=1.0,
)
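The README hunks above show the launch pattern for these configs; presumably the new config is started the same way, e.g. on a single node with 8 GPUs (illustrative — exact flags depend on the environment):

$ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/8B_internlm3.py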
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_1B.py
@@ -1,6 +1,6 @@
 # Copyright (c) InternLM. All rights reserved.

-model_type = "INTERNLM2_PUBLIC"
+model_type = "INTERNLM2"

 VOCAB_SIZE = 92544
 HIDDEN_SIZE = 2048
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_20B.py
@@ -1,6 +1,6 @@
 # Copyright (c) InternLM. All rights reserved.

-model_type = "INTERNLM2_PUBLIC"
+model_type = "INTERNLM2"

 VOCAB_SIZE = 92544
 HIDDEN_SIZE = 6144
2 changes: 1 addition & 1 deletion configs/_base_/models/internlm2_7B.py
@@ -1,6 +1,6 @@
 # Copyright (c) InternLM. All rights reserved.

-model_type = "INTERNLM2_PUBLIC"
+model_type = "INTERNLM2"

 VOCAB_SIZE = 92544
 HIDDEN_SIZE = 4096
4 changes: 4 additions & 0 deletions internlm/initialize/launch.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
+# Copyright (c) InternLM. All rights reserved.

 import argparse
 import os
@@ -307,6 +308,9 @@ def args_sanity_check():
     logger.info(f"clip_grad_norm: {clip_grad_norm}")

     model = gpc.config.model
+    if "enable_qkv_fusion" not in model:
+        model._add_item("enable_qkv_fusion", True)
+
     if "dtype" not in model:
         logger.warning("dtype is not set, use torch.float16 by default!")
         model._add_item("dtype", torch.float16)