[Model] Add the Infinity-Instruct SFT code #278

Merged (17 commits) on Dec 3, 2024

Empty file added examples/qwen/__init__.py
Empty file.
33 changes: 33 additions & 0 deletions examples/qwen/conf/config.yaml
@@ -0,0 +1,33 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs # log and checkpoint output path
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5
  cmds:
    before_start: "" # activate environment
    after_stop: ""

action: run
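
As a quick, non-authoritative sanity check of this config outside the FlagScale runner, the sketch below loads it with OmegaConf (the library Hydra-style configs are built on) and prints a few fields; the relative path assumes the FlagScale repository root as the working directory.

# Minimal inspection sketch (not part of this PR); assumes the repo root as cwd.
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/qwen/conf/config.yaml")
print(cfg.experiment.exp_name)                       # train_qwen_2.5_1.5b
print(cfg.experiment.runner.nnodes,
      cfg.experiment.runner.nproc_per_node)          # 2 nodes x 8 processes
print(OmegaConf.to_container(cfg.experiment.envs))   # NCCL/CUDA environment variables
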
30 changes: 30 additions & 0 deletions examples/qwen/conf/config_qwen2.5_1.5b.yaml
@@ -0,0 +1,30 @@
defaults:
  - _self_
  - train: train_qwen_2.5_1.5b
  # - train: train_mixtral_1.8b

experiment:
  exp_name: train_qwen_2.5_1.5b
  exp_dir: ./outputs # log and checkpoint output path
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 2
    nproc_per_node: 8
    hostfile: torchrun # Please replace with your actual hostfile path
  envs:
    CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NCCL_SOCKET_IFNAME: eth0
    NCCL_IB_DISABLE: 0
    NCCL_IB_CUDA_SUPPORT: 1
    NCCL_IB_GID_INDEX: 0
    NCCL_DEBUG: INFO
    OMP_NUM_THREADS: 4
    GLOO_SOCKET_IFNAME: eth0
    NCCL_IB_HCA: mlx5_2,mlx5_5

action: run
80 changes: 80 additions & 0 deletions examples/qwen/conf/train/qwen_2.5_1.5b.yaml
@@ -0,0 +1,80 @@
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  make_vocab_size_divisible_by: 128
  disable_bias_linear: True
  sequence_parallel: True
  use_flash_attn: True
  use_distributed_optimizer: True
  distributed_timeout_minutes: 60
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-qwen2.5-1.5B"
    wandb_exp_name: "train-qwen2.5-1.5B"
  checkpoint:
    load: ${megatron_model__path:}
    # For training, comment out ckpt_format, ckpt_convert_format, and ckpt_convert_save; they are only used for checkpoint conversion.
    ckpt_format: torch_dist # ${experiment.ckpt_format}
    ckpt_convert_format: torch # ${experiment.ckpt_convert_format}
    ckpt_convert_save: ${experiment.ckpt_convert_save}
    save_interval: 5000000
    rampup_save_interval: 50000

model:
  use_mcore_models: true
  num_layers: 28
  hidden_size: 1536
  num_attention_heads: 12
  num_query_groups: 2
  group_query_attention: True
  ffn_hidden_size: 8960
  seq_length: 4096
  max_position_embeddings: 4096
  norm_epsilon: 1e-6
  norm_init_weight: 0.02
  use_rotary_position_embeddings: true
  rotary_base: 1000000.0
  no_position_embedding: true
  reset_position_ids: true
  add_qkv_bias: true
  reset_attention_mask: true
  swiglu: true
  normalization: RMSNorm
  untie_embeddings_and_output_weights: false
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  weight_decay: 0.0
  clip_grad: 1.0
  train_samples: 1478125
  eval_iters: 0
  eval_interval: 2000000
  micro_batch_size: 1
  global_batch_size: 512
  finetune: true
  transformer_impl: transformer_engine
  seed: 42
  # data_searching_range: [1156,1274]
  optimizer:
    weight_decay: 0.0
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 1e-5
      min_lr: 0
      lr_warmup_samples: 21120
      lr_decay_style: cosine

data:
  data_path: ${data_path:??}
  split: 1
  apply_sft_dataset_separated_loss_mask_if_existed: true
  tokenizer:
    tokenizer_type: HFTokenizerFS
    tokenizer_path: ${HF_model_path:??}
    vocab_size: 151665
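
As a rough cross-check of the sample-based schedule above (not something in the PR), the sketch below converts train_samples and lr_warmup_samples into optimizer steps at global_batch_size 512; Megatron's own iteration accounting may differ slightly, so treat the numbers as approximate.

# Back-of-the-envelope schedule check (illustrative only).
import math

train_samples = 1_478_125
global_batch_size = 512
lr_warmup_samples = 21_120

total_steps = math.ceil(train_samples / global_batch_size)       # 2887
warmup_steps = math.ceil(lr_warmup_samples / global_batch_size)  # 42
print(f"~{total_steps} optimizer steps, ~{warmup_steps} of them warmup")
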
Empty file.
237 changes: 237 additions & 0 deletions examples/qwen/utils/convo_dataset.py
@@ -0,0 +1,237 @@
"""GPT style dataset."""

import copy
import hashlib
import os
import time

import numpy as np
import torch

from megatron import print_rank_0
from megatron.core import mpu
from megatron.data.data_samplers import RandomSeedDataset

class ConversationDatasetCPT(torch.utils.data.Dataset):
    def __init__(self, conversations, tokenizer, maxlen, seed, num_samples, role_sep="\n\n"):
        super(ConversationDatasetCPT, self).__init__()
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.maxlen = maxlen + 1
        self.seed = seed
        self.num_samples = num_samples

        ## TODO convo template
        self.sep = role_sep

        # rng state
        np_rng = np.random.RandomState(seed=seed)
        np_rng.shuffle(self.conversations)

    def __getitem__(self, i):
        source = self.conversations[i]

        instruction = source['instruction']
        conversations = source['conversations']

        BOS_TOKEN = self.tokenizer.cls
        EOS_TOKEN = self.tokenizer.eod
        example = [BOS_TOKEN]

        # instruction
        instruction = self.tokenizer.tokenize(f"{instruction}")
        example += instruction

        labels = [-100] * len(example)

        for conversation in conversations:
            role = conversation['from']
            content = conversation['value']
            content += self.sep

            content = self.tokenizer.tokenize(f"{content}")

            example += content
            if role == 'gpt':
                role_labels = copy.deepcopy(content)
            else:
                # masking
                role_labels = [-100] * len(content)
            labels += role_labels

        example.append(EOS_TOKEN)
        labels.append(EOS_TOKEN)

        # maxlen
        example = example[:self.maxlen]
        labels = labels[:self.maxlen]

        # padding
        delta = self.maxlen - len(example)
        if delta > 0:
            example.extend([self.tokenizer.pad] * delta)
            labels.extend([-100] * delta)

        output = {
            "tokens": np.array(example, dtype=np.int64),
            "labels": np.array(labels, dtype=np.int64),
        }
        return output

    def __len__(self):
        return len(self.conversations)


class ConversationDatasetV2(torch.utils.data.Dataset):
    def __init__(self, conversations, tokenizer, maxlen, seed, num_samples):
        super(ConversationDatasetV2, self).__init__()
        self.conversations = conversations
        self.tokenizer = tokenizer
        self.maxlen = maxlen + 1
        self.seed = seed
        self.num_samples = num_samples

        # rng state
        np_rng = np.random.RandomState(seed=seed)
        np_rng.shuffle(self.conversations)

    def __getitem__(self, i):
        from examples.aquila.utils.convo_prompt import _add_speaker_and_signal
        from examples.aquila.utils.convo_prompt import header

        #source = self.conversations[self.sample_idx[i]]
        source = self.conversations[i]
        _add_speaker_and_signal(source)

        source["chat_desc"] = header
        chat_desc = source['chat_desc']
        instruction = source['instruction']
        conversations = source['conversations']

        BOS_TOKEN = self.tokenizer.cls
        EOS_TOKEN = self.tokenizer.eod
        example = [BOS_TOKEN]

        # chat_desc
        example += self.tokenizer.tokenize(f"{chat_desc}")

        # instruction
        instruction = self.tokenizer.tokenize(f"{instruction}")
        example += instruction

        labels = copy.deepcopy(example)
        # add zero-out
        #labels = [-100] * len(example)

        for conversation in conversations:
            role = conversation['from']
            content = conversation['value']
            content = self.tokenizer.tokenize(f"{content}")
            example += content
            if role == 'gpt':
                role_labels = copy.deepcopy(content)
            else:
                # masking
                role_labels = [-100] * len(content)
            labels += role_labels

        example.append(EOS_TOKEN)
        labels.append(EOS_TOKEN)

        # maxlen
        example = example[:self.maxlen]
        labels = labels[:self.maxlen]

        # padding
        delta = self.maxlen - len(example)
        if delta > 0:
            example.extend([self.tokenizer.pad] * delta)
            labels.extend([-100] * delta)

        output = {
            "tokens": np.array(example, dtype=np.int64),
            "labels": np.array(labels, dtype=np.int64),
        }
        return output

    def __len__(self):
        #return len(self.sample_idx)
        return len(self.conversations)


def build_train_valid_test_datasets(train_valid_test_num_samples,
                                    seq_length, seed, tokenizer,
                                    train_data_prefix,
                                    valid_data_prefix,
                                    test_data_prefix=None,
                                    finetune_dataset_type=None):
    """Build train, valid, and test datasets."""
    supported_dataset_types = dict(CPT=ConversationDatasetCPT)
    dataset_cls = ConversationDatasetV2
    if finetune_dataset_type in supported_dataset_types:
        dataset_cls = supported_dataset_types[finetune_dataset_type]

    def read_file(jsonl_file):
        import jsonlines
        conversations = []
        with jsonlines.open(jsonl_file) as reader:
            for line in reader:
                conversations.append(line)
        return conversations

    train_dataset, valid_dataset, test_dataset = None, None, None
    # Single dataset.
    if train_data_prefix is not None:
        train_conversations = read_file(train_data_prefix[0])
        train_dataset = dataset_cls(
            train_conversations,
            tokenizer=tokenizer,
            maxlen=seq_length,
            seed=seed,
            num_samples=train_valid_test_num_samples[0])
        train_dataset = RandomSeedDataset(train_dataset)

    if valid_data_prefix is not None:
        valid_conversations = read_file(valid_data_prefix[0])
        valid_dataset = dataset_cls(
            valid_conversations,
            tokenizer=tokenizer,
            maxlen=seq_length,
            seed=seed,
            num_samples=train_valid_test_num_samples[1])
        valid_dataset = RandomSeedDataset(valid_dataset)

    if test_data_prefix is not None:
        test_conversations = read_file(test_data_prefix[0])
        test_dataset = dataset_cls(
            test_conversations,
            tokenizer=tokenizer,
            maxlen=seq_length,
            seed=seed,
            num_samples=train_valid_test_num_samples[2])
        test_dataset = RandomSeedDataset(test_dataset)

    return (train_dataset, valid_dataset, test_dataset)

if __name__ == "__main__":
    train_valid_test_num_samples = [12000, 2000, 0]
    seq_length = 2048
    seed = 1234
    from megatron.tokenizer.tokenizer import _AquilaTokenizer
    tokenizer = _AquilaTokenizer(
        '../examples/aquila/tokenizer/vocab.json',
        '../examples/aquila/tokenizer/merges.txt')
    print(f"{dir(tokenizer)}")
    train_data_prefix = ['path/to/train/set']
    valid_data_prefix = ['path/to/valid/set']
    train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets(
        train_valid_test_num_samples,
        seq_length, seed, tokenizer,
        train_data_prefix,
        valid_data_prefix,
        test_data_prefix=None)
    for idx, sample in enumerate(train_dataset):
        print(f"idx={idx} sample={type(sample['labels'])}")
        break
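
For readers of the diff: both dataset classes above consume JSONL records with an "instruction" string and a "conversations" list of {"from", "value"} turns, and only turns with from == "gpt" keep their labels (everything else is masked to -100). A hypothetical record, written the way read_file() expects it, could look like the sketch below; the content and file name are made up for illustration.

# Hypothetical input record for ConversationDatasetCPT / ConversationDatasetV2.
import json

record = {
    "instruction": "You are a helpful assistant.",
    "conversations": [
        {"from": "human", "value": "What is 2 + 2?"},  # masked: labels set to -100
        {"from": "gpt", "value": "2 + 2 equals 4."},   # trained on: labels kept
    ],
}

# One JSON object per line, as consumed via train_data_prefix[0] above.
with open("train.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")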
