# -----
#
# The infctx trainer is based on pytorch lightning - and uses the following yaml config file format
# For many of the undocumented trainer parameters / settings in this example, additional details can be found in the pytorch lightning documentation
# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.cli.LightningCLI.html
#
# For better dataset specific configuration, you may want to see the various notebooks in `notebook/dataset-config`
#
# -----
# Use either `true`, or an integer value to set a fixed training seed.
seed_everything: true
trainer:
# Configure the number of GPUs available on your machine
# auto means it will automatically detect and use all GPUs
accelerator: gpu
devices: auto
num_nodes: 1
#
# Configure the deepspeed strategy, we recommend you start with `deepspeed_stage_2_offload`
# and adjust from there according to your training needs. `deepspeed_stage_3_offload` is useful
# for training LoRA on large models on a single GPU.
#
# In general you would want to use the following:
#
# - deepspeed_stage_1 : Each of your GPUs has too much vram, and you do not know what to do
#
# - deepspeed_stage_2 : Optimal distributed training strategy, across multiple GPUs each with sufficient vram
# - deepspeed_stage_2_offload : Reduce vram usage by offloading the optimizer state and work to the cpu
#
# - deepspeed_stage_3 : Split up the model across multiple GPUs, useful for large models, at a performance cost
# - deepspeed_stage_3_offload : Additional offloading, at an even greater performance cost
#
# For more details see:
# https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2
#
strategy: deepspeed_stage_2_offload
# Floating point precision for the model. Because RWKV is built FOR bf16,
# you should pretty much never change this setting
precision: bf16
# Logger settings for wandb. If you want to enable wandb, uncomment the whole logger section
# ---
# logger:
# class_path: lightning.pytorch.loggers.WandbLogger
# init_args:
# # In most cases, all you would want to modify is name/project/tags
# # or the run ID / resume flag
# name: null
# project: RWKV_training
# tags: ['RWKV']
# id: null
# resume: null
#
# # The rest are advanced WANDB settings, which in most cases you can remove/ignore
# save_dir: .
# version: null
# offline: false
# dir: null
# anonymous: null
# log_model: false
# experiment: null
# prefix: ''
# checkpoint_name: null
# job_type: null
# config: null
# entity: null
# reinit: null
# group: null
# notes: null
# magic: null
# config_exclude_keys: null
# config_include_keys: null
# mode: null
# allow_val_change: null
# force: null
# tensorboard: null
# sync_tensorboard: null
# monitor_gym: null
# save_code: null
# settings: null
# Checkpoint settings for the training process
callbacks:
class_path: lightning.pytorch.callbacks.ModelCheckpoint
init_args:
# Configure this to the path you want to save your checkpoints to
# note that a subdir will be created with the name `epoch=x-step=y.ckpt`
#
# to convert a checkpoint to a model, you can use the
# `python3 export_checkpoint.py <checkpoint path>` script,
# which will create a `rwkv_model.pth` in the checkpoint directory.
#
# Do not use the `zero_to_fp32.py` script as that will have export format issues
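#
# For example (hypothetical checkpoint path, adjust to your own dirpath below):
# `python3 export_checkpoint.py /path/to/your/checkpoint/dir/last.ckpt/`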
dirpath: /path/to/your/checkpoint/dir
filename: null
# Save the top/last K checkpoints
save_top_k: 3
# Choose the most recent checkpoints by steps
monitor: 'step'
mode: max
# If enabled (true), save a copy of the latest checkpoint to 'last.ckpt'
# useful to simplify checkpoint resume scripts, at the price of some disk performance
save_last: true
# DO NOT set this to true, as the exported model weights will have format issues
# export as a checkpoint instead, and use the `export_checkpoint.py` script to convert it to a model
save_weights_only: false
# How frequently you want to save a checkpoint, in training steps.
# This will happen for every X data samples, where X = every_n_train_steps * accumulate_grad_batches
#
# In general you will want to avoid putting a low number (especially if accumulate_grad_batches <= 100)
# as the checkpoint process will pause all the gpu training for some time, slowing down the overall process.
# However you do not want to configure too high of a number either, or you will lose too much progress if the training crashes
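#
# For example (hypothetical numbers): with accumulate_grad_batches of 8,
# every_n_train_steps: 100 saves a checkpoint roughly every 8 * 100 = 800 data samples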
every_n_train_steps: 100
every_n_epochs: null
save_on_train_epoch_end: true
train_time_interval: null
# Other pytorch lightning settings, which in most cases you can remove/ignore
# ---
# verbose: false
# auto_insert_metric_name: true
########################################
## Training run parameter settings
########################################
# Generally what you want to configure is the maximum number of epochs
# Leave it as -1, and it will keep going forever till interrupted
# Or set it as a number, and it will stop after that number of epochs
max_epochs: 1
min_epochs: null
max_steps: -1
min_steps: null
max_time: null
# Number of data samples to train on for each step. A data sample is considered
# a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
#
# This decides the number of data samples to learn from together, before backpropagating
# any weight changes at the end of the batch.
#
# `1 trainer/global_step = accumulate_grad_batches * number of GPU devices * number of nodes`
#
# Recommended to be a big enough number (like 128/256) to prevent the training
# loss from fluctuating in the process. But not too big of a number, where the increased
# GPU vRAM / offloaded RAM usage will cause the training to crash.
#
# You are also recommended to configure this to a large enough number to fully utilize
# your GPU processing time %, and avoid idle time for the GPU between batches
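#
# Worked example (hypothetical setup): on 1 node with 4 GPUs and target_batch_size: 32,
# accumulate_grad_batches would be computed as 32 / 4 = 8, so one trainer/global_step
# covers 8 * 4 * 1 = 32 data samples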
target_batch_size: 32
# You can alternatively set the accumulate_grad_batches per GPU directly
# (not recommended)
#
# You can only either use target_batch_size, which would auto-compute this value for you
# based on the number of GPUs you have, or have this value set directly - not both
#
# `target_batch_size ~= number of GPUs * accumulate_grad_batches`
#
# In the event that target_batch_size cannot be "perfectly divided" by the number of GPUs,
# this number is rounded down, with a minimum of 1
# ---
# accumulate_grad_batches: 256
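# For example (hypothetical): target_batch_size: 32 on 3 GPUs cannot be perfectly divided,
# so accumulate_grad_batches would be rounded down to floor(32 / 3) = 10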
# Various other pytorch lightning settings, which in most cases you can remove/ignore
# ---
# fast_dev_run: false
# limit_train_batches: null
# limit_val_batches: null
# limit_test_batches: null
# limit_predict_batches: null
# overfit_batches: 0.0
# val_check_interval: null
# check_val_every_n_epoch: 1
# num_sanity_val_steps: 0
# log_every_n_steps: 1
# enable_checkpointing: null
# enable_progress_bar: null
# enable_model_summary: null
# gradient_clip_val: 1.0
# gradient_clip_algorithm: null
# deterministic: null
# benchmark: null
# inference_mode: true
# use_distributed_sampler: true
# profiler: null
# detect_anomaly: false
# barebones: false
# plugins: null
# sync_batchnorm: false
# reload_dataloaders_every_n_epochs: 0
# default_root_dir: null
########################################
## Training model settings
########################################
model:
# Model to start the finetune/training process from
load_model: /path/to/your/model.pth
# Context length to use for the training process
# the larger the number (and batch size) the larger the vram usage
#
# Note that if the data sample context length is larger than the ctx_len,
# its training process would be split into ctx_len sized chunks.
#
# This allows the training of extremely large context lengths (eg. 100k),
# without eating up too much vram, by keeping the training context length
# to a reasonable number suitable for the current GPU setup
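#
# For example (hypothetical): a 100k token data sample trained with ctx_len: 2048
# would be processed as roughly 49 sequential chunks of 2048 tokens each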
ctx_len: 2048
# Learning rate of the training process
# ---
# Initial learning rate of the process
lr_init: 6e-4
# Final learning rate after the learning rate period
# learning rate will stay at final value from then onwards
#
# NOTE: lr_final / lr_period does not work with warmup_steps
# and will be ignored (or replaced) with the warmup_steps logic instead
lr_final: 4e-4
# Number of epoch to reduce the learning rate from lr_init to lr_final
# 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
# 0 means lr_final will apply immediately
# -1 means we take the current max_step / max_epoch as the period
lr_period: -1
# lr_period type, if it is set; defaults to epoch
lr_period_type: epoch
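# For example (hypothetical schedule sketch, using the values above): to decay from
# lr_init to lr_final over the first epoch, then hold at lr_final afterwards:
# ---
# lr_init: 6e-4
# lr_final: 4e-4
# lr_period: 1
# lr_period_type: epoch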
# Back Propagation through time, used to work around training of large context length
# beyond what can be supported by the current GPU vram architecture
#
# This is not 1:1 equivalent to the same training process with full vram,
# as the training process is split into multiple segments, part by part,
# with limited learning carried over from each segment.
bptt_learning: true
# Segmented range to perform backprop learning on
# 1 means to apply only for the last segment
# -1 means to apply for all segments
#
# For multi-gpu training, when possible, use a fixed value
# to reduce gpu sync overhead, especially when used with a fixed dataset context length.
# For mixed dataset sizes, -1 is a reasonable trade-off
bptt_learning_range: -1
# Limits the bptt learning only to the "current" chunk
# being learned within the learning range. While this reduces the effectiveness
# of bptt, it also further reduces vram requirements.
#
# This is also known as tbptt (Truncated Back Propagation through time)
bptt_truncated_learning: false
# Aggressively clear the cuda cache between each data sample.
# This causes a performance penalty, but reduces the vram pressure
#
# This is useful for mitigating the following memory pressure warning
# `1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance...`
substep_cuda_cache_clear: false
# The model size setting, to use against the current model
# It is recommended not to configure this, and simply use auto-detection.
# Otherwise, this is useful to double check the model settings prior to running
#
# For your current model settings, refer to the model card
# of the downloaded model for more details
# ---
# n_embd: 768
# n_layer: 12
# vocab_size: 50277
# Experimental cutoff settings
# ---
# Data samples would be cut down to the respective max ctx_len_cutoffs
# values if they are larger than ctx_len. If the data sample is larger than
# the largest len_cutoff, the remaining data will be discarded
#
# Leave it as a blank array to disable the feature
# ---
# ctx_len_cutoffs: []
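# Example (hypothetical values, based on the description above): cut oversized
# samples down to at most 4096 tokens, discarding anything beyond that
# ctx_len_cutoffs: [4096]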
# ---
# Experimental settings, number of tokens to skip in the data sample
# prefix, for the respective cutoff length. Used to speed up the process
#
# Leave it as a blank array to disable the feature
# ---
# ctx_len_warmup_steps: []
# torch.set_float32_matmul_precision, used to optimize operations with tensor cores
# this should be set to null for GPUs without cuda tensor cores. Has no major impact AFAIK
# ---
# torch_set_float32_matmul_precision: 'high'
# Adam optimizer settings
# You probably want to leave this alone, unless you know what you are doing
# ---
# beta1: 0.9
# beta2: 0.99
# adam_eps: 1.0e-08
# weight_decay: 0.01
# various other model training settings you probably should leave alone
# ---
# grad_cp: true
# warmup_steps: -1
# layerwise_lr: true
# dim_att: null
# dim_ffn: null
data:
# dataset_path for the prebuilt dataset, using HF `load_from_disk()`
#
# Use this if you have built your own dataset and saved it with `save_to_disk()`
# with source left as null. Otherwise, configure this to a directory in which the
# dataset will be built and tokenized by the huggingface dataset process.
#
# If using relative path, this should be relative to the trainer script path
data_path: /path/to/store/your/data_path/
# Otherwise, provide the source path, which is used as the huggingface dataset path
# this will be used to populate the dataset_path
#
# Use one of the following:
# - hugging face dataset
# - local dataset mode (ie: text, json, csv - then use source_data_dir to configure the path)
# - null
#
# If source is disabled, all other params except data_path are effectively ignored,
# as the dataset building process is skipped. You are expected to prepare a huggingface
# compliant dataset yourself, and save it to the data_path
source: null
# source: "teven/enwiki_00k" # Hugging face dataset
# source: text # Text mode, used with source_data_dir
# Additional source dataset params, used to grab subsets of the dataset
# ---
# source_dataset_params:
# language: en
# Use source_data_dir, if you are using source=text/json/etc
# If using relative path, this should be relative to the trainer script path
# source_data_dir: ../dataset-text/
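# Example (hypothetical local text dataset setup, combining the settings above with
# the text_rechunk_size option documented further below):
# source: text
# source_data_dir: ../dataset-text/
# text_rechunk_size: 2048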
# After loading the dataset, split out test data used for validation.
# This process is skipped if the dataset includes a test split
#
# If given a float value, a percentage of the dataset is used (1.0 being 100%)
# If given an int value, that number of data samples is used.
#
# Due to the limitations in the trainer process, there is always a minimum of 1 test sample
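#
# For example: test_split: 0.01 reserves 1% of the dataset for validation,
# while test_split: 100 would reserve 100 data samples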
test_split: 0.01
test_split_shuffle: true
# Tokenizer to use: either the inbuilt 'neox' or 'world' tokenizer
# If using a custom tokenizer, provide the HF tokenizer name/path
# ---
tokenizer: neox
# Minimum / Maximum token size of the dataset to use
# useful for filtering out small noisy data samples from large datasets
# (eg. removal of small articles of less than 1024 tokens from wikipedia)
#
# This is ignored, if set to -1
# ---
# min_token_size: 1024
# max_token_size: -1
# Rechunking of the text dataset. This is done only when source is set as 'text',
# and will merge the various sentences into larger chunks, up to the target size
#
# Defaults to 2048
#
# This is ignored, if source is not set as text (unless text_rechunk_force)
# This is ignored, if set to zero
# ---
# text_rechunk_size: 2048
# Apply text rechunk to the dataset, even if it's not a 'text' source
# This is done only after dataset filtering, and if source is not 'text'
# ---
# text_rechunk_force: True
# Custom text column to use, useful for datasets with alternative training column labels
# This is checked before multi column merging, default is null (disabled)
# eg: 'code'
# ---
# custom_text_key: 'code'
# Multi Column merging process, default setting is used to support and merge
# "instruction", "input", "output", datasets. To disable set multi_column_keys to []
#
# A minimum of 2 columns is required, with non-empty data, for the merge to occur
# If no match is found, this will fall back to the default prompt/completion or text column,
# or throw an error if the default fallback is not found
# ---
# multi_column_keys: ['instruction', 'input', 'output']
# multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
# multi_column_train_mask: [true, false, true]
# multi_column_separator: '\n\n'
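# With the example settings above, each row would be merged (roughly, assuming the
# prefixes and separator are applied in column order) into:
# Instruction:\n<instruction>\n\nInput:\n<input>\n\nOutput:\n<output>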
# If processing prompt/completion jsonl pairs, the prompt is masked by default
# use this flag to disable this default behaviour
# ---
# disable_prompt_completion_mask: false
# Path to the current checkpoint to continue training from
# this should be the directory path, ending with `.ckpt/`
ckpt_path: null