Skip to content

Commit

Permalink
Merge branch 'main' into ladder-1xC
Browse files Browse the repository at this point in the history
  • Loading branch information
AkshitaB authored Nov 14, 2024
2 parents 15d6cea + 31c385f commit c10deab
Show file tree
Hide file tree
Showing 74 changed files with 35,130 additions and 43 deletions.
1 change: 0 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,6 @@ jobs:
constraints:
cluster:
- ai2/general-cirrascale
- ai2/general-cirrascale-a100-80g-ib
- ai2/allennlp-cirrascale
envVars:
- name: COMMIT_SHA
Expand Down
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- A bunch of annealing configs
- `constant_with_warmup` learning rate schedule
- `one_in_eight` configuration for activation checkpointing
- New tokenizer in the source instead of from huggingface
- Improved support for GCS


## [v0.5.1](https://github.com/allenai/OLMo/releases/tag/v0.5.1) - 2024-10-17

### Added

- Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`.
- Added support for flash attention and gradient checkpointing to `hf_olmo`.
- Added to `scripts.compare_wandb_configs.py` the ability to more easily compare differences in data mixes and evaluation tasks.
- Added `effective_n_kv_heads` to OLMoConfig for hacky VLLM support.

## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26

Expand Down
206 changes: 206 additions & 0 deletions configs/annealing/OLMo-7B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
run_name: OLMo-7B
seed: 6198
dry_run: false

wandb:
name: ${run_name}
project: olmo-medium
group: OLMo-7B-annealing # TODO: change to what you like

model:
d_model: 4096
n_heads: 32
n_layers: 32
mlp_hidden_size: 22016
weight_tying: false
alibi: false
rope: true
flash_attention: true
attention_dropout: 0.0
attention_layer_norm: false
multi_query_attention: false
include_bias: false
block_type: sequential
layer_norm_type: default
layer_norm_with_affine: false
bias_for_layer_norm: false
attention_layer_norm_with_affine: false
activation_type: swiglu
residual_dropout: 0.0
embedding_dropout: 0.0
max_sequence_length: 2048
vocab_size: 50280
embedding_size: 50304
eos_token_id: 50279
pad_token_id: 1
init_device: meta
init_fn: mitchell

compile:
fullgraph: false

optimizer:
name: adamw
learning_rate: 3.0e-4 # TODO: change to your peak learning
weight_decay: 0.1
betas:
- 0.9
- 0.95
metrics_log_interval: 10

scheduler: # TODO: change to what you want
name: linear_with_warmup
t_warmup: 100
alpha_f: 0.1

tokenizer:
identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
truncate_direction: right

save_folder: runs/${run_name}
remote_save_folder: s3://ai2-llm/checkpoints/oe-data-annealing/${run_name}
save_overwrite: true
# Sharded checkpoints (best for restarts)
save_interval: 500
save_num_checkpoints_to_keep: -1
# Unsharded checkpoints (for final storage)
save_interval_unsharded: null
save_num_unsharded_checkpoints_to_keep: -1

restore_dataloader: false # TODO: this should only be 'false' initially

load_path: /net/nfs/allennlp/llm-checkpoints/step551000-unsharded #TODO: change this

max_duration: null
global_train_batch_size: 2048 # TODO: adjust as needed
device_train_microbatch_size: 2 # TODO: adjust as needed
time_limit: null

precision: amp_bf16

fsdp:
wrapping_strategy: by_block
precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
window_size: 1

eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}
evaluators:
- label: v3-small-ppl-validation
data:
num_workers: 0
drop_last: true
datasets:
v3-small-c4_en-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/c4_en/val/part-0-00000.npy
v3-small-dolma_books-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_books/val/part-0-00000.npy
v3-small-dolma_common-crawl-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_common-crawl/val/part-0-00000.npy
v3-small-dolma_pes2o-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_pes2o/val/part-0-00000.npy
v3-small-dolma_reddit-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_reddit/val/part-0-00000.npy
v3-small-dolma_stack-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_stack/val/part-0-00000.npy
v3-small-dolma_wiki-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/dolma_wiki/val/part-0-00000.npy
v3-small-ice-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/ice/val/part-0-00000.npy
v3-small-m2d2_s2orc-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/m2d2_s2orc/val/part-0-00000.npy
v3-small-pile-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/pile/val/part-0-00000.npy
v3-small-wikitext_103-validation:
- r2://olmo-data/eval-data/perplexity/v3_small_gptneox20b/wikitext_103/val/part-0-00000.npy

- label: v2-small-ppl-validation
data:
num_workers: 0
drop_last: true
datasets:
v2-small-4chan-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/4chan/val.npy
v2-small-c4_100_domains-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/c4_100_domains/val.npy
v2-small-c4_en-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/c4_en/val.npy
v2-small-gab-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/gab/val.npy
v2-small-ice-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/ice/val.npy
v2-small-m2d2_s2orc-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/m2d2_s2orc/val.npy
v2-small-m2d2_wiki-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/m2d2_wiki/val.npy
v2-small-manosphere-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/manosphere/val.npy
v2-small-mc4_en-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/mc4_en/val.npy
v2-small-pile-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/pile/val.npy
v2-small-ptb-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/ptb/val.npy
v2-small-twitterAEE-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/twitterAEE/val.npy
v2-small-wikitext_103-validation:
- r2://olmo-data/eval-data/perplexity/v2_small_gptneox20b/wikitext_103/val.npy

##########################
# Downstream evaluations #
##########################
- label: piqa
type: downstream

- label: hellaswag
type: downstream

- label: winogrande
type: downstream

- label: openbook_qa
type: downstream

# - label: boolq # requires implemention of the pmi_dc matrix
# type: downstream

- label: sciq
type: downstream

- label: arc_easy
type: downstream

# - label: arc_challenge # requires implemention of the pmi_dc matrix
# type: downstream

- label: copa
type: downstream

- label: rte
type: downstream

- label: commitment_bank
type: downstream

- label: mrpc
type: downstream

- label: sst2
type: downstream

data:
pad_direction: right
num_workers: 8
drop_last: true
pin_memory: true
prefetch_factor: 2
persistent_workers: true
timeout: 0
paths:
- s3://ai2-llm/data/... # TODO: update these paths
Loading

0 comments on commit c10deab

Please sign in to comment.