Merge branch 'huggingface:main' into main
staghado authored May 8, 2024
2 parents c9f9760 + 6629dfa commit f6e95ac
Showing 54 changed files with 3,475 additions and 407 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -162,4 +162,4 @@ cython_debug/
.vscode

checkpoints/
-wandb/*
+wandb/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -33,4 +33,4 @@ repos:
- id: codespell
args:
- -w
-- --ignore-words-list=nd,reacher,thist,ths,magent,ba,fo
+- --ignore-words-list=nd,reacher,thist,ths,magent,ba,fo,doesnt
5 changes: 5 additions & 0 deletions README.md
@@ -44,6 +44,7 @@ We support the following:
- ZeRO-1 optimizer
- FP32 gradient accumulation
- Parameter tying/sharding
+- Spectral µTransfer parametrization for scaling up neural networks

# Installation

@@ -111,6 +112,10 @@ Features we would like to add:
- `scripts/log_lighteval_to_wandb.py`: logs the evaluation results of LightEval to wandb, including summary statistics.


+# Environment Variables
+- `NANOTRON_BENCHMARK=1`: if you want to log the throughput during training


# Credits

We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration: Nvidia for `Megatron-LM/apex`, Microsoft for `DeepSpeed`, HazyResearch for `flash-attn`
45 changes: 29 additions & 16 deletions examples/config_tiny_llama.py
@@ -2,6 +2,7 @@
import os

from nanotron.config import (
+AdamWOptimizerArgs,
CheckpointsArgs,
Config,
DataArgs,
@@ -62,11 +63,13 @@
weight_decay=0.01,
clip_grad=1.0,
accumulate_grad_in_fp32=True,
-adam_eps=1e-08,
-adam_beta1=0.9,
-adam_beta2=0.95,
-torch_adam_is_fused=True,
learning_rate_scheduler=learning_rate,
+optimizer_factory=AdamWOptimizerArgs(
+    adam_eps=1e-08,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    torch_adam_is_fused=True,
+),
)

parallelism = ParallelismArgs(
@@ -78,30 +81,40 @@
tp_linear_async_communication=True,
)

-tokens = TokensArgs(sequence_length=32, train_steps=10, micro_batch_size=2, batch_accumulation_per_replica=1)
+tokens = TokensArgs(sequence_length=256, train_steps=15, micro_batch_size=2, batch_accumulation_per_replica=1)

-dataset = PretrainDatasetsArgs(
-    hf_dataset_or_datasets="HuggingFaceH4/testing_alpaca_small", text_column_name="completion"
-)
+data_stages = [
+    DatasetStageArgs(
+        name="Stable Training Stage",
+        start_training_step=1,
+        data=DataArgs(
+            dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="stas/openwebtext-10k", text_column_name="text"),
+            seed=seed,
+        ),
+    ),
+    DatasetStageArgs(
+        name="Annealing Phase",
+        start_training_step=10,
+        data=DataArgs(
+            dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="stas/openwebtext-10k", text_column_name="text"),
+            seed=seed,
+        ),
+    ),
+]

-checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints"
+checkpoints_path = "./checkpoints"
os.makedirs(checkpoints_path, exist_ok=True)

config = Config(
general=GeneralArgs(project="debug", run="tiny_llama_%date_%jobid", seed=seed),
checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10),
parallelism=parallelism,
model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config),
tokenizer=TokenizerArgs("gpt2"),
tokenizer=TokenizerArgs("robot-test/dummy-tokenizer-wordlevel"),
optimizer=optimizer,
logging=LoggingArgs(),
tokens=tokens,
-data_stages=[
-    DatasetStageArgs(
-        name="Stable Training Stage", start_training_step=1, data=DataArgs(dataset=dataset, seed=seed)
-    ),
-    DatasetStageArgs(name="Annealing Phase", start_training_step=10, data=DataArgs(dataset=dataset, seed=seed)),
-],
+data_stages=data_stages,
profiler=None,
)

28 changes: 15 additions & 13 deletions examples/config_tiny_llama.yaml
@@ -1,6 +1,6 @@
checkpoints:
checkpoint_interval: 10
-checkpoints_path: /fsx/nouamane/projects/nanotron/checkpoints
+checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
@@ -10,9 +10,9 @@ data_stages:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
-hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+hf_dataset_or_datasets: stas/openwebtext-10k
hf_dataset_splits: train
-text_column_name: completion
+text_column_name: text
num_loading_workers: 1
seed: 42
name: Stable Training Stage
@@ -22,9 +22,9 @@ data_stages:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
-hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+hf_dataset_or_datasets: stas/openwebtext-10k
hf_dataset_splits: train
-text_column_name: completion
+text_column_name: text
num_loading_workers: 1
seed: 42
name: Annealing Phase
@@ -69,19 +69,21 @@ model:
vocab_size: 256
optimizer:
accumulate_grad_in_fp32: true
-adam_beta1: 0.9
-adam_beta2: 0.95
-adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.0003
lr_decay_starting_step: null
-lr_decay_steps: 8
+lr_decay_steps: 13
lr_decay_style: cosine
lr_warmup_steps: 2
lr_warmup_style: linear
min_decay_lr: 1.0e-05
-torch_adam_is_fused: true
+optimizer_factory:
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  adam_eps: 1.0e-08
+  name: adamW
+  torch_adam_is_fused: true
weight_decay: 0.01
zero_stage: 0
parallelism:
@@ -95,13 +97,13 @@
profiler: null
tokenizer:
tokenizer_max_length: null
-tokenizer_name_or_path: gpt2
+tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 1
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 2
-sequence_length: 32
-train_steps: 10
+sequence_length: 256
+train_steps: 15
val_check_interval: -1
8 changes: 7 additions & 1 deletion examples/contributor-guide/debug_config_tiny_llama.py
@@ -5,6 +5,7 @@
CheckpointsArgs,
Config,
DataArgs,
+DatasetStageArgs,
GeneralArgs,
LlamaConfig,
LoggingArgs,
@@ -95,7 +96,12 @@
optimizer=optimizer,
logging=LoggingArgs(),
tokens=tokens,
-data=DataArgs(dataset=dataset, seed=seed),
+data_stages=[
+    DatasetStageArgs(
+        name="Stable Training Stage", start_training_step=1, data=DataArgs(dataset=dataset, seed=seed)
+    ),
+    DatasetStageArgs(name="Annealing Phase", start_training_step=10, data=DataArgs(dataset=dataset, seed=seed)),
+],
profiler=None,
)

39 changes: 25 additions & 14 deletions examples/contributor-guide/debug_config_tiny_llama.yaml
@@ -1,23 +1,34 @@
checkpoints:
checkpoint_interval: 10
-checkpoints_path: /fsx/ferdinandmom/ferdinand-hf/nanotron/examples/checkpoints
+checkpoints_path: /fsx/haojun/nanotron_latest/examples/checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false

data_stages:
-- name: General purpose training
-  start_training_step: 1
-  data:
-    dataset:
-      dataset_overwrite_cache: false
-      dataset_processing_num_proc_per_process: 1
-      hf_dataset_config_name: null
-      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
-      hf_dataset_splits: train
-      text_column_name: completion
-    num_loading_workers: 1
-    seed: 42
+- data:
+    dataset:
+      dataset_overwrite_cache: false
+      dataset_processing_num_proc_per_process: 1
+      hf_dataset_config_name: null
+      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+      hf_dataset_splits: train
+      text_column_name: completion
+    num_loading_workers: 1
+    seed: 42
+  name: Stable Training Stage
+  start_training_step: 1
+- data:
+    dataset:
+      dataset_overwrite_cache: false
+      dataset_processing_num_proc_per_process: 1
+      hf_dataset_config_name: null
+      hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+      hf_dataset_splits: train
+      text_column_name: completion
+    num_loading_workers: 1
+    seed: 42
+  name: Annealing Phase
+  start_training_step: 10
general:
benchmark_csv_path: null
consumed_train_samples: null
39 changes: 39 additions & 0 deletions examples/custom-dataloader/README.md
@@ -0,0 +1,39 @@
# Use a custom dataloader with Nanotron

This example shows how to use a custom dataloader with Nanotron. We will use a simple dataloader that loads a random tokenized dataset and feeds it to a Nanotron model.
https://github.com/huggingface/nanotron/blob/2e21db0db46a40bedbd03714616dd0ae4ea75914/examples/custom-dataloader/run_train.py#L72-L84
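
As a rough, hypothetical sketch of what such a dataloader could look like (the class name and the `vocab_size`/`sequence_length` values below are made up for illustration, not taken from `run_train.py`):

```python
# Hypothetical sketch of a random tokenized dataset (not copied from run_train.py).
import torch
from torch.utils.data import DataLoader, Dataset


class RandomTokenDataset(Dataset):
    """Yields random token ids, handy for smoke-testing the training loop."""

    def __init__(self, vocab_size: int = 256, sequence_length: int = 256, num_samples: int = 10_000):
        self.vocab_size = vocab_size
        # One extra token per sample so a collator can build shifted next-token labels.
        self.sequence_length = sequence_length
        self.num_samples = num_samples

    def __len__(self) -> int:
        return self.num_samples

    def __getitem__(self, index: int) -> dict:
        return {"input_ids": torch.randint(0, self.vocab_size, (self.sequence_length + 1,))}


# In practice the collate_fn would be something like DataCollatorForCLM (see below).
dataloader = DataLoader(RandomTokenDataset(), batch_size=2, shuffle=False)
```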

`DataCollatorForCLM` is a custom data collator that takes a list of `input_ids` and returns a dictionary containing the `input_ids` and the labels on the ranks that need them. For example, `input_ids` are only needed on the first PP rank, while `labels` are only needed on the last PP rank.
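
For intuition, here is a simplified, hypothetical collator in that spirit; it is not the actual `DataCollatorForCLM` implementation, and the `is_input_rank`/`is_output_rank` flags and output key names stand in for the pipeline-parallel checks nanotron derives from its parallel context:

```python
# Simplified, hypothetical CLM collator; key names and rank flags are illustrative.
from typing import Dict, List

import torch


def clm_collate(examples: List[Dict[str, torch.Tensor]], is_input_rank: bool, is_output_rank: bool) -> dict:
    # Each example carries sequence_length + 1 token ids; shifting by one position
    # gives next-token-prediction labels of length sequence_length.
    tokens = torch.stack([example["input_ids"] for example in examples])
    batch = {}
    if is_input_rank:
        batch["input_ids"] = tokens[:, :-1]  # fed to the first PP rank
    if is_output_rank:
        batch["label_ids"] = tokens[:, 1:]  # consumed by the loss on the last PP rank
    return batch
```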

To test it out, set the dataset to `null` in your config so the custom dataloader is used instead (example: [config_custom_dl.yaml](config_custom_dl.yaml)):
```yaml
- data:
dataset: null # Custom dataloader will be used
num_loading_workers: 1
seed: 42
name: Stable Training Stage
start_training_step: 1
```
To try it out, you can run the following command:
```bash
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=2 examples/custom-dataloader/run_train.py --config-file examples/custom-dataloader/config_custom_dl.yaml
```

## Troubleshooting

### `return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)`
```
File "/fsx/nouamane/projects/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 284, in forward
out = super().forward(masked_input)
File "/fsx/nouamane/miniconda/envs/2-1-cu121/lib/python3.10/site-packages/torch/nn/modules/sparse.py", line 162, in forward
return F.embedding(
File "/fsx/nouamane/miniconda/envs/2-1-cu121/lib/python3.10/site-packages/torch/nn/functional.py", line 2233, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```

If you encounter an error with `torch.embedding`, you are most likely feeding a token id that is larger than the model's vocabulary size. Check your model's `vocab_size` and your tokenizer.
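
As an illustrative sanity check (not part of nanotron), you can verify that every token id in a batch fits inside the embedding table before it reaches the model:

```python
# Illustrative check: all token ids must lie in [0, vocab_size - 1].
import torch


def check_token_ids(input_ids: torch.Tensor, vocab_size: int) -> None:
    min_id, max_id = int(input_ids.min()), int(input_ids.max())
    if min_id < 0 or max_id >= vocab_size:
        raise ValueError(
            f"Token ids must lie in [0, {vocab_size - 1}], got min={min_id}, max={max_id}; "
            "the tokenizer's vocabulary is probably larger than the model's vocab_size."
        )
```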