
Commit 13e6cda: added doc

zhenghh04 committed Feb 6, 2025
1 parent 84e8295 commit 13e6cda
Showing 2 changed files with 66 additions and 29 deletions.
21 changes: 13 additions & 8 deletions dlio_benchmark/checkpointing/base_checkpointing.py
@@ -58,12 +58,7 @@ def __init__(self, ext):
         else:
             self.rank_to_checkpoint = 0
         if self.rank_to_checkpoint == self.args.my_rank:
-            self.model_state = None
-            if self.args.model_size > 0:
-                self.model_state = {"a": self.get_tensor(self.args.model_size)}
-                self.checkpoint_size += self.args.model_size
-                if self.args.my_rank == 0:
-                    logging.info(f"{utcnow()} model state defined")
+
             if len(self.args.optimization_groups) > 0:
                 self.optimization_groups_predefined = True
             else:
@@ -93,7 +88,7 @@ def __init__(self, ext):
                     self.checkpoint_size += state
                     self.optimization_state[str(index)] = self.get_tensor(state)
             if self.args.my_rank == 0:
-                logging.info(f"{utcnow()} Optimizer state defined")
+                logging.info(f"{utcnow()} Optimizer state defined: {self.checkpoint_size / 1024./1024./1024} GB per rank")
             # layer state
             self.layer_state = None
             start_layer, end_layer = self.get_layer_index(self.args.my_rank, self.tp, self.pp, self.args.num_layers)
@@ -114,12 +109,22 @@ def __init__(self, ext):
                 self.layer_state[str(layer_index)], size = self.get_layer_state(layer_index)
                 ss += size
             if self.args.my_rank == 0:
-                logging.info(f"{utcnow()} Layer states defined!")
+                logging.info(f"{utcnow()} Layer states defined! {ss/1024./1024./1024} GB per rank")
 
+            self.model_state = None
+            if self.args.model_size > 0:
+                self.model_state = {"a": self.get_tensor(self.args.model_size)}
+                if self.args.my_rank == 0:
+                    logging.info(f"{utcnow()} Model state defined")
 
+            ss = self.comm.allreduce(ss)/1024./1024./1024.
+            opt = self.comm.allreduce(self.checkpoint_size)/1024./1024./1024.
+            if self.args.zero_stage < 3:
+                ss /= self.dp
+            self.checkpoint_size = ss + opt
 
             if self.args.my_rank == 0:
                 logging.info(f"{utcnow()} Total state size: {ss} GB")
                 logging.info(f"{utcnow()} Total checkpoint size: {self.checkpoint_size} GB")
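A worked example of the new size accounting (numbers hypothetical): with dp = 4 data-parallel ranks, zero_stage = 1, and each rank holding ss = 2 GB of layer state plus 1 GB of optimizer state, the two allreduce calls yield 8 GB of layer state and 4 GB of optimizer state across all ranks. Since zero_stage < 3, the layer state is divided by dp (it is replicated within the data-parallel group in that case), so the reported totals are 8/4 = 2 GB of state and 2 + 4 = 6 GB of checkpoint size.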
74 changes: 53 additions & 21 deletions docs/source/config.rst
@@ -57,6 +57,59 @@ More built-in examples can be found in the `workload`_ folder. One can also crea
 model
 ------------------
 One can specify the name of the model and its parameters as follows.
+
+.. list-table::
+   :widths: 15 10 30
+   :header-rows: 1
+
+   * - Parameter
+     - Default
+     - Description
+   * - name
+     - default
+     - The name of the model
+   * - model_size
+     - 10240
+     - The size of the model parameters per GPU in bytes
+   * - optimization_groups
+     - []
+     - List of optimization group tensors. Use array notation for YAML.
+   * - num_layers
+     - 1
+     - Number of layers to checkpoint. Each layer is checkpointed separately.
+   * - layer_parameters
+     - []
+     - List of parameters per layer. This is used to perform I/O per layer.

+In the ``model`` section, one can define ``parallelism``, which has three variables: ``tensor``, ``pipeline``, and ``zero_stage``.
+By default, ``zero_stage=-1``, in which case there is no sharding at all. If ``zero_stage=3``, all the model and optimizer states are sharded across
+the data parallel group.

+.. list-table::
+   :widths: 15 10 30
+   :header-rows: 1
+
+   * - Parameter
+     - Default
+     - Description
+   * - tensor
+     - 1
+     - Tensor parallelism for the model. Used to determine the number of layer model files.
+   * - pipeline
+     - 1
+     - Pipeline parallelism for the model.
+   * - zero_stage
+     - -1
+     - ZeRO stage [-1|1|2|3]. Default: -1.

+For the transformer architecture, one can define ``transformer`` under ``model``,
+which has three parameters:

+.. list-table::
+   :widths: 15 10 30
+   :header-rows: 1
+
+   * - Parameter
+     - Default
+     - Description
+   * - hidden_size
+     - 2048
+     - Hidden dimension of the transformer layer.
+   * - ffn_hidden_size
+     - 8196
+     - FFN hidden dimension.
+   * - vocab_size
+     - 32000
+     - Vocabulary size for the embedding layer.

 .. code-block:: yaml
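(The YAML example itself is collapsed in the diff view. For reference, a minimal ``model`` section consistent with the tables above might look like the sketch below; the key names follow the parameters listed, and the value of ``name`` is purely illustrative.)

.. code-block:: yaml

   model:
     name: my_model
     model_size: 10240
     num_layers: 1
     optimization_groups: []
     layer_parameters: []
     parallelism:
       tensor: 1
       pipeline: 1
       zero_stage: -1
     transformer:
       hidden_size: 2048
       ffn_hidden_size: 8196
       vocab_size: 32000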
@@ -319,27 +372,6 @@ checkpoint
    * - steps_between_checkpoints
      - -1
      - Perform one checkpoint every given number of steps.
-   * - model_size
-     - 10240
-     - the size of the model parameters per GPU in bytes
-   * - optimization_groups
-     - []
-     - List of optimization group tensors. Use Array notation for yaml.
-   * - num_layers
-     - 1
-     - Number of layers to checkpoint. Each layer would be checkpointed separately.
-   * - layer_parameters
-     - []
-     - List of parameters per layer. This is used to perform I/O per layer.
-   * - type
-     - rank_zero
-     - Which rank performs this checkpoint. All ranks (all_ranks) or Rank 0 (rank_zero).
-   * - tensor_parallelism
-     - 1
-     - Tensor parallelism for model. Used to determine the number of layer model files.
-   * - pipeline_parallelism
-     - 1
-     - Pipeline parallelism for model.

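(For reference, a minimal ``checkpoint`` section exercising the parameter above might look like the sketch below; ``checkpoint_folder`` is assumed from the surrounding, collapsed documentation, and the step count is illustrative.)

.. code-block:: yaml

   checkpoint:
     checkpoint_folder: checkpoints/
     steps_between_checkpoints: 100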
 .. note::
