
Updating deps and config
shahromil16 committed Jun 19, 2024
1 parent 936cd9a commit aca1b75
Showing 3 changed files with 20 additions and 12 deletions.
sagemaker_train/Dockerfile: 4 changes (3 additions, 1 deletion)
@@ -1,7 +1,7 @@
 ARG AWS_REGION
 
 # SageMaker PyTorch image
-FROM 763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-sagemaker
+FROM 763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker
 
 # Run custom installation of libraries
 # RUN pip install xxx
@@ -26,6 +26,8 @@ RUN rm /opt/ml/code/setup.py
 RUN pip install -r /opt/ml/code/requirements.txt
 RUN pip uninstall flash-attn -y
 RUN pip install flash-attn>=2.2
+RUN pip install s3fs>=2023.6.0
+RUN pip install --upgrade s3fs
 # # Prevent sagemaker from installing requirements again.
 # RUN rm /opt/ml/code/setup.py
 RUN rm /opt/ml/code/requirements.txt
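
The base image moves from PyTorch 2.1.0 to 2.2.0, and the image now reinstalls flash-attn and adds s3fs. A minimal smoke test, assuming it is run inside the built image (the module and attribute names are standard, but the exact versions resolved at build time are not guaranteed):

    import torch       # provided by the pytorch-training:2.2.0 base image
    import flash_attn  # reinstalled via `pip install flash-attn>=2.2`
    import s3fs        # added in this commit

    # Confirm the dependency upgrades actually resolved as intended.
    assert torch.__version__.startswith("2.2"), torch.__version__
    print("flash-attn:", flash_attn.__version__)  # expected >= 2.2
    print("s3fs:", s3fs.__version__)              # expected >= 2023.6.0
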
sagemaker_train/Dockerfile_update: 2 changes (2 additions, 0 deletions)
@@ -9,6 +9,8 @@ COPY . /opt/ml/code/
 # RUN pip install -e /opt/ml/code/
 
 # Prevent sagemaker from installing requirements again.
+RUN pip install s3fs>=2023.6.0
+RUN pip install --upgrade s3fs
 RUN rm /opt/ml/code/setup.py
 RUN rm /opt/ml/code/requirements.txt
 
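Worth noting: in shell form, `RUN pip install s3fs>=2023.6.0` is parsed by `/bin/sh` as a redirection, so it installs plain `s3fs` and writes pip's output to a file named `=2023.6.0`; that is likely why the unconstrained `--upgrade s3fs` line follows. A sketch of checking the intended version floor at runtime, assuming the `packaging` distribution is importable in the image (it usually is in these DLC images):

    from packaging.specifiers import SpecifierSet
    import s3fs

    # The Dockerfile's intent is s3fs >= 2023.6.0; verify the installed
    # version satisfies that specifier regardless of how the shell
    # parsed the install line.
    assert s3fs.__version__ in SpecifierSet(">=2023.6.0"), s3fs.__version__
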
sagemaker_train/cfg_sample.yaml: 26 changes (15 additions, 11 deletions)
@@ -1,38 +1,42 @@
 accum-freq: 4
 beta1: 0.9
 beta2: 0.95
-data-key: "json"
-dataset-resampled: True
+data-key: "json.gz"
+dataset-resampled: False
 # delete-previous-checkpoint: False
 # Total 25B * 40 = 1T tokens
-epochs: 2
+epochs: 1
 fsdp: True
 fsdp-limit-all-gathers: True
 # grad-checkpointing: False
 grad-clip-norm: 1
 log-every-n-steps: 20
 model: "open_lm_7b"
 name: "sample_7b"
-precision: "amp_fp8"
+precision: "amp_bfloat16"
 report-to: "wandb"
 seed: 124
-train-data-mix-weights: [0.725, 0.275]
-train-data: ["TODO"]
+# train-data-mix-weights: [0.725, 0.275]
+dataset-manifest: ["TODO"]
 train-num-samples: 28_000_000_000
-wandb-project-name: "lm1"
+wandb-project-name: "lm7"
 workers: 4
 logs: /opt/ml/checkpoints/
 
 # Some important parameters, double checked with Mitchell:
-batch-size: 128
-ffn-type: gemma_geglu
+global-batch-size: 32
+ffn-type: swiglu_torch
 # fsdp-amp: False
 fsdp-pure-bf16: True
 fsdp-backward-prefetch: True
-lr: 3.e-4
+fsdp-use-orig-params: True
+lr: 3.e-3
 lr-cooldown-end: 3.e-5
 model-norm: "gain_only_lp_layer_norm"
 qk-norm: True
-warmup: 5000
+warmup: 2000
 wd: 0.1
 z-loss-coefficient: 1.e-4
+attn-name: torch_attn
+torchcompile: True
+use_fp8: False
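
With the image built and this config in place, one way to launch the job is through the SageMaker Python SDK. A sketch, where the image URI, role ARN, bucket, and instance type are all placeholders to fill in; pairing `checkpoint_s3_uri` with the config's `logs: /opt/ml/checkpoints/` is what syncs checkpoints to S3:

    import sagemaker
    from sagemaker.estimator import Estimator

    session = sagemaker.Session()

    estimator = Estimator(
        # Placeholder URI for the image built from sagemaker_train/Dockerfile.
        image_uri="<account-id>.dkr.ecr.<region>.amazonaws.com/openlm-train:latest",
        role="arn:aws:iam::<account-id>:role/<sagemaker-execution-role>",  # placeholder
        instance_count=1,
        instance_type="ml.p4d.24xlarge",  # placeholder GPU instance type
        sagemaker_session=session,
        # SageMaker syncs this local path to S3; it matches `logs:` in cfg_sample.yaml.
        checkpoint_local_path="/opt/ml/checkpoints/",
        checkpoint_s3_uri="s3://<bucket>/openlm/checkpoints/",  # placeholder
    )
    estimator.fit(wait=False)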
