diff --git a/sagemaker_train/Dockerfile b/sagemaker_train/Dockerfile
index 36300449..f4d11c62 100644
--- a/sagemaker_train/Dockerfile
+++ b/sagemaker_train/Dockerfile
@@ -1,7 +1,7 @@
 ARG AWS_REGION
 
 # SageMaker PyTorch image
-FROM 763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-sagemaker
+FROM 763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker
 
 # Run custom installation of libraries
 # RUN pip install xxx
@@ -26,6 +26,8 @@ RUN rm /opt/ml/code/setup.py
 RUN pip install -r /opt/ml/code/requirements.txt
 RUN pip uninstall flash-attn -y
 RUN pip install flash-attn>=2.2
+RUN pip install s3fs>=2023.6.0
+RUN pip install --upgrade s3fs
 # # Prevent sagemaker from installing requirements again.
 # RUN rm /opt/ml/code/setup.py
 RUN rm /opt/ml/code/requirements.txt
diff --git a/sagemaker_train/Dockerfile_update b/sagemaker_train/Dockerfile_update
index 1282688c..c9fa936b 100644
--- a/sagemaker_train/Dockerfile_update
+++ b/sagemaker_train/Dockerfile_update
@@ -9,6 +9,8 @@ COPY . /opt/ml/code/
 # RUN pip install -e /opt/ml/code/
 
 # Prevent sagemaker from installing requirements again.
+RUN pip install s3fs>=2023.6.0
+RUN pip install --upgrade s3fs
 RUN rm /opt/ml/code/setup.py
 RUN rm /opt/ml/code/requirements.txt
 
diff --git a/sagemaker_train/cfg_sample.yaml b/sagemaker_train/cfg_sample.yaml
index 815d0910..59b03277 100644
--- a/sagemaker_train/cfg_sample.yaml
+++ b/sagemaker_train/cfg_sample.yaml
@@ -1,11 +1,11 @@
 accum-freq: 4
 beta1: 0.9
 beta2: 0.95
-data-key: "json"
-dataset-resampled: True
+data-key: "json.gz"
+dataset-resampled: False
 # delete-previous-checkpoint: False
 # Total 25B * 40 = 1T tokens
-epochs: 2
+epochs: 1
 fsdp: True
 fsdp-limit-all-gathers: True
 # grad-checkpointing: False
@@ -13,26 +13,30 @@ grad-clip-norm: 1
 log-every-n-steps: 20
 model: "open_lm_7b"
 name: "sample_7b"
-precision: "amp_fp8"
+precision: "amp_bfloat16"
 report-to: "wandb"
 seed: 124
-train-data-mix-weights: [0.725, 0.275]
-train-data: ["TODO"]
+# train-data-mix-weights: [0.725, 0.275]
+dataset-manifest: ["TODO"]
 train-num-samples: 28_000_000_000
-wandb-project-name: "lm1"
+wandb-project-name: "lm7"
 workers: 4
 logs: /opt/ml/checkpoints/
 
 # Some important parameters, double checked with Mitchell:
-batch-size: 128
-ffn-type: gemma_geglu
+global-batch-size: 32
+ffn-type: swiglu_torch
 # fsdp-amp: False
 fsdp-pure-bf16: True
 fsdp-backward-prefetch: True
-lr: 3.e-4
+fsdp-use-orig-params: True
+lr: 3.e-3
 lr-cooldown-end: 3.e-5
 model-norm: "gain_only_lp_layer_norm"
 qk-norm: True
-warmup: 5000
+warmup: 2000
 wd: 0.1
 z-loss-coefficient: 1.e-4
+attn-name: torch_attn
+torchcompile: True
+use_fp8: False
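
A minimal sketch of how a training job built from `sagemaker_train/Dockerfile` and driven by the updated `cfg_sample.yaml` might be launched with the SageMaker Python SDK. The image URI, IAM role, S3 bucket, and instance settings below are placeholders and assumptions, not values taken from this patch; the only detail drawn from the diff is that the config logs to `/opt/ml/checkpoints/`, which pairs naturally with SageMaker's checkpoint-to-S3 syncing.

```python
import sagemaker
from sagemaker.estimator import Estimator

session = sagemaker.Session()

estimator = Estimator(
    # Placeholder URI for the image built from sagemaker_train/Dockerfile and
    # pushed to your own ECR repository; not a value from this diff.
    image_uri="<account-id>.dkr.ecr.<region>.amazonaws.com/open-lm-train:latest",
    role="arn:aws:iam::<account-id>:role/<sagemaker-execution-role>",  # placeholder
    instance_count=1,                 # assumption; scale out for a real 7B run
    instance_type="ml.p4d.24xlarge",  # assumption
    # cfg_sample.yaml sets `logs: /opt/ml/checkpoints/`; SageMaker syncs that
    # local path to S3 via the checkpoint settings below.
    checkpoint_s3_uri="s3://<bucket>/checkpoints/sample_7b/",  # placeholder
    checkpoint_local_path="/opt/ml/checkpoints/",
    sagemaker_session=session,
)

# Launch asynchronously; training logs stream to CloudWatch.
estimator.fit(wait=False)
```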