
Updating deps and config
shahromil16 committed Jun 19, 2024
1 parent 936cd9a commit aca1b75
Showing 3 changed files with 20 additions and 12 deletions.
sagemaker_train/Dockerfile: 4 changes (3 additions, 1 deletion)
@@ -1,7 +1,7 @@
 ARG AWS_REGION
 
 # SageMaker PyTorch image
-FROM 763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-sagemaker
+FROM 763104351884.dkr.ecr.${AWS_REGION}.amazonaws.com/pytorch-training:2.2.0-gpu-py310-cu121-ubuntu20.04-sagemaker
 
 # Run custom installation of libraries
 # RUN pip install xxx
@@ -26,6 +26,8 @@ RUN rm /opt/ml/code/setup.py
 RUN pip install -r /opt/ml/code/requirements.txt
 RUN pip uninstall flash-attn -y
 RUN pip install flash-attn>=2.2
+RUN pip install s3fs>=2023.6.0
+RUN pip install --upgrade s3fs
 # # Prevent sagemaker from installing requirements again.
 # RUN rm /opt/ml/code/setup.py
 RUN rm /opt/ml/code/requirements.txt
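
The base image moves from PyTorch 2.1.0 to 2.2.0, and the image now reinstalls flash-attn and adds s3fs. A minimal smoke test, assuming it is run inside the built image (the module and attribute names are standard, but the exact versions resolved at build time are not guaranteed):

    import torch       # provided by the pytorch-training:2.2.0 base image
    import flash_attn  # reinstalled via `pip install flash-attn>=2.2`
    import s3fs        # added in this commit

    # Confirm the dependency upgrades actually resolved as intended.
    assert torch.__version__.startswith("2.2"), torch.__version__
    print("flash-attn:", flash_attn.__version__)  # expected >= 2.2
    print("s3fs:", s3fs.__version__)              # expected >= 2023.6.0
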
sagemaker_train/Dockerfile_update: 2 changes (2 additions, 0 deletions)
@@ -9,6 +9,8 @@ COPY . /opt/ml/code/
 # RUN pip install -e /opt/ml/code/
 
 # Prevent sagemaker from installing requirements again.
+RUN pip install s3fs>=2023.6.0
+RUN pip install --upgrade s3fs
 RUN rm /opt/ml/code/setup.py
 RUN rm /opt/ml/code/requirements.txt
 
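Worth noting: in shell form, `RUN pip install s3fs>=2023.6.0` is parsed by `/bin/sh` as a redirection, so it installs plain `s3fs` and writes pip's output to a file named `=2023.6.0`; that is likely why the unconstrained `--upgrade s3fs` line follows. A sketch of checking the intended version floor at runtime, assuming the `packaging` distribution is importable in the image (it usually is in these DLC images):

    from packaging.specifiers import SpecifierSet
    import s3fs

    # The Dockerfile's intent is s3fs >= 2023.6.0; verify the installed
    # version satisfies that specifier regardless of how the shell
    # parsed the install line.
    assert s3fs.__version__ in SpecifierSet(">=2023.6.0"), s3fs.__version__
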
sagemaker_train/cfg_sample.yaml: 26 changes (15 additions, 11 deletions)
@@ -1,38 +1,42 @@
 accum-freq: 4
 beta1: 0.9
 beta2: 0.95
-data-key: "json"
-dataset-resampled: True
+data-key: "json.gz"
+dataset-resampled: False
 # delete-previous-checkpoint: False
 # Total 25B * 40 = 1T tokens
-epochs: 2
+epochs: 1
 fsdp: True
 fsdp-limit-all-gathers: True
 # grad-checkpointing: False
 grad-clip-norm: 1
 log-every-n-steps: 20
 model: "open_lm_7b"
 name: "sample_7b"
-precision: "amp_fp8"
+precision: "amp_bfloat16"
 report-to: "wandb"
 seed: 124
-train-data-mix-weights: [0.725, 0.275]
-train-data: ["TODO"]
+# train-data-mix-weights: [0.725, 0.275]
+dataset-manifest: ["TODO"]
 train-num-samples: 28_000_000_000
-wandb-project-name: "lm1"
+wandb-project-name: "lm7"
 workers: 4
 logs: /opt/ml/checkpoints/
 
 # Some important parameters, double checked with Mitchell:
-batch-size: 128
-ffn-type: gemma_geglu
+global-batch-size: 32
+ffn-type: swiglu_torch
 # fsdp-amp: False
 fsdp-pure-bf16: True
 fsdp-backward-prefetch: True
-lr: 3.e-4
+fsdp-use-orig-params: True
+lr: 3.e-3
 lr-cooldown-end: 3.e-5
 model-norm: "gain_only_lp_layer_norm"
 qk-norm: True
-warmup: 5000
+warmup: 2000
 wd: 0.1
 z-loss-coefficient: 1.e-4
+attn-name: torch_attn
+torchcompile: True
+use_fp8: False
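
With the image built and this config in place, one way to launch the job is through the SageMaker Python SDK. A sketch, where the image URI, role ARN, bucket, and instance type are all placeholders to fill in; pairing `checkpoint_s3_uri` with the config's `logs: /opt/ml/checkpoints/` is what syncs checkpoints to S3:

    import sagemaker
    from sagemaker.estimator import Estimator

    session = sagemaker.Session()

    estimator = Estimator(
        # Placeholder URI for the image built from sagemaker_train/Dockerfile.
        image_uri="<account-id>.dkr.ecr.<region>.amazonaws.com/openlm-train:latest",
        role="arn:aws:iam::<account-id>:role/<sagemaker-execution-role>",  # placeholder
        instance_count=1,
        instance_type="ml.p4d.24xlarge",  # placeholder GPU instance type
        sagemaker_session=session,
        # SageMaker syncs this local path to S3; it matches `logs:` in cfg_sample.yaml.
        checkpoint_local_path="/opt/ml/checkpoints/",
        checkpoint_s3_uri="s3://<bucket>/openlm/checkpoints/",  # placeholder
    )
    estimator.fit(wait=False)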
