-
Notifications
You must be signed in to change notification settings - Fork 0
/
run-finetuning-mahti-gpu8.sh
40 lines (33 loc) · 1.12 KB
/
run-finetuning-mahti-gpu8.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/bin/bash
#SBATCH --account=project_2001659
#SBATCH --partition=gpumedium
#SBATCH --nodes=2
#SBATCH --tasks-per-node=1
#SBATCH --cpus-per-task=128
#SBATCH --time=0:15:00
#SBATCH --gres=gpu:a100:4

# Multi-node (2 x 4 A100) Hugging Face fine-tuning job for the Mahti
# cluster: launches finetuning.py via torchrun, one launcher task per
# node, with c10d rendezvous on the first node.

module purge
module load pytorch/2.4

# This will store all the Hugging Face cache such as downloaded models
# and datasets in the project's scratch folder
export HF_HOME=/scratch/${SLURM_JOB_ACCOUNT}/${USER}/hf-cache
mkdir -p "$HF_HOME"

# Path to where the trained model and logging data will go
OUTPUT_DIR=/scratch/${SLURM_JOB_ACCOUNT}/${USER}/hf-data
mkdir -p "$OUTPUT_DIR"

# Disable internal parallelism of huggingface's tokenizer since we
# want to retain direct control of parallelism options.
export TOKENIZERS_PARALLELISM=false

# Use main node for Rendezvous settings.  This script runs first on the
# batch host (the first allocated node), so its hostname is reachable by
# the torchrun launchers srun starts on every node.
RDZV_HOST=$(hostname)
RDZV_PORT=29400

set -xv # print the command so that we can verify setting arguments correctly from the logs

# --nnodes uses $SLURM_NNODES so it always matches "#SBATCH --nodes"
# above instead of duplicating the value by hand.
# "$@" (not $*) forwards the caller's extra arguments to finetuning.py
# without re-splitting arguments that contain spaces.
srun torchrun \
    --rdzv_id="$SLURM_JOB_ID" \
    --rdzv_backend=c10d \
    --rdzv_endpoint="$RDZV_HOST:$RDZV_PORT" \
    --nnodes="$SLURM_NNODES" \
    --nproc-per-node=4 \
    finetuning.py "$@" \
    --output-path "$OUTPUT_DIR" \
    --num-workers 10