Skip to content

Commit

Permalink
reproducibility stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
josephdviviano committed Aug 29, 2024
1 parent 6d7b9d3 commit af4e52d
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# Remove the old version of torch.
pip remove torch
pip uninstall torch

# Install the relevant OneCCL, mpiexec.hydra, Torch, and Compilers.
conda config --set channel_priority strict
Expand Down
10 changes: 3 additions & 7 deletions tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
#SBATCH --partition=long
#SBATCH --constraint=milan # 64 CPU Cores, 1core:12GB optimal ratio.
#SBATCH -N 2 # Number of nodes to request.
#SBATCH --ntasks=8 # Number of tasks to run in total (need to split among numa nodes).
#SBATCH --ntasks-per-node=4 # If you use ntasks-per-node you can scale.
#SBATCH --cpus-per-task=8 # Number of CPUs to run per task.
#SBATCH --time=00:02:00

# Note - 8 tasks,
#SBATCH --time=23:59:59

# Olexa notes:
# TODO: -B 2,16,1
Expand All @@ -26,7 +23,7 @@
# Initialize the conda environment on the target node, which should automatically set all
# oneapi variables for the user.
source /home/mila/v/vivianoj/miniconda3/bin/activate
conda activate torchgfn
conda activate torchgfn_multinode

# System dependent thing - to delete.
#source /swtools/intel/2024.0/oneapi-vars.sh
Expand All @@ -49,8 +46,7 @@ echo " + Slurm Job Num Nodes: ${SLURM_JOB_NUM_NODES}"
echo " + Slurm NodeList: ${SLURM_NODELIST}"

#mpiexec.hydra -np 2 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0xFFFF,0xFFFF0000] -genv CCL_WORKER_AFFINITY=32,48 -genv CCL_WORKER_COUNT=1 -genv OMP_NUM_THREADS=16 python -u train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000
mpiexec.hydra -np 4 -ppn 4 -l -genv CCL_WORKER_COUNT=1 python -u ../train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256

mpiexec.hydra -np 4 -ppn 4 -l -genv CCL_WORKER_COUNT=8 python -u ../train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256

#./run_dist_ht.sh python -u train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000

Expand Down

0 comments on commit af4e52d

Please sign in to comment.