From af4e52de95f09ea636dcdbfebd3121edd31de88f Mon Sep 17 00:00:00 2001
From: Joseph Viviano
Date: Thu, 29 Aug 2024 17:26:19 -0400
Subject: [PATCH] reproducibility stuff

---
 .../examples/multinode/install_multinode_dependencies |  2 +-
 .../examples/multinode/mila.ddp_gfn.small.4.slurm      | 10 +++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tutorials/examples/multinode/install_multinode_dependencies b/tutorials/examples/multinode/install_multinode_dependencies
index 8c8ea895..5c869af7 100644
--- a/tutorials/examples/multinode/install_multinode_dependencies
+++ b/tutorials/examples/multinode/install_multinode_dependencies
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Remove old version of torchgfn
-pip remove torch
+pip uninstall torch
 
 # Install the relevant OneCCL, mpiexec.hydra, Torch, and Compilers.
 conda config --set channel_priority strict
diff --git a/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm b/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
index 9a6d4937..3c8168c3 100644
--- a/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
+++ b/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
@@ -7,12 +7,9 @@
 #SBATCH --partition=long
 #SBATCH --constraint=milan # 64 CPU Cores, 1core:12GB optimal ratio.
 #SBATCH -N 2 # Number of nodes to request.
-#SBATCH --ntasks=8 # Number of tasks to run in total (need to split among numa nodes).
 #SBATCH --ntasks-per-node=4 # If you use ntasks-per-node you can scale.
 #SBATCH --cpus-per-task=8 # Number of CPUs to run per task.
-#SBATCH --time=00:02:00
-
-# Note - 8 tasks,
+#SBATCH --time=23:59:59
 
 # Olexa notes:
 # TODO: -B 2,16,1
@@ -26,7 +23,7 @@
 # Initalize the conda environment on the target node, which should automatically set all
 # oneapi variables for the user.
 source /home/mila/v/vivianoj/miniconda3/bin/activate
-conda activate torchgfn
+conda activate torchgfn_multinode
 
 # System dependent thing - to delete.
 #source /swtools/intel/2024.0/oneapi-vars.sh
@@ -49,8 +46,7 @@
 echo " + Slurm Job Num Nodes: ${SLURM_JOB_NUM_NODES}"
 echo " + Slurm NodeList: ${SLURM_NODELIST}"
 
 #mpiexec.hydra -np 2 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0xFFFF,0xFFFF0000] -genv CCL_WORKER_AFFINITY=32,48 -genv CCL_WORKER_COUNT=1 -genv OMP_NUM_THREADS=16 python -u train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000
-mpiexec.hydra -np 4 -ppn 4 -l -genv CCL_WORKER_COUNT=1 python -u ../train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256
-
+mpiexec.hydra -np 4 -ppn 4 -l -genv CCL_WORKER_COUNT=8 python -u ../train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256
 #./run_dist_ht.sh python -u train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000
 
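# ---------------------------------------------------------------------------
# Not part of the patch: a minimal usage sketch for exercising the change.
# Assumptions (not confirmed by the patch): you are on a SLURM login node of a
# Mila-style cluster, inside a torchgfn checkout, and the `torchgfn_multinode`
# conda environment already exists so the job script can activate it.
cd tutorials/examples/multinode

# Refresh the multinode dependencies (now runs `pip uninstall torch`).
bash install_multinode_dependencies

# Submit the 2-node, 4-tasks-per-node job; the script activates
# `torchgfn_multinode` and launches mpiexec.hydra with CCL_WORKER_COUNT=8.
sbatch mila.ddp_gfn.small.4.slurm

# Check queue status; the output filename depends on the script's
# #SBATCH --output directive, which is not visible in this hunk.
squeue -u "$USER"
# ---------------------------------------------------------------------------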