From af4e52de95f09ea636dcdbfebd3121edd31de88f Mon Sep 17 00:00:00 2001
From: Joseph Viviano
Date: Thu, 29 Aug 2024 17:26:19 -0400
Subject: [PATCH] reproducibility stuff

---
 .../examples/multinode/install_multinode_dependencies |  2 +-
 .../examples/multinode/mila.ddp_gfn.small.4.slurm      | 10 +++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tutorials/examples/multinode/install_multinode_dependencies b/tutorials/examples/multinode/install_multinode_dependencies
index 8c8ea895..5c869af7 100644
--- a/tutorials/examples/multinode/install_multinode_dependencies
+++ b/tutorials/examples/multinode/install_multinode_dependencies
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 # Remove old version of torchgfn
-pip remove torch
+pip uninstall torch
 
 # Install the relevant OneCCL, mpiexec.hydra, Torch, and Compilers.
 conda config --set channel_priority strict
diff --git a/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm b/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
index 9a6d4937..3c8168c3 100644
--- a/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
+++ b/tutorials/examples/multinode/mila.ddp_gfn.small.4.slurm
@@ -7,12 +7,9 @@
 #SBATCH --partition=long
 #SBATCH --constraint=milan # 64 CPU Cores, 1core:12GB optimal ratio.
 #SBATCH -N 2 # Number of nodes to request.
-#SBATCH --ntasks=8 # Number of tasks to run in total (need to split among numa nodes).
 #SBATCH --ntasks-per-node=4 # If you use ntasks-per-node you can scale.
 #SBATCH --cpus-per-task=8 # Number of CPUs to run per task.
-#SBATCH --time=00:02:00
-
-# Note - 8 tasks,
+#SBATCH --time=23:59:59
 
 # Olexa notes:
 # TODO: -B 2,16,1
@@ -26,7 +23,7 @@
 # Initalize the conda environment on the target node, which should automatically set all
 # oneapi variables for the user.
 source /home/mila/v/vivianoj/miniconda3/bin/activate
-conda activate torchgfn
+conda activate torchgfn_multinode
 
 # System dependent thing - to delete.
 #source /swtools/intel/2024.0/oneapi-vars.sh
@@ -49,8 +46,7 @@
 echo " + Slurm Job Num Nodes: ${SLURM_JOB_NUM_NODES}"
 echo " + Slurm NodeList: ${SLURM_NODELIST}"
 
 #mpiexec.hydra -np 2 -ppn 2 -l -genv I_MPI_PIN_DOMAIN=[0xFFFF,0xFFFF0000] -genv CCL_WORKER_AFFINITY=32,48 -genv CCL_WORKER_COUNT=1 -genv OMP_NUM_THREADS=16 python -u train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000
-mpiexec.hydra -np 4 -ppn 4 -l -genv CCL_WORKER_COUNT=1 python -u ../train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256
-
+mpiexec.hydra -np 4 -ppn 4 -l -genv CCL_WORKER_COUNT=8 python -u ../train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256
 #./run_dist_ht.sh python -u train_hypergrid_multinode.py --ndim 8 --height 8 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000
 
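# ---------------------------------------------------------------------------
# Not part of the patch: a minimal usage sketch for exercising the change.
# Assumptions (not confirmed by the patch): you are on a SLURM login node of a
# Mila-style cluster, inside a torchgfn checkout, and the `torchgfn_multinode`
# conda environment already exists so the job script can activate it.
cd tutorials/examples/multinode

# Refresh the multinode dependencies (now runs `pip uninstall torch`).
bash install_multinode_dependencies

# Submit the 2-node, 4-tasks-per-node job; the script activates
# `torchgfn_multinode` and launches mpiexec.hydra with CCL_WORKER_COUNT=8.
sbatch mila.ddp_gfn.small.4.slurm

# Check queue status; the output filename depends on the script's
# #SBATCH --output directive, which is not visible in this hunk.
squeue -u "$USER"
# ---------------------------------------------------------------------------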