From 4b93507699a289f56f939cdad99db775170c2d7f Mon Sep 17 00:00:00 2001
From: Joseph Viviano <joseph@viviano.ca>
Date: Tue, 18 Jun 2024 16:49:19 -0400
Subject: [PATCH] example mila slurm script

---
 tutorials/examples/ddp_gfn.small.4.mila.slurm | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 tutorials/examples/ddp_gfn.small.4.mila.slurm

diff --git a/tutorials/examples/ddp_gfn.small.4.mila.slurm b/tutorials/examples/ddp_gfn.small.4.mila.slurm
new file mode 100644
index 00000000..16945d8f
--- /dev/null
+++ b/tutorials/examples/ddp_gfn.small.4.mila.slurm
@@ -0,0 +1,43 @@
+#!/bin/bash
+# SLURM launcher: 4-rank distributed GFN hypergrid training on the Mila cluster.
+#SBATCH -o /network/scratch/v/vivianoj/torchgfn/logs/intel/slurm-%j.out
+#SBATCH -e /network/scratch/v/vivianoj/torchgfn/logs/intel/slurm-%j.err
+#SBATCH -J ddp
+#SBATCH --get-user-env
+#SBATCH --partition=long
+#SBATCH --ntasks=4
+#SBATCH --cpus-per-task=4
+#SBATCH --time=01:00:00
+
+#source /swtools/intel/2024.0/oneapi-vars.sh
+export I_MPI_HYDRA_BOOTSTRAP=slurm
+
+eval "$(conda shell.bash hook)"
+conda activate torchgfn_multinode
+
+export KMP_AFFINITY=compact,verbose
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}  # match --cpus-per-task; avoids CPU oversubscription
+# Rendezvous host for DDP: the first node of the allocation.
+export MASTER_ADDR="$(scontrol show hostnames "$SLURM_NODELIST" | head -n 1)"
+echo "$MASTER_ADDR"
+echo "$SLURM_JOB_NUM_NODES"
+echo "$SLURM_NODELIST"
+./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 512000 --batch_size 256000 &> scaling.out.4.4.512000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 128000 --batch_size 256000 &> scaling.out.4.4.128000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 64000 --batch_size 256000 &> scaling.out.4.4.64000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 32000 --batch_size 256000 &> scaling.out.4.4.32000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories 16000 --batch_size 256000 &> scaling.out.4.4.16000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories  8000 --batch_size 256000 &> scaling.out.4.4.8000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories  4000 --batch_size 256000 &> scaling.out.4.4.4000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories  2000 --batch_size 256000 &> scaling.out.4.4.2000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories  1000 --batch_size 256000 &> scaling.out.4.4.1000.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories   512 --batch_size 256000 &> scaling.out.4.4.512.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories   256 --batch_size 256000 &> scaling.out.4.4.256.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories   128 --batch_size 256000 &> scaling.out.4.4.128.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories    64 --batch_size 256000 &> scaling.out.4.4.64.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories    32 --batch_size 256000 &> scaling.out.4.4.32.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories    16 --batch_size 256000 &> scaling.out.4.4.16.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories     8 --batch_size 256000 &> scaling.out.4.4.8.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories     4 --batch_size 256000 &> scaling.out.4.4.4.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories     2 --batch_size 256000 &> scaling.out.4.4.2.256000
+#./run_dist_ht.sh -np 4 -ppn 4 python -u train_hypergrid_multinode.py --ndim 4 --height 64 --R0 0.01 --tied --loss TB --n_trajectories     1 --batch_size 256000 &> scaling.out.4.4.1.256000