Skip to content

Commit

Permalink
Job problem on Dardel
Browse files Browse the repository at this point in the history
  • Loading branch information
Hjorthmedh committed Mar 4, 2024
1 parent 1525b9c commit 5134cda
Show file tree
Hide file tree
Showing 9 changed files with 445 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
#SBATCH -J Simulate
#SBATCH -A naiss2023-5-231
#SBATCH --nodes=60
#SBATCH --tasks-per-node=128
#SBATCH --tasks-per-node=50
#SBATCH --mail-type=ALL

# 2024-02-16: 40 cores per node worked, had 28% free... trying increasing to 45 cores per node
# 2024-02-18: Increasing back up to 128 cores. Memory free is probably more
# dependent on total number of nodes allocated
# 2024-02-19: 128 cores failed due to memory; trying 50 cores now

# You need to point this as the directory where you created the network in
#NETWORK_DIR=/cfs/klemming/home/${USER:0:1}/$USER/Snudda/examples/parallel/KTH_PDC/networks/test_10k
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/bin/bash -l
#SBATCH --partition=main
#SBATCH -o log/runSnudda-%j-output.txt
#SBATCH -e log/runSnudda-%j-error.txt
#SBATCH -t 5:59:00
#SBATCH -J Snudda
#SBATCH -A naiss2023-5-231
#SBATCH --nodes=2
#SBATCH -n 256
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=930M
#SBATCH --mail-type=ALL
#
# Slurm batch job that builds a Snudda network using ipyparallel:
#   1. starts ../../ipcontroller_new.sh as a background job step,
#   2. starts IPNWORKERS ipengine processes as a second background job step,
#   3. runs ./Dardel_runSnudda_lateral.sh in the foreground as a single task.
#
# REMEMBER TO CREATE THE "log" DIRECTORY (the -o/-e paths above point into it).

module load snic-env

#..
#export OMP_STACKSIZE=128G
ulimit -s unlimited

# Number of ipengine workers to start.
#NWORKERS=$(( SLURM_NTASKS - 2 ))
#NWORKERS=100
NWORKERS=40

# Seconds to wait for the controller / the engines to come up. The echo
# messages below are built from these values so they cannot drift apart.
CONTROLLER_STARTUP_WAIT=60
ENGINE_STARTUP_WAIT=30

export IPNWORKERS="${NWORKERS}"

export IPYTHONDIR="/cfs/klemming/scratch/${USER:0:1}/${USER}/.ipython"
# Start from a clean ipython directory. -f keeps a first run (no directory yet)
# quiet, and :? refuses to run rm with an empty path.
rm -rf -- "${IPYTHONDIR:?}"
export IPYTHON_PROFILE=default
source "${HOME}/Snudda/snudda_env/bin/activate"

#.. Start the ipcontroller
# A new FI_CXI_DEFAULT_VNI (4 random bytes printed as an unsigned integer) is
# exported before every srun below.
# NOTE(review): presumably needed so that the overlapping job steps do not
# share the same VNI -- confirm against the cluster documentation.
export FI_CXI_DEFAULT_VNI=$(od -vAn -N4 -tu < /dev/urandom)
srun -n 1 -N 1 -c 2 --exact --overlap --mem=0 ./../../ipcontroller_new.sh &

echo ">>> waiting ${CONTROLLER_STARTUP_WAIT}s for controller to start"
sleep "${CONTROLLER_STARTUP_WAIT}"

#.. Read in CONTROLLERIP
CONTROLLERIP=$(<controller_ip.txt)

##.. Start the engines
echo ">>> starting ${IPNWORKERS} engines "
#srun -n ${IPNWORKERS} -c 2 --exact --overlap ipengine --location=${CONTROLLERIP} --profile=${IPYTHON_PROFILE} --mpi \
#--ipython-dir=${IPYTHONDIR} --timeout=30.0 --log-level=DEBUG \
#--BaseParallelApplication.verbose_crash=True --IPEngine.verbose_crash=True \
#--Kernel.stop_on_error_timeout=1.0 --IPythonKernel.stop_on_error_timeout=1.0 \
#Session.buffer_threshold=4096 Session.copy_threshold=250000 \
#Session.digest_history_size=250000 c.EngineFactory.max_heartbeat_misses=10 c.MPI.use='mpi4py' \
#1> ipe_${SLURM_JOBID}.out 2> ipe_${SLURM_JOBID}.err &

#srun -n ${IPNWORKERS} -c 2 --exact --overlap valgrind --leak-check=full --show-leak-kinds=all \
#ipengine --location=${CONTROLLERIP} --profile=${IPYTHON_PROFILE} --mpi \
#--ipython-dir=${IPYTHONDIR} --timeout=30.0 c.EngineFactory.max_heartbeat_misses=10 c.MPI.use='mpi4py' \
#1> ipe_${SLURM_JOBID}.out 2> ipe_${SLURM_JOBID}.err &

export FI_CXI_DEFAULT_VNI=$(od -vAn -N4 -tu < /dev/urandom)
srun -n "${IPNWORKERS}" -c 2 -N "${SLURM_JOB_NUM_NODES}" --exact --overlap --mem=0 ipengine \
  --location="${CONTROLLERIP}" --profile="${IPYTHON_PROFILE}" --mpi \
  --ipython-dir="${IPYTHONDIR}" --timeout=30.0 c.EngineFactory.max_heartbeat_misses=10 c.MPI.use='mpi4py' \
  1> "ipe_${SLURM_JOBID}.out" 2> "ipe_${SLURM_JOBID}.err" &

echo ">>> waiting ${ENGINE_STARTUP_WAIT}s for engines to start"
sleep "${ENGINE_STARTUP_WAIT}"

# Foreground step: runs the network build script as a single task.
export FI_CXI_DEFAULT_VNI=$(od -vAn -N4 -tu < /dev/urandom)
srun -n 1 -N 1 --exact --overlap --mem=0 ./Dardel_runSnudda_lateral.sh

echo " "

# Append the end time to the same file that Dardel_runSnudda_lateral.sh
# writes the start time to (the redirect was missing before, so the file name
# was just printed as part of the message).
echo "JOB END $(date)" >> start_time_network_connect.txt

# Reap the background controller and engine job steps before the job ends.
wait

Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/bin/bash
#
# Builds the network in JOBDIR: setup (setup_sten_4.py), then snudda place,
# detect, prune and input. The parallel steps use an ipyparallel cluster that
# has already been started by the calling job script, and this script shuts
# the ipcontroller down when it finishes or when a step fails.
#
# Should be started with "srun -n 1" so that only one process runs it.

SNUDDA_DIR=$HOME/Snudda/snudda
JOBDIR=../networks/sten_4

# SIMSIZE=50000

#######################################
# Print an error message, shut down the ipcontroller and exit with status 1.
# Without the shutdown the controller and engine job steps keep running, and
# the calling job script stays in its final "wait".
# Arguments: $1 - message to print
#######################################
abort_job() {
  echo "$1"
  echo "Shutting down ipcontroller"
  python ../../ipcontroller_shutdown.py
  exit 1
}

# If the BasalGangliaData directory exists, then use that for our data
#/cfs/klemming/scratch/${USER:0:1}/$USER/BasalGangliaData/data
#BasalGangliaData/Parkinson/PD0
if [[ -d "${HOME}/BasalGangliaData/data" ]]; then
  export SNUDDA_DATA="${HOME}/BasalGangliaData/data"
  echo "Setting SNUDDA_DATA to $SNUDDA_DATA"
else
  echo "SNUDDA_DATA environment variable not changed (may be empty): $SNUDDA_DATA"
fi

mkdir -p "${JOBDIR}"

echo "Dardel_runSnudda.sh should be started with srun -n 1, to only get one process"

echo "SLURM_PROCID = $SLURM_PROCID"

# Only the first process does any work. An unset SLURM_PROCID is treated as 0
# (main process) instead of triggering a test error.
if (( ${SLURM_PROCID:-0} > 0 )); then
  exit 0
fi

# For debug purposes:
echo "PATH: $PATH"
echo "IPYTHONDIR: $IPYTHONDIR"
echo "PYTHONPATH: $PYTHONPATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"

echo ">>>>>> Main process starting ipcluster"
echo

echo "Start time: " > start_time_network_connect.txt
date >> start_time_network_connect.txt

echo ">>> Init: $(date)"
# snudda init ${JOBDIR} --size ${SIMSIZE} --overwrite --randomseed 1234
if ! python setup_sten_4.py "${JOBDIR}"; then
  abort_job "Something went wrong during init, aborting!"
fi

# WE NOW START IPCLUSTER USING ipcontroller.sh INSTEAD...
#
# echo "SLURM_NODELIST = $SLURM_NODELIST"
# let NWORKERS="$SLURM_NTASKS - 1"
#
# echo ">>> NWORKERS " $NWORKERS
# echo ">>> Starting ipcluster `date`"
#
# #.. Start the ipcluster
# ipcluster start -n ${NWORKERS} \
#           --ip='*' \
#           --HeartMonitor.max_heartmonitor_misses=1000 \
#           --HubFactory.registration_timeout=600 \
#           --HeartMonitor.period=10000 &
#
#
# #.. Sleep to allow engines to start
# echo ">>> Wait 120s to allow engines to start"
# sleep 120 #60

echo ">>> Place: $(date)"
if ! snudda place "${JOBDIR}" --verbose; then
  abort_job "Something went wrong during placement, aborting!"
fi

echo ">>> Detect: $(date)"
if ! snudda detect "${JOBDIR}" --hvsize 50 --parallel; then
  abort_job "Something went wrong during detection, aborting!"
fi

echo ">>> Prune: $(date)"
if ! snudda prune "${JOBDIR}" --parallel; then
  abort_job "Something went wrong during pruning, aborting!"
fi

# echo ">>> Ablate: "`date`
# python ../ablate_network.py ${JOBDIR}

echo ">>> Input: $(date)"
# snudda input ${JOBDIR} --parallel --time 18 --input input.json --networkFile ${JOBDIR}/network-synapses-minimal.hdf5
if ! snudda input "${JOBDIR}" --parallel --time 18 --input input.json; then
  abort_job "Something went wrong during input generation, aborting!"
fi

#.. Shut down cluster
# ipcluster stop
#.. Shutdown ipcontroller
echo "Shutting down ipcontroller"

python ../../ipcontroller_shutdown.py

date
#echo "JOB END "`date` start_time_network_connect.txt

echo "EXITING Dardel_runjob_lateral.sh"
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/bin/bash -l
#SBATCH --partition=main
#SBATCH -o log/Simulate-%j-output.txt
#SBATCH -e log/Simulate-%j-error.txt
#SBATCH -t 8:59:00
#SBATCH --time-min=6:59:00
#SBATCH -J Simulate
#SBATCH -A naiss2023-5-231
#SBATCH --nodes=60
#SBATCH --tasks-per-node=50
#SBATCH --mail-type=ALL
#
# Slurm batch job that compiles the NEURON mechanisms with nrnivmodl and then
# runs two Snudda simulations of the sten_4 network, one per simulation config
# (with synapses and without synapses), on SLURM_NTASKS MPI ranks.

# 2024-02-16: 40 nodes worked, had 28% free... trying increasing to 45 nodes
# 2024-03-01: 60 nodes, 50 workers worked...

# You need to point this as the directory where you created the network in
#NETWORK_DIR=/cfs/klemming/home/${USER:0:1}/$USER/Snudda/examples/parallel/KTH_PDC/networks/test_10k
NETWORK_DIR=../networks/sten_4

SIMULATION_CONFIG_WITH_SYNAPSES=experiment_config_sten_4-with-synapses-A.json
SIMULATION_CONFIG_NO_SYNAPSES=experiment_config_sten_4-no-synapses-A.json

# NETWORK_WITH_SYNAPSES_OUTPUT=$NETWORK_DIR/simulation/output-with-synapses-sten_1.hdf5
# NETWORK_NO_SYNAPSES_OUTPUT=$NETWORK_DIR/simulation/output-no-synapses-sten_1.hdf5

export N_WORKERS="${SLURM_NTASKS}"

module load snic-env
source "${HOME}/Snudda/snudda_env/bin/activate"
SNUDDA_DIR="/cfs/klemming/home/${USER:0:1}/${USER}/Snudda"

# Point the local "mechanisms" link at the mechanism sources to compile.
# rm -f keeps a first run, where no link exists yet, free of error output.
# If the BasalGangliaData directory exists, then use that for our data
if [[ -d "/cfs/klemming/home/${USER:0:1}/${USER}/BasalGangliaData/data" ]]; then
  export SNUDDA_DATA="/cfs/klemming/home/${USER:0:1}/${USER}/BasalGangliaData/data"
  echo "Setting SNUDDA_DATA to $SNUDDA_DATA"
  rm -f mechanisms
  ln -s "${SNUDDA_DATA}/neurons/mechanisms/" mechanisms
else
  echo "SNUDDA_DATA environment variable not changed (may be empty): $SNUDDA_DATA"
  rm -f mechanisms
  ln -s ../../../../snudda/data/neurons/mechanisms/ mechanisms
fi

# The two *_FILE variables below are only used by the commented-out srun
# variants further down; the active runs take their files from the
# simulation config.
NETWORK_INFO_FILE=$NETWORK_DIR/network-synapses.hdf5
# NETWORK_INFO_FILE=$NETWORK_DIR/network-synapses-minimal.hdf5
NETWORK_INPUT_FILE=$NETWORK_DIR/input-spikes.hdf5
# NETWORK_VOLTAGE_FILE=$NETWORK_DIR/simulation/voltage-trace-${SLURM_JOBID}.txt

echo "Network dir: ${NETWORK_DIR}"

export PATH="${SNUDDA_DIR}/snudda_env/bin/:${PATH}"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_LD_LIBRARY_PATH}"
export PYTHONPATH="${SNUDDA_DIR}/snudda_env/lib/python3.9/"

##############

# Remove any previous build output so the mechanisms are recompiled from scratch.
rm -rf x86_64

# Compiler wrappers (CC / cc / ftn) to use for the mechanism build.
export CXX=CC
export CC=cc
export FC=ftn
export MPICC=cc
export MPICXX=CC

CC --version

echo "About to run nrnivmodl"
command -v nrnivmodl

# srun -n  nrnivmodl mechanisms/

# Abort if the mechanisms do not compile instead of launching the large MPI
# job steps below without a freshly built "special" binary.
# NOTE(review): nrnivmodl is run in the submission directory, while the srun
# lines below execute $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special --
# confirm that the job is submitted from that directory.
if ! srun -n 1 nrnivmodl -incflags "-lltdl=/usr/lib64/libltdl.so.7 -lreadline=/lib64/libreadline.so.7 -lncurses=/lib64/libncurses.so.6.1" -loadflags "-DLTDL_LIBRARY=/usr/lib64/libltdl.so.7 -DREADLINE_LIBRARY=/lib64/libreadline.so.7 -DNCURSES_LIBRARY=/lib64/libncurses.so.6.1" mechanisms/; then
  echo "nrnivmodl failed, aborting!" >&2
  exit 1
fi

# GJ disabled
# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/x86_64/special -mpi -python $SNUDDA_DIR/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --disableGJ --time 3.5 --voltOut $NETWORK_VOLTAGE_FILE

# GJ active
# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --time 18 --outputFile $NETWORK_WITH_SYNAPSES_OUTPUT

# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --time 18 --disableSyn --outputFile $NETWORK_NO_SYNAPSES_OUTPUT

# Changed to using the simulation_config

SPECIAL_BIN="${SNUDDA_DIR}/examples/parallel/KTH_PDC/x86_64/special"
SIMULATE_PY="${SNUDDA_DIR}/snudda/simulate/simulate.py"

srun -n "${N_WORKERS}" "${SPECIAL_BIN}" -mpi -python "${SIMULATE_PY}" dummy_file dummy_file --simulation_config "${SIMULATION_CONFIG_WITH_SYNAPSES}"

srun -n "${N_WORKERS}" "${SPECIAL_BIN}" -mpi -python "${SIMULATE_PY}" dummy_file dummy_file --simulation_config "${SIMULATION_CONFIG_NO_SYNAPSES}"

# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --time 5 --noVolt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"network_file": "../networks/sten_4/network-synapses.hdf5",
"input_file": "../networks/sten_4/input-spikes.hdf5",
"output_file": "../networks/sten_4/simulation/output-no-synapses-sten_4-A.hdf5",
"log_file": "../networks/sten_4/log/network-simulation-no-synapses-log-4A.txt",
"sample_dt": 0.01,
"time": 18.0,
"disable_synapses": true,
"record_all_soma": true,
"record_all_compartments": [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"network_file": "../networks/sten_4/network-synapses.hdf5",
"input_file": "../networks/sten_4/input-spikes.hdf5",
"output_file": "../networks/sten_4/simulation/output-with-synapses-sten_4-A.hdf5",
"log_file": "../networks/sten_4/log/network-simulation-with-synapses-log-4A.txt",
"sample_dt": 0.01,
"time": 18.0,
"record_all_soma": true,
"record_all_compartments": [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500]
}
Loading

0 comments on commit 5134cda

Please sign in to comment.