From 5134cdaa8863b7a662aeb171687ebb0e4ddeb55b Mon Sep 17 00:00:00 2001 From: Johannes Hjorth Date: Mon, 4 Mar 2024 10:30:30 +0100 Subject: [PATCH] Job problem on Dardel --- .../sten_3/Dardel_simulate_lateral-B.job | 3 +- .../sten_4/Dardel_runSnudda_lateral.job | 83 ++++++++++++ .../sten_4/Dardel_runSnudda_lateral.sh | 122 ++++++++++++++++++ .../sten_4/Dardel_simulate_lateral.job | 95 ++++++++++++++ ...xperiment_config_sten_4-no-synapses-A.json | 11 ++ ...eriment_config_sten_4-with-synapses-A.json | 10 ++ .../lateral_inhibition/sten_4/input.json | 62 +++++++++ .../lateral_inhibition/sten_4/log/.empty | 0 .../lateral_inhibition/sten_4/setup_sten_4.py | 60 +++++++++ 9 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.job create mode 100755 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.sh create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_simulate_lateral.job create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-no-synapses-A.json create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-with-synapses-A.json create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/input.json create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/log/.empty create mode 100644 examples/parallel/KTH_PDC/lateral_inhibition/sten_4/setup_sten_4.py diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_3/Dardel_simulate_lateral-B.job b/examples/parallel/KTH_PDC/lateral_inhibition/sten_3/Dardel_simulate_lateral-B.job index d27101d78..d5712b622 100644 --- a/examples/parallel/KTH_PDC/lateral_inhibition/sten_3/Dardel_simulate_lateral-B.job +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_3/Dardel_simulate_lateral-B.job @@ -7,12 +7,13 @@ #SBATCH -J Simulate #SBATCH -A naiss2023-5-231 #SBATCH --nodes=60 -#SBATCH 
--tasks-per-node=128 +#SBATCH --tasks-per-node=50 #SBATCH --mail-type=ALL # 2024-02-16: 40 cores per node worked, had 28% free... trying increasing to 45 cores per node # 2024-02-18: Increasing back up to 128 cores. Memory free is probably more # dependent on total number of nodes allocated +# 2024-02-19: 128 cores failed on memory.. now testing 50 cores # You need to point this as the directory where you created the network in #NETWORK_DIR=/cfs/klemming/home/${USER:0:1}/$USER/Snudda/examples/parallel/KTH_PDC/networks/test_10k diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.job b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.job new file mode 100644 index 000000000..e7684552e --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.job @@ -0,0 +1,83 @@ +#!/bin/bash -l +#SBATCH --partition=main +#SBATCH -o log/runSnudda-%j-output.txt +#SBATCH -e log/runSnudda-%j-error.txt +#SBATCH -t 5:59:00 +#SBATCH -J Snudda +#SBATCH -A naiss2023-5-231 +#SBATCH --nodes=2 +#SBATCH -n 256 +#SBATCH --cpus-per-task=2 +#SBATCH --mem-per-cpu=930M +#SBATCH --mail-type=ALL +module load snic-env + + +#.. +#export OMP_STACKSIZE=128G +ulimit -s unlimited + + +#let NWORKERS="$SLURM_NTASKS-2" +#let NWORKERS="100" +let NWORKERS="40" + +# REMEMBER TO CREATE THE "log" DIRECTORY + + +export IPNWORKERS=$NWORKERS + + +export IPYTHONDIR="/cfs/klemming/scratch/${USER:0:1}/$USER/.ipython" +rm -r $IPYTHONDIR +export IPYTHON_PROFILE=default +source $HOME/Snudda/snudda_env/bin/activate + + +#.. Start the ipcontroller +export FI_CXI_DEFAULT_VNI=$(od -vAn -N4 -tu < /dev/urandom) +srun -n 1 -N 1 -c 2 --exact --overlap --mem=0 ./../../ipcontroller_new.sh & + + +echo ">>> waiting 60s for controller to start" +sleep 60 + +#.. 
Read in CONTROLLERIP +CONTROLLERIP=$(>> starting ${IPNWORKERS} engines " +#srun -n ${IPNWORKERS} -c 2 --exact --overlap ipengine --location=${CONTROLLERIP} --profile=${IPYTHON_PROFILE} --mpi \ +#--ipython-dir=${IPYTHONDIR} --timeout=30.0 --log-level=DEBUG \ +#--BaseParallelApplication.verbose_crash=True --IPEngine.verbose_crash=True \ +#--Kernel.stop_on_error_timeout=1.0 --IPythonKernel.stop_on_error_timeout=1.0 \ +#Session.buffer_threshold=4096 Session.copy_threshold=250000 \ +#Session.digest_history_size=250000 c.EngineFactory.max_heartbeat_misses=10 c.MPI.use='mpi4py' \ +#1> ipe_${SLURM_JOBID}.out 2> ipe_${SLURM_JOBID}.err & + +#srun -n ${IPNWORKERS} -c 2 --exact --overlap valgrind --leak-check=full --show-leak-kinds=all \ +#ipengine --location=${CONTROLLERIP} --profile=${IPYTHON_PROFILE} --mpi \ +#--ipython-dir=${IPYTHONDIR} --timeout=30.0 c.EngineFactory.max_heartbeat_misses=10 c.MPI.use='mpi4py' \ +#1> ipe_${SLURM_JOBID}.out 2> ipe_${SLURM_JOBID}.err & + +export FI_CXI_DEFAULT_VNI=$(od -vAn -N4 -tu < /dev/urandom) +srun -n ${IPNWORKERS} -c 2 -N ${SLURM_JOB_NUM_NODES} --exact --overlap --mem=0 ipengine \ +--location=${CONTROLLERIP} --profile=${IPYTHON_PROFILE} --mpi \ +--ipython-dir=${IPYTHONDIR} --timeout=30.0 c.EngineFactory.max_heartbeat_misses=10 c.MPI.use='mpi4py' \ +1> ipe_${SLURM_JOBID}.out 2> ipe_${SLURM_JOBID}.err & + + +echo ">>> waiting 60s for engines to start" +sleep 30 + +export FI_CXI_DEFAULT_VNI=$(od -vAn -N4 -tu < /dev/urandom) +srun -n 1 -N 1 --exact --overlap --mem=0 ./Dardel_runSnudda_lateral.sh + + +echo " " + +echo "JOB END "`date` start_time_network_connect.txt + +wait + diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.sh b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.sh new file mode 100755 index 000000000..86164e60c --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_runSnudda_lateral.sh @@ -0,0 +1,122 @@ +#!/bin/bash + + + 
+SNUDDA_DIR=$HOME/Snudda/snudda +JOBDIR=../networks/sten_4 + +# SIMSIZE=50000 + +# If the BasalGangliaData directory exists, then use that for our data +#/cfs/klemming/scratch/${USER:0:1}/$USER/BasalGangliaData/data +#BasalGangliaData/Parkinson/PD0 +if [[ -d "$HOME/BasalGangliaData/data" ]]; then + export SNUDDA_DATA="$HOME/BasalGangliaData/data" + echo "Setting SNUDDA_DATA to $SNUDDA_DATA" +else + echo "SNUDDA_DATA environment variable not changed (may be empty): $SNUDDA_DATA" +fi + +mkdir -p $JOBDIR + +echo "Dardel_runSnudda.sh should be started with srun -n 1, to only get one process" + +echo "SLURM_PROCID = $SLURM_PROCID" + +if [ "$SLURM_PROCID" -gt 0 ]; then + mock_string="Not main process" +else + + # For debug purposes: + echo "PATH: "$PATH + echo "IPYTHONDIR: "$IPYTHONDIR + echo "PYTHONPATH: "$PYTHONPATH + echo "LD_LIBRARY_PATH: "$LD_LIBRARY_PATH + + echo ">>>>>> Main process starting ipcluster" + echo + + echo "Start time: " > start_time_network_connect.txt + date >> start_time_network_connect.txt + + echo ">>> Init: "`date` + # snudda init ${JOBDIR} --size ${SIMSIZE} --overwrite --randomseed 1234 + python setup_sten_4.py ${JOBDIR} + + if [ $? != 0 ]; then + echo "Something went wrong during init, aborting!" + ipcluster stop + exit -1 + fi + +# WE NOW START IPCLUSTER USING ipcontroller.sh INSTEAD... +# +# echo "SLURM_NODELIST = $SLURM_NODELIST" +# let NWORKERS="$SLURM_NTASKS - 1" +# +# echo ">>> NWORKERS " $NWORKERS +# echo ">>> Starting ipcluster `date`" +# +# #.. Start the ipcluster +# ipcluster start -n ${NWORKERS} \ +# --ip='*' \ +# --HeartMonitor.max_heartmonitor_misses=1000 \ +# --HubFactory.registration_timeout=600 \ +# --HeartMonitor.period=10000 & +# +# +# #.. Sleep to allow engines to start +# echo ">>> Wait 120s to allow engines to start" +# sleep 120 #60 + + echo ">>> Place: "`date` + snudda place ${JOBDIR} --verbose + + if [ $? != 0 ]; then + echo "Something went wrong during placement, aborting!" 
+ # ipcluster stop + exit -1 + fi + + echo ">>> Detect: "`date` + snudda detect ${JOBDIR} --hvsize 50 --parallel + + if [ $? != 0 ]; then + echo "Something went wrong during detection, aborting!" + # ipcluster stop + exit -1 + fi + + echo ">>> Prune: "`date` + snudda prune ${JOBDIR} --parallel + + if [ $? != 0 ]; then + echo "Something went wrong during pruning, aborting!" + # ipcluster stop + exit -1 + fi + + # Disable input generation at the moment + + # echo ">>> Ablate: "`date` + # python ../ablate_network.py ${JOBDIR} + + echo ">>> Input: "`date` + # snudda input ${JOBDIR} --parallel --time 18 --input input.json --networkFile ${JOBDIR}/network-synapses-minimal.hdf5 + snudda input ${JOBDIR} --parallel --time 18 --input input.json + + + #.. Shut down cluster + # ipcluster stop + #.. Shutdown ipcontroller + echo "Shutting down ipcontroller" + + python ../../ipcontroller_shutdown.py + + + date + #echo "JOB END "`date` start_time_network_connect.txt + + echo "EXITING Dardel_runjob_lateral.sh" + +fi diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_simulate_lateral.job b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_simulate_lateral.job new file mode 100644 index 000000000..1383ffded --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/Dardel_simulate_lateral.job @@ -0,0 +1,95 @@ +#!/bin/bash -l +#SBATCH --partition=main +#SBATCH -o log/Simulate-%j-output.txt +#SBATCH -e log/Simulate-%j-error.txt +#SBATCH -t 8:59:00 +#SBATCH --time-min=6:59:00 +#SBATCH -J Simulate +#SBATCH -A naiss2023-5-231 +#SBATCH --nodes=60 +#SBATCH --tasks-per-node=50 +#SBATCH --mail-type=ALL + +# 2024-02-16: 40 nodes worked, had 28% free... trying increasing to 45 nodes +# 2024-03-01: 60 nodes, 50 workers worked... 
+ +# You need to point this at the directory in which you created the network +#NETWORK_DIR=/cfs/klemming/home/${USER:0:1}/$USER/Snudda/examples/parallel/KTH_PDC/networks/test_10k +NETWORK_DIR=../networks/sten_4 + +SIMULATION_CONFIG_WITH_SYNAPSES=experiment_config_sten_4-with-synapses-A.json +SIMULATION_CONFIG_NO_SYNAPSES=experiment_config_sten_4-no-synapses-A.json + + +# NETWORK_WITH_SYNAPSES_OUTPUT=$NETWORK_DIR/simulation/output-with-synapses-sten_1.hdf5 +# NETWORK_NO_SYNAPSES_OUTPUT=$NETWORK_DIR/simulation/output-no-synapses-sten_1.hdf5 + + +export N_WORKERS=$SLURM_NTASKS + +module load snic-env +source $HOME/Snudda/snudda_env/bin/activate +SNUDDA_DIR=/cfs/klemming/home/"${USER:0:1}"/$USER/Snudda + +# If the BasalGangliaData directory exists, then use that for our data +if [[ -d "/cfs/klemming/home/${USER:0:1}/$USER/BasalGangliaData/data" ]]; then + export SNUDDA_DATA="/cfs/klemming/home/${USER:0:1}/$USER/BasalGangliaData/data" + echo "Setting SNUDDA_DATA to $SNUDDA_DATA" + rm mechanisms + ln -s $SNUDDA_DATA/neurons/mechanisms/ mechanisms +else + echo "SNUDDA_DATA environment variable not changed (may be empty): $SNUDDA_DATA" + rm mechanisms + ln -s ../../../../snudda/data/neurons/mechanisms/ +fi + + +NETWORK_INFO_FILE=$NETWORK_DIR/network-synapses.hdf5 +# NETWORK_INFO_FILE=$NETWORK_DIR/network-synapses-minimal.hdf5 +NETWORK_INPUT_FILE=$NETWORK_DIR/input-spikes.hdf5 +# NETWORK_VOLTAGE_FILE=$NETWORK_DIR/simulation/voltage-trace-${SLURM_JOBID}.txt + + + +echo "Network dir: "$NETWORK_DIR + +export PATH=$SNUDDA_DIR/snudda_env/bin/:$PATH +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CRAY_LD_LIBRARY_PATH +export PYTHONPATH=$SNUDDA_DIR/snudda_env/lib/python3.9/ + +############## + +rm -r x86_64 + +export CXX=CC +export CC=cc +export FC=ftn +export MPICC=cc +export MPICXX=CC + +CC --version + +echo "About to run nrnivmodl" +which nrnivmodl + +# srun -n nrnivmodl mechanisms/ + +srun -n 1 nrnivmodl -incflags "-lltdl=/usr/lib64/libltdl.so.7 -lreadline=/lib64/libreadline.so.7 
-lncurses=/lib64/libncurses.so.6.1" -loadflags "-DLTDL_LIBRARY=/usr/lib64/libltdl.so.7 -DREADLINE_LIBRARY=/lib64/libreadline.so.7 -DNCURSES_LIBRARY=/lib64/libncurses.so.6.1" mechanisms/ + +# GJ disabled +# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/x86_64/special -mpi -python $SNUDDA_DIR/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --disableGJ --time 3.5 --voltOut $NETWORK_VOLTAGE_FILE + +# GJ active +# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --time 18 --outputFile $NETWORK_WITH_SYNAPSES_OUTPUT + +# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --time 18 --disableSyn --outputFile $NETWORK_NO_SYNAPSES_OUTPUT + + +# Changed to using the simulation_config + +srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py dummy_file dummy_file --simulation_config $SIMULATION_CONFIG_WITH_SYNAPSES + +srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py dummy_file dummy_file --simulation_config $SIMULATION_CONFIG_NO_SYNAPSES + + +# srun -n $N_WORKERS $SNUDDA_DIR/examples/parallel/KTH_PDC/x86_64/special -mpi -python $SNUDDA_DIR/snudda/simulate/simulate.py $NETWORK_INFO_FILE $NETWORK_INPUT_FILE --time 5 --noVolt diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-no-synapses-A.json b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-no-synapses-A.json new file mode 100644 index 000000000..266ee2d71 --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-no-synapses-A.json @@ -0,0 +1,11 @@ +{ + "network_file": "../networks/sten_4/network-synapses.hdf5", + "input_file": 
"../networks/sten_4/input-spikes.hdf5", + "output_file": "../networks/sten_4/simulation/output-no-synapses-sten_4-A.hdf5", + "log_file": "../networks/sten_4/log/network-simulation-no-synapses-log-4A.txt", + "sample_dt": 0.01, + "time": 18.0, + "disable_synapses": true, + "record_all_soma": true, + "record_all_compartments": [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500] +} diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-with-synapses-A.json b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-with-synapses-A.json new file mode 100644 index 000000000..2e7d9f676 --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/experiment_config_sten_4-with-synapses-A.json @@ -0,0 +1,10 @@ +{ + "network_file": "../networks/sten_4/network-synapses.hdf5", + "input_file": "../networks/sten_4/input-spikes.hdf5", + "output_file": "../networks/sten_4/simulation/output-with-synapses-sten_4-A.hdf5", + "log_file": "../networks/sten_4/log/network-simulation-with-synapses-log-4A.txt", + "sample_dt": 0.01, + "time": 18.0, + "record_all_soma": true, + "record_all_compartments": [0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500] +} diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/input.json b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/input.json new file mode 100644 index 000000000..7d5dd2ec7 --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/input.json @@ -0,0 +1,62 @@ +{ + "dSPN": { + "cortical:1" : { + "generator" : "poisson", + "start" : [1, 4, 7, 10, 13, 16], + "end" : [3, 6, 9, 12, 15, 18], + "frequency" : [4, 4, 6, 6, 8, 8], + "population_unit_id" : 1 + }, + + "cortical:2" : { + "generator" : "poisson", + "start" : [4, 10, 16], + "end" : [6, 12, 18], + "frequency" : [10], + "population_unit_id" : 2 + }, + + "cortical:0" : { + "generator" : "poisson", + "start" : [0], + 
"end" : [18], + "frequency" : [3], + "population_unit_id" : 0 + } + + }, + + "iSPN": { + "cortical:1" : { + "generator" : "poisson", + "start" : [1, 4, 7, 10, 13, 16], + "end" : [3, 6, 9, 12, 15, 18], + "frequency" : [4, 4, 6, 6, 8, 8], + "population_unit_id" : 1 + }, + + "cortical:2" : { + "generator" : "poisson", + "start" : [4, 10, 16], + "end" : [6, 12, 18], + "frequency" : [10], + "population_unit_id" : 2 + }, + "cortical:0" : { + "generator" : "poisson", + "start" : [0], + "end" : [18], + "frequency" : [3], + "population_unit_id" : 0 + } + + }, + "FS": { + "cortical" : { + "generator" : "poisson", + "start" : [0], + "end" : [18], + "frequency" : [2] + } + } +} diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/log/.empty b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/log/.empty new file mode 100644 index 000000000..e69de29bb diff --git a/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/setup_sten_4.py b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/setup_sten_4.py new file mode 100644 index 000000000..6dbc7c75f --- /dev/null +++ b/examples/parallel/KTH_PDC/lateral_inhibition/sten_4/setup_sten_4.py @@ -0,0 +1,60 @@ +import os +import sys +import numpy as np + +if len(sys.argv) > 1: + network_path = sys.argv[1] +else: + sys.exit("No network path specified!") + network_path="../networks/sten_4" + +# network_path = "networks/lateral_1" +# snudda_data = "$HOME/BasalGangliaData/data" +snudda_data = "../../../../../../BasalGangliaData/data/" + +print(f"Network_path = {network_path}, snudda data = {snudda_data}") + + +duration=18 + +import snudda.init + +n_total = 40000 +f_dSPN=0.475 +f_iSPN=0.475 +f_FS=0.013 +f_ChIN=0.011 +f_LTS=0.007 + +f_total = f_dSPN + f_iSPN + f_FS + f_ChIN + f_LTS + + +n_DSPN = int(n_total * f_dSPN / f_total) +n_ISPN = int(n_total * f_iSPN / f_total) +n_FS = int(n_total * f_FS / f_total) +n_LTS = int(n_total * f_LTS / f_total) +n_ChIN = int(n_total * f_ChIN / f_total) + +print("Starting SnuddaInit") +si = 
snudda.init.SnuddaInit(network_path=network_path, snudda_data=snudda_data, random_seed=12345, honor_stay_inside=False) +si.define_striatum(num_dSPN=n_DSPN, num_iSPN=n_ISPN, num_FS=n_FS, num_LTS=n_LTS, num_ChIN=n_ChIN, + volume_type="cube") + +print("Adding population units") + +# si.add_population_unit_random(structure_name="Striatum", neuron_types=["dSPN", "iSPN"], +# fraction_of_neurons=0.5, unit_id=1) +# si.add_population_unit_random(structure_name="Striatum", neuron_types=["dSPN", "iSPN"], +# fraction_of_neurons=0.5, unit_id=2) + +# The centre of the cube is [0.00475, 0.004, 0.00775]. num_neurons is optional +si.add_population_unit_density(structure_name="Striatum", neuron_types=["dSPN", "iSPN"], + unit_centre=np.array([0.00475, 0.004, 0.00775]) -np.array([0, 0, 100e-6]), + probability_function="(d < 300e-6) * 1", num_neurons=4000) +si.add_population_unit_density(structure_name="Striatum", neuron_types=["dSPN", "iSPN"], + unit_centre=np.array([0.00475, 0.004, 0.00775]) -np.array([0, 0, -100e-6]), + probability_function="(d < 300e-6) * 1", num_neurons=4000) + +print("Writing json") + +si.write_json()