From 16ad59323af0e92ff41ea72144a9ac33e9dbab9b Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 5 Jul 2023 20:55:54 +0300 Subject: [PATCH 01/32] split script into two pieces for simplicity --- lawrencium/src/main/bash/run_job.sh | 156 ------------------- lawrencium/src/main/bash/slurm_job.sh | 40 +++++ lawrencium/src/main/bash/slurm_job_runner.sh | 120 ++++++++++++++ 3 files changed, 160 insertions(+), 156 deletions(-) delete mode 100644 lawrencium/src/main/bash/run_job.sh create mode 100644 lawrencium/src/main/bash/slurm_job.sh create mode 100644 lawrencium/src/main/bash/slurm_job_runner.sh diff --git a/lawrencium/src/main/bash/run_job.sh b/lawrencium/src/main/bash/run_job.sh deleted file mode 100644 index 7ae4326403a..00000000000 --- a/lawrencium/src/main/bash/run_job.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash - -# This is the script to run BEAM simulation on Lawrencium cluster. -# By-default it expects 2 input arguments: , -# though both might be manually filled in the script body. - -CODE_PHRASE="Execute the body of the job." - -# Doing shell magic - the script will send itself as a job to the cluster, -# and first argument will tell what to do - start a job or execute the job body. -# This way we could avoid using two shell scripts - one to start a job and another as job body. -# if the first argument is not what we are looking for - then this shell script is used to start a job -if [[ "$1" != "$CODE_PHRASE" ]]; then - echo "Starting the job .." - - # what code, data and config to use for simulation - export BEAM_BRANCH_NAME="develop" - export BEAM_COMMIT_SHA="" - export BEAM_DATA_BRANCH_NAME="develop" - export BEAM_DATA_COMMIT_SHA="" - export BEAM_CONFIG="test/input/beamville/beam.conf" - export PROFILER="" # either empty, 'cpu' or 'cpumem' - - export PULL_CODE="true" - export PULL_DATA="true" - - # In order to see which partition and queue are available for current user - sacctmgr show association -p user=$USER. - PARTITION="es1" - QOS="es_normal" - MEMORY_LIMIT="480" ## in GB - - # if uploading to s3 required - both AWS key parts should be set - export S3_REGION="us-east-2" - export S3_PUBLISH="false" - export AWS_SECRET_ACCESS_KEY="" - export AWS_ACCESS_KEY_ID="" - - # for sending notifications to slack SLACK_HOOK required - # for sending updates to the spreadsheet SIMULATIONS_SPREADSHEET_UPDATE_URL required - export SEND_NOTIFICATION="false" - export SLACK_HOOK_WITH_TOKEN="" - export SIMULATIONS_SPREADSHEET_UPDATE_URL="" - - ACCOUNT="pc_beamcore" - - # INPUT Argument #1 - run name - RUN_NAME="$1" - # INPUT Argument #2 - expected simulation duration, needed for the cluster - # to understand an order of running jobs if there are not enough nodes. - # The duration should be a bit longer than simulation should take (approximately). - # But not longer than maximum possible duration - 3 days for now (3-00:00:00). - EXPECTED_EXECUTION_DURATION="$2" # D-HH:MM:SS, i.e. for 1 day, 2 hours and 30 minutes => 1-02:30:00 - - # required for doing speed comparison (BEAM simulation vs Google observations) - export GOOGLE_API_KEY="" - - if [[ -z "$MEMORY_LIMIT" ]]; then - echo "Error: MEMORY_LIMIT is not set." - exit 1 - else - # using the current memory limit as MAX RAM for BEAM simulation - export MAX_RAM="$MEMORY_LIMIT" - fi - - if [[ -z "$RUN_NAME" ]]; then - echo "Error: RUN_NAME is not set." - exit 1 - else - # adding current user name as part of simulation title - export NOTIFICATION_TITLED="$USER/$RUN_NAME" - fi - - RANDOM_PART=$(tr -dc A-Z0-9 1-02:30:00 + EXPECTED_EXECUTION_DURATION + + ACCOUNT # account used to run jobs on Lawrencium + PARTITION # which partition and QOS use to run job + QOS # In order to see which partition and queue are available for current user - sacctmgr show association -p user=$USER. + MEMORY_LIMIT # memory limit should be in GB +) + + +# Reading variables case-insensitively from input parameters according to input_parameters list. +# Read variables are exported into environment. +while [ $# -gt 0 ]; do + for var_name in "${input_parameters[@]}"; do + var_value=${1#*=} + # check if variable name in lower case and '=' symbol are in parameter + # check if variable value is not empty + if [[ ${1,,} == --"${var_name,,}="* && -n "$var_value" ]] ; then + export "$var_name"="$var_value" + fi + done + shift +done + + +# Checking that all required variables were set. +for var_name in "${input_parameters[@]}" ; do + var_value="${!var_name}" + + if [[ -z "$var_value" ]]; then + echo "Error! Variable '$var_name' is required!" + exit 1 + fi + + echo "'$var_name' = '$var_value'" +done + + +# using the current memory limit as MAX RAM for BEAM simulation +export MAX_RAM="$MEMORY_LIMIT" +# adding current user name as part of simulation title +export NOTIFICATION_TITLED="$USER/$RUN_NAME" + + +RANDOM_PART=$(tr -dc A-Z0-9 Date: Mon, 10 Jul 2023 16:35:57 +0300 Subject: [PATCH 02/32] initial functionality --- lawrencium/build.gradle | 84 +++++++++++++++++++++++++++++ lawrencium/src/main/bash/Readme.txt | 4 +- settings.gradle | 1 + 3 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 lawrencium/build.gradle diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle new file mode 100644 index 00000000000..47a24aee488 --- /dev/null +++ b/lawrencium/build.gradle @@ -0,0 +1,84 @@ +group = 'beam' +version = '0.8.0' + +buildscript { + repositories { + mavenLocal() + mavenCentral() + maven { url "https://plugins.gradle.org/m2/" } + gradlePluginPortal() + } + dependencies { + classpath 'org.hidetake:gradle-ssh-plugin:2.10.1' + } +} + +apply plugin: 'org.hidetake.ssh' + +if (!project.hasProperty("lawrenciumUser")) { + ext.lawrenciumUser = "nikolay" +} + +remotes { + lawrenciumLoginNode { + host = 'lrc-login.lbl.gov' + user = "${lawrenciumUser}" + } +// require ssh.settings.authentications to include "password" +// execute from ssh.run inside session -> put from: "${project.rootDir}/lawrencium/src/main/bash/slurm_job.sh", into: "slurm_job.sh" +// execute from ssh.run inside session -> put from: "${project.rootDir}/lawrencium/src/main/bash/slurm_job_runner.sh", into: "slurm_job_runner.sh" +// lawrenciumTransferNode { +// host = 'lrc-xfer.lbl.gov' +// user = "${lawrenciumUser}" +// } +} + +tasks.register("ltest") { + doLast { + ssh.run { + settings { + authentications = [ "keyboard-interactive" ] + knownHosts = allowAnyHosts + fileTransfer = "scp" + password = "Zomb1986" + project.findProperty("otp") + } + + def job_url = "https://github.com/LBNL-UCB-STI/beam/blob/16ad59323af0e92ff41ea72144a9ac33e9dbab9b/lawrencium/src/main/bash/slurm_job.sh" + def job_runner_url = "https://github.com/LBNL-UCB-STI/beam/blob/16ad59323af0e92ff41ea72144a9ac33e9dbab9b/lawrencium/src/main/bash/slurm_job_runner.sh" + + session(remotes.lawrenciumLoginNode) { + execute "rm slurm_job.sh slurm_job_runner.sh" + execute "ls -lah" + execute "wget $job_url" + execute "wget $job_runner_url" + execute "ls -lah" + execute "chmod +x slurm_job_runner.sh slurm_job.sh" + execute "ls -lah" + execute "./slurm_job_runner.sh \ + --BEAM_BRANCH_NAME='develop' \ + --BEAM_COMMIT_SHA='' \ + --BEAM_DATA_BRANCH_NAME='develop' \ + --BEAM_DATA_COMMIT_SHA='' \ + --BEAM_CONFIG='test/input/beamville/beam.conf' \ + --RUN_NAME='beamville_test' \ + --PROFILER='' \ + --PULL_CODE='true' \ + --PULL_DATA='true' \ + --ACCOUNT='pc_beamcore' \ + --PARTITION='es1' \ + --QOS='es_normal' \ + --MEMORY_LIMIT='480' \ + --EXPECTED_EXECUTION_DURATION='0-00:30:00' \ + --DOCKER_IMAGE_NAME='beammodel/beam-environment' \ + --DOCKER_IMAGE_TAG='latest' \ + --S3_REGION='us-east-2' \ + --S3_PUBLISH='false' \ + --AWS_SECRET_ACCESS_KEY='' \ + --AWS_ACCESS_KEY_ID='' \ + --SEND_NOTIFICATION='false' \ + --SLACK_HOOK_WITH_TOKEN='' \ + --SIMULATIONS_SPREADSHEET_UPDATE_URL=''" + } + } + } +} \ No newline at end of file diff --git a/lawrencium/src/main/bash/Readme.txt b/lawrencium/src/main/bash/Readme.txt index 4394f37bbe5..6089a6910ca 100644 --- a/lawrencium/src/main/bash/Readme.txt +++ b/lawrencium/src/main/bash/Readme.txt @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53e4eb4c30589c3e830253b3b9160150135a983b84496c245a79a77d577ce9f0 -size 2630 +oid sha256:197a416902dba0f56158875406f99639131a937b52a3da6ef663c29d266e87c6 +size 3564 diff --git a/settings.gradle b/settings.gradle index 718c770c3a8..329f255c9aa 100755 --- a/settings.gradle +++ b/settings.gradle @@ -2,6 +2,7 @@ include 'beam-gui' include 'aws' include 'gcp' include 'nersc' +include 'lawrencium' include 'metrics2.0' include 'jupyter' From 5b0ddacf273924f3117e93b03b7e9c13d5ef0865 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 10 Jul 2023 18:06:34 +0300 Subject: [PATCH 03/32] bugfix --- lawrencium/src/main/bash/slurm_job_runner.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job_runner.sh b/lawrencium/src/main/bash/slurm_job_runner.sh index e6028868ed6..f25f451531f 100644 --- a/lawrencium/src/main/bash/slurm_job_runner.sh +++ b/lawrencium/src/main/bash/slurm_job_runner.sh @@ -67,8 +67,8 @@ export MAX_RAM="$MEMORY_LIMIT" export NOTIFICATION_TITLED="$USER/$RUN_NAME" -RANDOM_PART=$(tr -dc A-Z0-9 Date: Mon, 10 Jul 2023 18:45:11 +0300 Subject: [PATCH 04/32] fix for optional params --- lawrencium/src/main/bash/slurm_job_runner.sh | 33 ++++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job_runner.sh b/lawrencium/src/main/bash/slurm_job_runner.sh index f25f451531f..dde05df4308 100644 --- a/lawrencium/src/main/bash/slurm_job_runner.sh +++ b/lawrencium/src/main/bash/slurm_job_runner.sh @@ -1,13 +1,19 @@ #!/bin/bash +# Full list of optional input parameters +optional_input_parameters=( + BEAM_COMMIT_SHA + BEAM_DATA_COMMIT_SHA +) + # Full list of input parameters required to run the script input_parameters=( - BEAM_BRANCH_NAME BEAM_COMMIT_SHA # code branch and commit - BEAM_DATA_BRANCH_NAME BEAM_DATA_COMMIT_SHA # data branch and commit - BEAM_CONFIG # path to beam config - RUN_NAME # the name of simulation (will be used in notifications) - PROFILER # either empty, 'cpu' or 'cpumem' - MAX_RAM # max ram for beam + BEAM_BRANCH_NAME # code branch + BEAM_DATA_BRANCH_NAME # data branch + BEAM_CONFIG # path to beam config + RUN_NAME # the name of simulation (will be used in notifications) + PROFILER # either empty, 'cpu' or 'cpumem' + MAX_RAM # max ram for beam # BEAM-environment docker image name and docker image tag separately # i.e. 'beammodel/beam-environment' and 'latest' @@ -48,6 +54,21 @@ while [ $# -gt 0 ]; do done +# Reading optional variables case-insensitively from input parameters according to optional_input_parameters list. +# Read variables are exported into environment. +while [ $# -gt 0 ]; do + for var_name in "${optional_input_parameters[@]}"; do + var_value=${1#*=} + # check if variable name in lower case and '=' symbol are in parameter + # check if variable value is not empty + if [[ ${1,,} == --"${var_name,,}="* && -n "$var_value" ]] ; then + export "$var_name"="$var_value" + fi + done + shift +done + + # Checking that all required variables were set. for var_name in "${input_parameters[@]}" ; do var_value="${!var_name}" From fa19c28a89347cdda30ed2f90a9c0b27721a96b9 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 10 Jul 2023 19:10:23 +0300 Subject: [PATCH 05/32] optional param --- lawrencium/src/main/bash/slurm_job_runner.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lawrencium/src/main/bash/slurm_job_runner.sh b/lawrencium/src/main/bash/slurm_job_runner.sh index dde05df4308..c798bd9ec6b 100644 --- a/lawrencium/src/main/bash/slurm_job_runner.sh +++ b/lawrencium/src/main/bash/slurm_job_runner.sh @@ -4,6 +4,7 @@ optional_input_parameters=( BEAM_COMMIT_SHA BEAM_DATA_COMMIT_SHA + PROFILER # either empty, 'cpu' or 'cpumem' ) # Full list of input parameters required to run the script @@ -12,7 +13,6 @@ input_parameters=( BEAM_DATA_BRANCH_NAME # data branch BEAM_CONFIG # path to beam config RUN_NAME # the name of simulation (will be used in notifications) - PROFILER # either empty, 'cpu' or 'cpumem' MAX_RAM # max ram for beam # BEAM-environment docker image name and docker image tag separately From ca0d1399a6422cb7dddb1765ca8ccd5b55ccd2f7 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 12 Jul 2023 17:13:00 +0300 Subject: [PATCH 06/32] splitting full docker image name into pieces --- lawrencium/src/main/bash/slurm_job.sh | 2 +- lawrencium/src/main/bash/slurm_job_runner.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job.sh b/lawrencium/src/main/bash/slurm_job.sh index f7850134d48..b59bd3d8088 100644 --- a/lawrencium/src/main/bash/slurm_job.sh +++ b/lawrencium/src/main/bash/slurm_job.sh @@ -19,7 +19,7 @@ export NOTIFICATION_INSTANCE_REGION="" # there is no shutdown wait when we using Lawrencium export NOTIFICATION_SHUTDOWN_WAIT="" -FULL_DOCKER_IMAGE_NAME="docker://${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" +FULL_DOCKER_IMAGE_NAME="docker://${DOCKER_IMAGE_NAMESPACE}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" SINGULARITY_IMAGE_NAME="${DOCKER_IMAGE_NAME}_${DOCKER_IMAGE_TAG}.sif" # to use https for pulling data repository despite a url configured for it diff --git a/lawrencium/src/main/bash/slurm_job_runner.sh b/lawrencium/src/main/bash/slurm_job_runner.sh index c798bd9ec6b..2bb3c8163cf 100644 --- a/lawrencium/src/main/bash/slurm_job_runner.sh +++ b/lawrencium/src/main/bash/slurm_job_runner.sh @@ -15,9 +15,9 @@ input_parameters=( RUN_NAME # the name of simulation (will be used in notifications) MAX_RAM # max ram for beam - # BEAM-environment docker image name and docker image tag separately - # i.e. 'beammodel/beam-environment' and 'latest' - DOCKER_IMAGE_NAME DOCKER_IMAGE_TAG + # BEAM-environment docker image namespace, name and docker image tag separately + # i.e. 'beammodel' for namespace, 'beam-environment' for name and 'latest' as a tag + DOCKER_IMAGE_NAMESPACE DOCKER_IMAGE_NAME DOCKER_IMAGE_TAG S3_REGION S3_PUBLISH # if uploading to s3 required - both AWS key parts should be set AWS_SECRET_ACCESS_KEY AWS_ACCESS_KEY_ID From 8b5c585fb45c59503a0e7474caeb621b2bded4c1 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 12 Jul 2023 17:13:11 +0300 Subject: [PATCH 07/32] correct urls --- lawrencium/build.gradle | 63 +++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index 47a24aee488..4bd988996ab 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -43,41 +43,42 @@ tasks.register("ltest") { password = "Zomb1986" + project.findProperty("otp") } - def job_url = "https://github.com/LBNL-UCB-STI/beam/blob/16ad59323af0e92ff41ea72144a9ac33e9dbab9b/lawrencium/src/main/bash/slurm_job.sh" - def job_runner_url = "https://github.com/LBNL-UCB-STI/beam/blob/16ad59323af0e92ff41ea72144a9ac33e9dbab9b/lawrencium/src/main/bash/slurm_job_runner.sh" + def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash" session(remotes.lawrenciumLoginNode) { - execute "rm slurm_job.sh slurm_job_runner.sh" - execute "ls -lah" - execute "wget $job_url" - execute "wget $job_runner_url" - execute "ls -lah" + execute "rm -f slurm_job.sh" + execute "rm -f slurm_job_runner.sh" + execute "wget $base_path/slurm_job.sh" + execute "wget $base_path/slurm_job_runner.sh" execute "chmod +x slurm_job_runner.sh slurm_job.sh" execute "ls -lah" - execute "./slurm_job_runner.sh \ - --BEAM_BRANCH_NAME='develop' \ - --BEAM_COMMIT_SHA='' \ - --BEAM_DATA_BRANCH_NAME='develop' \ - --BEAM_DATA_COMMIT_SHA='' \ - --BEAM_CONFIG='test/input/beamville/beam.conf' \ - --RUN_NAME='beamville_test' \ - --PROFILER='' \ - --PULL_CODE='true' \ - --PULL_DATA='true' \ - --ACCOUNT='pc_beamcore' \ - --PARTITION='es1' \ - --QOS='es_normal' \ - --MEMORY_LIMIT='480' \ - --EXPECTED_EXECUTION_DURATION='0-00:30:00' \ - --DOCKER_IMAGE_NAME='beammodel/beam-environment' \ - --DOCKER_IMAGE_TAG='latest' \ - --S3_REGION='us-east-2' \ - --S3_PUBLISH='false' \ - --AWS_SECRET_ACCESS_KEY='' \ - --AWS_ACCESS_KEY_ID='' \ - --SEND_NOTIFICATION='false' \ - --SLACK_HOOK_WITH_TOKEN='' \ - --SIMULATIONS_SPREADSHEET_UPDATE_URL=''" + execute "./slurm_job_runner.sh " + + "--BEAM_BRANCH_NAME='develop' " + + "--BEAM_COMMIT_SHA='' " + + "--BEAM_DATA_BRANCH_NAME='develop' " + + "--BEAM_DATA_COMMIT_SHA='' " + + "--BEAM_CONFIG='test/input/beamville/beam.conf' " + + "--RUN_NAME='beamville_test' " + + "--MAX_RAM='16' " + + "--PROFILER='' " + + "--PULL_CODE='true' " + + "--PULL_DATA='true' " + + "--ACCOUNT='pc_beamcore' " + + "--PARTITION='es1' " + + "--QOS='es_normal' " + + "--MEMORY_LIMIT='480' " + + "--EXPECTED_EXECUTION_DURATION='0-00:30:00' " + + "--DOCKER_IMAGE_NAMESPACE='beammodel' " + + "--DOCKER_IMAGE_NAME='beam-environment' " + + "--DOCKER_IMAGE_TAG='latest' " + + "--S3_REGION='us-east-2' " + + "--S3_PUBLISH='false' " + + "--AWS_SECRET_ACCESS_KEY='?' " + + "--AWS_ACCESS_KEY_ID='?' " + + "--SLACK_HOOK_WITH_TOKEN='?' " + + "--SEND_NOTIFICATION='false' " + + "--SLACK_HOOK_WITH_TOKEN='' " + + "--SIMULATIONS_SPREADSHEET_UPDATE_URL='?' " } } } From f111f21f3964954e74615c8977befac58709bfbb Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 17 Jul 2023 15:40:04 +0300 Subject: [PATCH 08/32] gradle functions --- lawrencium/build.gradle | 49 ++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index 4bd988996ab..2f1761d4d6a 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -1,3 +1,5 @@ +import java.time.Instant + group = 'beam' version = '0.8.0' @@ -24,22 +26,14 @@ remotes { host = 'lrc-login.lbl.gov' user = "${lawrenciumUser}" } -// require ssh.settings.authentications to include "password" -// execute from ssh.run inside session -> put from: "${project.rootDir}/lawrencium/src/main/bash/slurm_job.sh", into: "slurm_job.sh" -// execute from ssh.run inside session -> put from: "${project.rootDir}/lawrencium/src/main/bash/slurm_job_runner.sh", into: "slurm_job_runner.sh" -// lawrenciumTransferNode { -// host = 'lrc-xfer.lbl.gov' -// user = "${lawrenciumUser}" -// } } -tasks.register("ltest") { +tasks.register("deployToLawrencium") { doLast { ssh.run { settings { - authentications = [ "keyboard-interactive" ] + authentications = ["keyboard-interactive"] knownHosts = allowAnyHosts - fileTransfer = "scp" password = "Zomb1986" + project.findProperty("otp") } @@ -48,10 +42,14 @@ tasks.register("ltest") { session(remotes.lawrenciumLoginNode) { execute "rm -f slurm_job.sh" execute "rm -f slurm_job_runner.sh" + // downloading files instead of sending them + // because it is not allowed to push files to login node + // file transfer node uses different type of authentication + // and both nodes are not working together within one session for some reason execute "wget $base_path/slurm_job.sh" execute "wget $base_path/slurm_job_runner.sh" execute "chmod +x slurm_job_runner.sh slurm_job.sh" - execute "ls -lah" + // execute "ls -lah" execute "./slurm_job_runner.sh " + "--BEAM_BRANCH_NAME='develop' " + "--BEAM_COMMIT_SHA='' " + @@ -79,6 +77,35 @@ tasks.register("ltest") { "--SEND_NOTIFICATION='false' " + "--SLACK_HOOK_WITH_TOKEN='' " + "--SIMULATIONS_SPREADSHEET_UPDATE_URL='?' " + execute "squeue -u $lawrenciumUser" + } + } + } +} + + +tasks.register("lawrenciumQueue") { + doLast { + ssh.run { + settings { + authentications = ["keyboard-interactive"] + knownHosts = allowAnyHosts + password = "Zomb1986" + project.findProperty("otp") + } + + def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time" + + if (!project.hasProperty("fromDate")) { + Instant now = Instant.now() + Instant yesterday = now.minus(1, ChronoUnit.DAYS) + ext.fromDate = DateTimeFormatter.ofPattern("yyyyMMdd") + .withZone(ZoneId.of("UTC")) + .format(yesterday) + } + + + session(remotes.lawrenciumLoginNode) { + execute "sacct -u $lawrenciumUser --format=$sacctFormat -S 2023-03-2" } } } From 85b3c2bd5353d27d71194b3b377d655d6276589f Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 17 Jul 2023 17:51:14 +0300 Subject: [PATCH 09/32] changes in hash code of folders\submodules --- production/newyork | 2 +- production/sfbay | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/production/newyork b/production/newyork index 31c64acc53e..64e42468aab 160000 --- a/production/newyork +++ b/production/newyork @@ -1 +1 @@ -Subproject commit 31c64acc53ee235c53d304d2c8d1131398399b2c +Subproject commit 64e42468aabf21f5d16f14184b06234f89756e09 diff --git a/production/sfbay b/production/sfbay index 85f8c19a213..5fcac035282 160000 --- a/production/sfbay +++ b/production/sfbay @@ -1 +1 @@ -Subproject commit 85f8c19a213b2918c0832f28a1d978cb85900069 +Subproject commit 5fcac035282b941643a7e6e7c07cb503c52914ec From b7fd0e843175fdbc850fa7e70fe6d812364de5d5 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 19 Jul 2023 17:27:21 +0300 Subject: [PATCH 10/32] formatting --- nersc/build.gradle | 48 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/nersc/build.gradle b/nersc/build.gradle index d59596862e4..1947164c358 100644 --- a/nersc/build.gradle +++ b/nersc/build.gradle @@ -30,34 +30,34 @@ remotes { import org.apache.tools.ant.filters.ReplaceTokens task deployToNersc { - def propsFileName = "${project.rootDir}/gradle.deploy.properties" - if (project.hasProperty('propsFile')) { - propsFileName = project.findProperty('propsFile') - } + def propsFileName = "${project.rootDir}/gradle.deploy.properties" + if (project.hasProperty('propsFile')) { + propsFileName = project.findProperty('propsFile') + } - def propsFile = new Properties() - propsFile.load(project.file(propsFileName).newDataInputStream()) + def propsFile = new Properties() + propsFile.load(project.file(propsFileName).newDataInputStream()) - ext.getParameterValue = { paramName -> - if (project.hasProperty(paramName)) { - return project.findProperty(paramName) - } else { - return propsFile.getProperty(paramName) - } + ext.getParameterValue = { paramName -> + if (project.hasProperty(paramName)) { + return project.findProperty(paramName) + } else { + return propsFile.getProperty(paramName) } + } - def runName = "${ext.getParameterValue('runName')}" - def git_user_email = "${getCurrentGitUserEmail()}" - def branch = "${ext.getParameterValue('beamBranch') ?: getCurrentGitBranch()}" - def dataBranch = "${ext.getParameterValue('dataBranch') ?: 'develop'}" - def commit = "${ext.getParameterValue('beamCommit') ?: 'HEAD'}" - def configs = "${ext.getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs')}" - def max_ram = '100g' - def google_api_key = "${System.getenv('GOOGLE_API_KEY') ?: 'not-set-in-env'}" - def s3_publish = "${ext.getParameterValue('s3Backup') ?: true}" - def aws_access_key_id = "${System.getenv('AWS_ACCESS_KEY_ID') ?: 'not-set-in-env'}" - def aws_secret_access_key = "${System.getenv('AWS_SECRET_ACCESS_KEY') ?: 'not-set-in-env'}" - def region = "${ext.getParameterValue('region') ?: defaultRegion}" + def runName = "${ext.getParameterValue('runName')}" + def git_user_email = "${getCurrentGitUserEmail()}" + def branch = "${ext.getParameterValue('beamBranch') ?: getCurrentGitBranch()}" + def dataBranch = "${ext.getParameterValue('dataBranch') ?: 'develop'}" + def commit = "${ext.getParameterValue('beamCommit') ?: 'HEAD'}" + def configs = "${ext.getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs')}" + def max_ram = '100g' + def google_api_key = "${System.getenv('GOOGLE_API_KEY') ?: 'not-set-in-env'}" + def s3_publish = "${ext.getParameterValue('s3Backup') ?: true}" + def aws_access_key_id = "${System.getenv('AWS_ACCESS_KEY_ID') ?: 'not-set-in-env'}" + def aws_secret_access_key = "${System.getenv('AWS_SECRET_ACCESS_KEY') ?: 'not-set-in-env'}" + def region = "${ext.getParameterValue('region') ?: defaultRegion}" doFirst { copy { From a32759b87dea77159f7b45ac9de46614340a4171 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Thu, 20 Jul 2023 18:16:01 +0300 Subject: [PATCH 11/32] lawrencium parameters --- gradle.deploy.properties | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/gradle.deploy.properties b/gradle.deploy.properties index 98c76984c3f..31a597ee7ec 100644 --- a/gradle.deploy.properties +++ b/gradle.deploy.properties @@ -11,6 +11,19 @@ beamConfigs=test/input/beamville/beam.conf cloudPlatform=Google instanceType=n2d-standard-4 +# Lawrencium cluster specifics +lawrenciumPartition=es1 +lawrenciumMemoryLimit=480 +lawrenciumQoS=es_normal +lawrenciumAccount=pc_beamcore +# the name of the user in cluster +lawrenciumUser= +# the password of the user in cluster, +# the gradle command also should include the one time password: -Potp= +lawrenciumPassword= +# expected duration of simulation to run on Lawrencium, should be less than maximum of 3 days. +expectedDuration='0-00:10:00' + # shutdownBehaviour = stop | terminate shutdownBehaviour=terminate @@ -20,6 +33,13 @@ s3Backup=true # for example: helics/run_pydss_federate.sh or helics/run_site_power_controller.sh cosimulationShellScript= +# parameters for configuring notifications, +# so far required only for Lawrencium cluster. +# should be set for notifications to work +sentNotification=true +# slackHookWithToken= +# simulationsSpreadsheetUrl= + # Run Jupyter Notebook together with BEAM runJupyter=false From a65f7460ef9395ca492ec57d0a4138d53e017be3 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Thu, 20 Jul 2023 18:16:30 +0300 Subject: [PATCH 12/32] gradle functions to run on cluster --- lawrencium/build.gradle | 194 ++++++++++++++----- lawrencium/src/main/bash/slurm_job_runner.sh | 1 + 2 files changed, 145 insertions(+), 50 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index 2f1761d4d6a..7a44a032eb7 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -1,5 +1,3 @@ -import java.time.Instant - group = 'beam' version = '0.8.0' @@ -17,26 +15,127 @@ buildscript { apply plugin: 'org.hidetake.ssh' -if (!project.hasProperty("lawrenciumUser")) { - ext.lawrenciumUser = "nikolay" +List requiredProps = ["lawrenciumUser", "lawrenciumPassword", "otp"] + +for (prop in requiredProps) { + if (!project.hasProperty(prop)) { + println "Property '$prop' is required and should be set!" + } +} + +def propsFileName = "${project.rootDir}/gradle.deploy.properties" +if (project.hasProperty('propsFile')) { + propsFileName = project.findProperty('propsFile') +} + +def propsFile = new Properties() +propsFile.load(project.file(propsFileName).newDataInputStream()) + +ext.getParameterValue = { String paramName -> + if (project.hasProperty(paramName)) { + return project.findProperty(paramName) + } else { + return propsFile.getProperty(paramName) + } +} + +ext.getParameterOrEnvironmentValue = { String paramName -> + if (project.hasProperty(paramName)) { + return project.findProperty(paramName) + } else if (System.getenv(paramName)) { + return System.getenv(paramName) + } else { + return propsFile.getProperty(paramName) + } } remotes { lawrenciumLoginNode { host = 'lrc-login.lbl.gov' - user = "${lawrenciumUser}" + user = project.findProperty('lawrenciumUser') ?: 'user-not-specified' + } +} + +ssh { + settings { + authentications = ["keyboard-interactive"] + knownHosts = allowAnyHosts } } tasks.register("deployToLawrencium") { doLast { ssh.run { - settings { - authentications = ["keyboard-interactive"] - knownHosts = allowAnyHosts - password = "Zomb1986" + project.findProperty("otp") - } + settings { password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") } + + def runName = getParameterValue('runName') + + def branch = getParameterValue('beamBranch') ?: getCurrentGitBranch() + def dataBranch = getParameterValue('dataBranch') ?: 'develop' + + def commit = getParameterValue('beamCommit') ?: 'HEAD' + def dataCommit = getParameterValue('dataCommit') ?: 'HEAD' + + def configs = getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs') + + // partition, memory limit and QoS should be changed together + def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' + def lawrenciumMemoryLimit = getParameterValue('lawrenciumMemoryLimit') ?: '480' + def lawrenciumQoS = getParameterValue('lawrenciumQoS') ?: 'es_normal' + // using memory limit for lawrencium cluster is usually fine + def max_ram = getParameterValue('forcedMaxRAM') ?: lawrenciumMemoryLimit + def profiler_type = getParameterValue('profiler_type') ?: 'cpumem' + + def sentNotification = getParameterValue('sentNotification') ?: true + def slackHookWithToken = getParameterValue('slackHookWithToken') ?: 'not-set' + def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: 'not-set' + + def google_api_key = getParameterOrEnvironmentValue('GOOGLE_API_KEY') ?: 'not-set' + + def s3_publish = getParameterValue('s3Backup') ?: true + def region = getParameterValue('region') ?: defaultRegion + def aws_access_key_id = getParameterOrEnvironmentValue('AWS_ACCESS_KEY_ID') ?: 'not-set' + def aws_secret_access_key = getParameterOrEnvironmentValue('AWS_SECRET_ACCESS_KEY') ?: 'not-set' + + // by-default maximum possible value is used + def expectedDuration = getParameterValue('expectedDuration') ?: '3-00:00:00' + + // in case the simulation should be run under different lawrencium account + def lawrenciumAccount = getParameterValue('lawrenciumAccount') ?: 'pc_beamcore' + + // which docker image to run + def dockerImageTag = getParameterValue('dockerImageTag') ?: 'latest' + def dockerImageName = getParameterValue('dockerImageName') ?: 'beam-environment' + def dockerImageNameSpace = getParameterValue('dockerImageNameSpace') ?: 'beammodel' + + def parametersToDeploy = "--RUN_NAME='$runName' " + + "--BEAM_BRANCH_NAME='$branch' " + + "--BEAM_COMMIT_SHA='$commit' " + + "--BEAM_DATA_BRANCH_NAME='$dataBranch' " + + "--BEAM_DATA_COMMIT_SHA='$dataCommit' " + + "--BEAM_CONFIG='$configs' " + + "--MAX_RAM='$max_ram' " + + "--PROFILER='$profiler_type' " + + "--ACCOUNT='$lawrenciumAccount' " + + "--PARTITION='$lawrenciumPartition' " + + "--QOS='$lawrenciumQoS' " + + "--MEMORY_LIMIT='$lawrenciumMemoryLimit' " + + "--EXPECTED_EXECUTION_DURATION='$expectedDuration' " + + "--S3_REGION='$region' " + + "--S3_PUBLISH='$s3_publish' " + + "--AWS_SECRET_ACCESS_KEY='$aws_secret_access_key' " + + "--AWS_ACCESS_KEY_ID='$aws_access_key_id' " + + "--SEND_NOTIFICATION='$sentNotification' " + + "--SLACK_HOOK_WITH_TOKEN='$slackHookWithToken' " + + "--SIMULATIONS_SPREADSHEET_UPDATE_URL='$simulationsSpreadsheetUrl' " + + "--GOOGLE_API_KEY='$google_api_key' " + + "--PULL_CODE='true' " + + "--PULL_DATA='true' " + + "--DOCKER_IMAGE_NAMESPACE='$dockerImageNameSpace' " + + "--DOCKER_IMAGE_NAME='$dockerImageName' " + + "--DOCKER_IMAGE_TAG='$dockerImageTag' " + def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash" session(remotes.lawrenciumLoginNode) { @@ -49,34 +148,8 @@ tasks.register("deployToLawrencium") { execute "wget $base_path/slurm_job.sh" execute "wget $base_path/slurm_job_runner.sh" execute "chmod +x slurm_job_runner.sh slurm_job.sh" - // execute "ls -lah" - execute "./slurm_job_runner.sh " + - "--BEAM_BRANCH_NAME='develop' " + - "--BEAM_COMMIT_SHA='' " + - "--BEAM_DATA_BRANCH_NAME='develop' " + - "--BEAM_DATA_COMMIT_SHA='' " + - "--BEAM_CONFIG='test/input/beamville/beam.conf' " + - "--RUN_NAME='beamville_test' " + - "--MAX_RAM='16' " + - "--PROFILER='' " + - "--PULL_CODE='true' " + - "--PULL_DATA='true' " + - "--ACCOUNT='pc_beamcore' " + - "--PARTITION='es1' " + - "--QOS='es_normal' " + - "--MEMORY_LIMIT='480' " + - "--EXPECTED_EXECUTION_DURATION='0-00:30:00' " + - "--DOCKER_IMAGE_NAMESPACE='beammodel' " + - "--DOCKER_IMAGE_NAME='beam-environment' " + - "--DOCKER_IMAGE_TAG='latest' " + - "--S3_REGION='us-east-2' " + - "--S3_PUBLISH='false' " + - "--AWS_SECRET_ACCESS_KEY='?' " + - "--AWS_ACCESS_KEY_ID='?' " + - "--SLACK_HOOK_WITH_TOKEN='?' " + - "--SEND_NOTIFICATION='false' " + - "--SLACK_HOOK_WITH_TOKEN='' " + - "--SIMULATIONS_SPREADSHEET_UPDATE_URL='?' " + execute "./slurm_job_runner.sh " + parametersToDeploy + println "squeue -u $lawrenciumUser" execute "squeue -u $lawrenciumUser" } } @@ -87,26 +160,47 @@ tasks.register("deployToLawrencium") { tasks.register("lawrenciumQueue") { doLast { ssh.run { - settings { - authentications = ["keyboard-interactive"] - knownHosts = allowAnyHosts - password = "Zomb1986" + project.findProperty("otp") - } + settings { password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") } def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time" if (!project.hasProperty("fromDate")) { - Instant now = Instant.now() - Instant yesterday = now.minus(1, ChronoUnit.DAYS) - ext.fromDate = DateTimeFormatter.ofPattern("yyyyMMdd") - .withZone(ZoneId.of("UTC")) - .format(yesterday) + Date today = new Date() + Integer daysPast = (project.findProperty('forDays') ?: '1').toInteger() + Date yesterday = today - daysPast + ext.fromDate = yesterday.format("yyyy-MM-dd") } + def commandToGetJobsListForUser = "sacct -u $lawrenciumUser --format=$sacctFormat -S $fromDate" + def commandToGetQueueForUser = "squeue -u $lawrenciumUser" session(remotes.lawrenciumLoginNode) { - execute "sacct -u $lawrenciumUser --format=$sacctFormat -S 2023-03-2" + println " " + println commandToGetQueueForUser + println commandToGetJobsListForUser + println " " + execute commandToGetQueueForUser + execute commandToGetJobsListForUser } } } -} \ No newline at end of file +} + + +tasks.register("lawrenciumNodes") { + doLast { + ssh.run { + settings { password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") } + + def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' + + def commandToGetNodesInfo = "sinfo -N --long --partition=$lawrenciumPartition" + + session(remotes.lawrenciumLoginNode) { + println " " + println commandToGetNodesInfo + execute commandToGetNodesInfo + } + } + } +} diff --git a/lawrencium/src/main/bash/slurm_job_runner.sh b/lawrencium/src/main/bash/slurm_job_runner.sh index 2bb3c8163cf..ed9ed37190c 100644 --- a/lawrencium/src/main/bash/slurm_job_runner.sh +++ b/lawrencium/src/main/bash/slurm_job_runner.sh @@ -5,6 +5,7 @@ optional_input_parameters=( BEAM_COMMIT_SHA BEAM_DATA_COMMIT_SHA PROFILER # either empty, 'cpu' or 'cpumem' + GOOGLE_API_KEY ) # Full list of input parameters required to run the script From 528381bfa7d53033cfd5c8e7d03c7a114d093e3a Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 26 Jul 2023 15:42:06 +0300 Subject: [PATCH 13/32] removing extra logging --- lawrencium/src/main/bash/slurm_job.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job.sh b/lawrencium/src/main/bash/slurm_job.sh index b59bd3d8088..56fac5edcd0 100644 --- a/lawrencium/src/main/bash/slurm_job.sh +++ b/lawrencium/src/main/bash/slurm_job.sh @@ -26,9 +26,7 @@ SINGULARITY_IMAGE_NAME="${DOCKER_IMAGE_NAME}_${DOCKER_IMAGE_TAG}.sif" export ENFORCE_HTTPS_FOR_DATA_REPOSITORY="true" echo "Pulling docker image '$FULL_DOCKER_IMAGE_NAME' ..." -set -x singularity pull --force "$FULL_DOCKER_IMAGE_NAME" -set +x echo "Running singularity image '$SINGULARITY_IMAGE_NAME' ..." singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" From 08f4e8be1726685cde9c29bb78d66f6b489e7d9b Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 26 Jul 2023 15:42:17 +0300 Subject: [PATCH 14/32] refactored --- lawrencium/build.gradle | 259 +++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 126 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index 7a44a032eb7..34f50dad423 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -1,3 +1,6 @@ +import com.amazonaws.auth.* +import com.amazonaws.auth.profile.ProfileCredentialsProvider + group = 'beam' version = '0.8.0' @@ -10,19 +13,12 @@ buildscript { } dependencies { classpath 'org.hidetake:gradle-ssh-plugin:2.10.1' + classpath 'com.amazonaws:aws-java-sdk:1.11.83' } } apply plugin: 'org.hidetake.ssh' -List requiredProps = ["lawrenciumUser", "lawrenciumPassword", "otp"] - -for (prop in requiredProps) { - if (!project.hasProperty(prop)) { - println "Property '$prop' is required and should be set!" - } -} - def propsFileName = "${project.rootDir}/gradle.deploy.properties" if (project.hasProperty('propsFile')) { propsFileName = project.findProperty('propsFile') @@ -63,144 +59,155 @@ ssh { } } +ext.executeLawrenciumSSH = { List commandsList -> + List requiredProps = ["lawrenciumUser", "lawrenciumPassword", "otp"] + for (prop in requiredProps) { + if (!getParameterValue(prop)) { + println "Property '$prop' is required and should be set!" + } + } + + ssh.run { + settings { + authentications = ["keyboard-interactive"] + knownHosts = allowAnyHosts + password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") + } + + session(remotes.lawrenciumLoginNode) { + for (command in commandsList) { + println command + execute command + } + } + } +} + + tasks.register("deployToLawrencium") { doLast { - ssh.run { - settings { password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") } - - def runName = getParameterValue('runName') - - def branch = getParameterValue('beamBranch') ?: getCurrentGitBranch() - def dataBranch = getParameterValue('dataBranch') ?: 'develop' - - def commit = getParameterValue('beamCommit') ?: 'HEAD' - def dataCommit = getParameterValue('dataCommit') ?: 'HEAD' - - def configs = getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs') - - // partition, memory limit and QoS should be changed together - def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' - def lawrenciumMemoryLimit = getParameterValue('lawrenciumMemoryLimit') ?: '480' - def lawrenciumQoS = getParameterValue('lawrenciumQoS') ?: 'es_normal' - - // using memory limit for lawrencium cluster is usually fine - def max_ram = getParameterValue('forcedMaxRAM') ?: lawrenciumMemoryLimit - def profiler_type = getParameterValue('profiler_type') ?: 'cpumem' - - def sentNotification = getParameterValue('sentNotification') ?: true - def slackHookWithToken = getParameterValue('slackHookWithToken') ?: 'not-set' - def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: 'not-set' - - def google_api_key = getParameterOrEnvironmentValue('GOOGLE_API_KEY') ?: 'not-set' - - def s3_publish = getParameterValue('s3Backup') ?: true - def region = getParameterValue('region') ?: defaultRegion - def aws_access_key_id = getParameterOrEnvironmentValue('AWS_ACCESS_KEY_ID') ?: 'not-set' - def aws_secret_access_key = getParameterOrEnvironmentValue('AWS_SECRET_ACCESS_KEY') ?: 'not-set' - - // by-default maximum possible value is used - def expectedDuration = getParameterValue('expectedDuration') ?: '3-00:00:00' - - // in case the simulation should be run under different lawrencium account - def lawrenciumAccount = getParameterValue('lawrenciumAccount') ?: 'pc_beamcore' - - // which docker image to run - def dockerImageTag = getParameterValue('dockerImageTag') ?: 'latest' - def dockerImageName = getParameterValue('dockerImageName') ?: 'beam-environment' - def dockerImageNameSpace = getParameterValue('dockerImageNameSpace') ?: 'beammodel' - - def parametersToDeploy = "--RUN_NAME='$runName' " + - "--BEAM_BRANCH_NAME='$branch' " + - "--BEAM_COMMIT_SHA='$commit' " + - "--BEAM_DATA_BRANCH_NAME='$dataBranch' " + - "--BEAM_DATA_COMMIT_SHA='$dataCommit' " + - "--BEAM_CONFIG='$configs' " + - "--MAX_RAM='$max_ram' " + - "--PROFILER='$profiler_type' " + - "--ACCOUNT='$lawrenciumAccount' " + - "--PARTITION='$lawrenciumPartition' " + - "--QOS='$lawrenciumQoS' " + - "--MEMORY_LIMIT='$lawrenciumMemoryLimit' " + - "--EXPECTED_EXECUTION_DURATION='$expectedDuration' " + - "--S3_REGION='$region' " + - "--S3_PUBLISH='$s3_publish' " + - "--AWS_SECRET_ACCESS_KEY='$aws_secret_access_key' " + - "--AWS_ACCESS_KEY_ID='$aws_access_key_id' " + - "--SEND_NOTIFICATION='$sentNotification' " + - "--SLACK_HOOK_WITH_TOKEN='$slackHookWithToken' " + - "--SIMULATIONS_SPREADSHEET_UPDATE_URL='$simulationsSpreadsheetUrl' " + - "--GOOGLE_API_KEY='$google_api_key' " + - "--PULL_CODE='true' " + - "--PULL_DATA='true' " + - "--DOCKER_IMAGE_NAMESPACE='$dockerImageNameSpace' " + - "--DOCKER_IMAGE_NAME='$dockerImageName' " + - "--DOCKER_IMAGE_TAG='$dockerImageTag' " - - def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash" - - session(remotes.lawrenciumLoginNode) { - execute "rm -f slurm_job.sh" - execute "rm -f slurm_job_runner.sh" + def runName = getParameterValue('runName') + + def branch = getParameterValue('beamBranch') ?: getCurrentGitBranch() + def dataBranch = getParameterValue('dataBranch') ?: 'develop' + + def commit = getParameterValue('beamCommit') ?: 'HEAD' + def dataCommit = getParameterValue('dataCommit') ?: 'HEAD' + + def configs = getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs') + + // partition, memory limit and QoS should be changed together + def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' + def lawrenciumMemoryLimit = getParameterValue('lawrenciumMemoryLimit') ?: '480' + def lawrenciumQoS = getParameterValue('lawrenciumQoS') ?: 'es_normal' + + // using memory limit for lawrencium cluster is usually fine + def max_ram = getParameterValue('forcedMaxRAM') ?: lawrenciumMemoryLimit + def profiler_type = getParameterValue('profiler_type') ?: 'cpumem' + + def sentNotification = getParameterValue('sentNotification') ?: true + def slackHookWithToken = getParameterValue('slackHookWithToken') ?: 'not-set' + def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: 'not-set' + + def google_api_key = getParameterOrEnvironmentValue('GOOGLE_API_KEY') ?: 'not-set' + + def s3_publish = getParameterValue('s3Backup') ?: true + def region = getParameterValue('region') ?: defaultRegion + + // reading AWS credentials from various credentials providers + AWSCredentialsProvider creds = new AWSCredentialsProviderChain( + new EnvironmentVariableCredentialsProvider(), + new SystemPropertiesCredentialsProvider(), + new ProfileCredentialsProvider(), + new EC2ContainerCredentialsProviderWrapper() + ) + + String aws_access_key_id = creds.getCredentials().AWSAccessKeyId + String aws_secret_access_key = creds.getCredentials().AWSSecretKey + + // by-default maximum possible value is used + def expectedDuration = getParameterValue('expectedDuration') ?: '3-00:00:00' + + // in case the simulation should be run under different lawrencium account + def lawrenciumAccount = getParameterValue('lawrenciumAccount') ?: 'pc_beamcore' + + // which docker image to run + def dockerImageTag = getParameterValue('dockerImageTag') ?: 'latest' + def dockerImageName = getParameterValue('dockerImageName') ?: 'beam-environment' + def dockerImageNameSpace = getParameterValue('dockerImageNameSpace') ?: 'beammodel' + + def parametersToDeploy = "--RUN_NAME='$runName' " + + "--BEAM_BRANCH_NAME='$branch' " + + "--BEAM_COMMIT_SHA='$commit' " + + "--BEAM_DATA_BRANCH_NAME='$dataBranch' " + + "--BEAM_DATA_COMMIT_SHA='$dataCommit' " + + "--BEAM_CONFIG='$configs' " + + "--MAX_RAM='$max_ram' " + + "--PROFILER='$profiler_type' " + + "--ACCOUNT='$lawrenciumAccount' " + + "--PARTITION='$lawrenciumPartition' " + + "--QOS='$lawrenciumQoS' " + + "--MEMORY_LIMIT='$lawrenciumMemoryLimit' " + + "--EXPECTED_EXECUTION_DURATION='$expectedDuration' " + + "--S3_REGION='$region' " + + "--S3_PUBLISH='$s3_publish' " + + "--AWS_SECRET_ACCESS_KEY='$aws_secret_access_key' " + + "--AWS_ACCESS_KEY_ID='$aws_access_key_id' " + + "--SEND_NOTIFICATION='$sentNotification' " + + "--SLACK_HOOK_WITH_TOKEN='$slackHookWithToken' " + + "--SIMULATIONS_SPREADSHEET_UPDATE_URL='$simulationsSpreadsheetUrl' " + + "--GOOGLE_API_KEY='$google_api_key' " + + "--PULL_CODE='true' " + + "--PULL_DATA='true' " + + "--DOCKER_IMAGE_NAMESPACE='$dockerImageNameSpace' " + + "--DOCKER_IMAGE_NAME='$dockerImageName' " + + "--DOCKER_IMAGE_TAG='$dockerImageTag' " + + def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash" + + executeLawrenciumSSH([ + "rm -f slurm_job.sh", + "rm -f slurm_job_runner.sh", // downloading files instead of sending them // because it is not allowed to push files to login node // file transfer node uses different type of authentication // and both nodes are not working together within one session for some reason - execute "wget $base_path/slurm_job.sh" - execute "wget $base_path/slurm_job_runner.sh" - execute "chmod +x slurm_job_runner.sh slurm_job.sh" - execute "./slurm_job_runner.sh " + parametersToDeploy - println "squeue -u $lawrenciumUser" - execute "squeue -u $lawrenciumUser" - } - } + "wget $base_path/slurm_job.sh", + "wget $base_path/slurm_job_runner.sh", + "chmod +x slurm_job_runner.sh slurm_job.sh", + "./slurm_job_runner.sh " + parametersToDeploy, + "squeue -u $lawrenciumUser" + ]) } } tasks.register("lawrenciumQueue") { doLast { - ssh.run { - settings { password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") } - - def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time" - - if (!project.hasProperty("fromDate")) { - Date today = new Date() - Integer daysPast = (project.findProperty('forDays') ?: '1').toInteger() - Date yesterday = today - daysPast - ext.fromDate = yesterday.format("yyyy-MM-dd") - } + def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time" + if (!project.hasProperty("fromDate")) { + Date today = new Date() + Integer daysPast = (project.findProperty('forDays') ?: '1').toInteger() + Date yesterday = today - daysPast + ext.fromDate = yesterday.format("yyyy-MM-dd") + } - def commandToGetJobsListForUser = "sacct -u $lawrenciumUser --format=$sacctFormat -S $fromDate" - def commandToGetQueueForUser = "squeue -u $lawrenciumUser" + def commandToGetJobsListForUser = "sacct -u $lawrenciumUser --format=$sacctFormat -S $fromDate" + def commandToGetQueueForUser = "squeue -u $lawrenciumUser" - session(remotes.lawrenciumLoginNode) { - println " " - println commandToGetQueueForUser - println commandToGetJobsListForUser - println " " - execute commandToGetQueueForUser - execute commandToGetJobsListForUser - } - } + executeLawrenciumSSH([ + commandToGetJobsListForUser, + commandToGetQueueForUser + ]) } } tasks.register("lawrenciumNodes") { doLast { - ssh.run { - settings { password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") } - - def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' + def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' + def commandToGetNodesInfo = "sinfo -N --long --partition=$lawrenciumPartition" - def commandToGetNodesInfo = "sinfo -N --long --partition=$lawrenciumPartition" - - session(remotes.lawrenciumLoginNode) { - println " " - println commandToGetNodesInfo - execute commandToGetNodesInfo - } - } + executeLawrenciumSSH([commandToGetNodesInfo]) } } From 9b5b9c31f99bab88e8d831a839cd87dc7f932388 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 26 Jul 2023 16:39:46 +0300 Subject: [PATCH 15/32] documentation of parameteres to deploy to lawrencium --- docs/developers.rst | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/docs/developers.rst b/docs/developers.rst index 0be170a0345..d94e47fd05b 100755 --- a/docs/developers.rst +++ b/docs/developers.rst @@ -240,8 +240,48 @@ Similarly for experiment batch, you can specify comma-separated experiment files For demo and presentation material, please follow the link_ on google drive. -BEAM run on NERSC -~~~~~~~~~~~~~~~~~ +BEAM run on Lawrencium cluster +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to run BEAM on Lawrencium cluster one needs to get a user and password and configure OTP (one time password) generator for accessing the cluster. +Each time one runs a command on a cluster a user name, user password and one time password should be specified:: + + ./gradlew deployToLawrencium --PlawrenciumUser= -PlawrenciumPassword= -Potp= + +You need to define the deploy properties that are similar to the ones for AWS deploy. +Lawrencium-specific properties, such as lawrenciumPartition and lawrenciumQoS are a combination from 'sacctmgr show association -p user=$USER' command, +lawrenciumMemoryLimit should have a value a bit less than the amount of memory that node from selected partition usually has. + + +* **runName**: to specify instance name. +* **beamBranch**: To specify the branch for simulation, current source branch will be used as default branch. +* **beamCommit**: The commit SHA to run simulation. use `HEAD` if you want to run with latest commit, default is `HEAD`. +* **dataBranch**: To specify the data branch (branch on production data repository) for simulation, 'develop' branch will be used as default data branch. +* **dataCommit**: The commit SHA for the the data branch, default is `HEAD` +* **beamConfigs**: A comma `,` separated list of `beam.conf` files. It should be relative path under the project home. You can create branch level defaults by specifying the branch name with `.configs` suffix like `master.configs`. Branch level default will be used if `beamConfigs` is not present. +* **s3Backup**: to specify if copying results to s3 bucket is needed, default is `true`. +* **region**: Use this parameter to select the AWS region for the run, all instances would be created in specified region. Default `region` is `us-east-2`. + +* **sentNotification**: The boolean flag that turn on / off sending notifications about simulation start / stop to slack and google spreadsheet. +* **slackHookWithToken**: The URL for slack hook with token in it for sending slack notifications. +* **simulationsSpreadsheetUrl**: The URL for sending json in order to update simulations spreadsheet. + +* **lawrenciumPartition**: A name of a partition on which deploy should be done. By-default 'es1' as best choice. +* **lawrenciumMemoryLimit**: A memory limit for a node to pick from the selected partition. By-default '480' as best choice. +* **lawrenciumQoS**: A name of QoS for the selected partition. Currently for 'es1' - 'es_normal'. + +* **lawrenciumAccount** OPTIONAL: The account that will be used in order to start a job on the cluster, filled-in by-default. +* **expectedDuration** OPTIONAL: The expected duration of simulation, the current maximum and a default value is 3 days ('3-00:00:00'). +* **forcedMaxRAM** OPTIONAL: By-default it is equal to lawrenciumMemoryLimit, one needs to change it if for some reasons this amount of memory is not correct for running BEAM. + +* **dockerImageNameSpace**, **dockerImageName**, **dockerImageTag** OPTIONAL: beam-environment docker parameters, by-default are: 'beammodel', 'beam-environment' and 'latest' respectively. + + +Your task is going to be added to the queue and when it starts/finishes you receive a notification on your git user email. It may take 1-24 hours (or even more) for the task to get started. It depends on the NERSC workload. In your user home directory on NERSC you can find the output file of your task that looks like `slurm-.out`. The BEAM output directory is resides at `$SCRATCH/beam_runs/`. Also the output is uploaded to s3 if `s3Backup` is set to true. + + +BEAM run on NERSC cluster +~~~~~~~~~~~~~~~~~~~~~~~~~ In order to run BEAM on NERSC one needs to get an `ssh key `_ that allows you to ssh to NERSC systems without further authentication until the key expires (24 hours). You also need to specify your user name on NERSC in the following property: **nerscUser**, i.e:: From 1138521bf7a103ec0cd3cbbb8a5005c1e5ddec57 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 26 Jul 2023 17:52:00 +0300 Subject: [PATCH 16/32] documentation of parameters for lawrencium gradle commands --- docs/developers.rst | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/developers.rst b/docs/developers.rst index d94e47fd05b..5f89cbdde57 100755 --- a/docs/developers.rst +++ b/docs/developers.rst @@ -244,14 +244,19 @@ BEAM run on Lawrencium cluster ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In order to run BEAM on Lawrencium cluster one needs to get a user and password and configure OTP (one time password) generator for accessing the cluster. -Each time one runs a command on a cluster a user name, user password and one time password should be specified:: +Each time one runs a command on a cluster a user name, user password and one time password should be specified :: - ./gradlew deployToLawrencium --PlawrenciumUser= -PlawrenciumPassword= -Potp= + ./gradlew --PlawrenciumUser= -PlawrenciumPassword= -Potp= You need to define the deploy properties that are similar to the ones for AWS deploy. Lawrencium-specific properties, such as lawrenciumPartition and lawrenciumQoS are a combination from 'sacctmgr show association -p user=$USER' command, lawrenciumMemoryLimit should have a value a bit less than the amount of memory that node from selected partition usually has. +To deploy a beam simulation to the lawrencium cluster one needs to use 'deployToLawrencium' command with following parameters specified +either directly or as parameters in gradle.deploy.properties file :: + ./gradlew deployToLawrencium --PlawrenciumUser= -PlawrenciumPassword= -Potp= + +Here are parameters that are required or optional for deploing to lawrencium cluster: * **runName**: to specify instance name. * **beamBranch**: To specify the branch for simulation, current source branch will be used as default branch. @@ -277,11 +282,23 @@ lawrenciumMemoryLimit should have a value a bit less than the amount of memory t * **dockerImageNameSpace**, **dockerImageName**, **dockerImageTag** OPTIONAL: beam-environment docker parameters, by-default are: 'beammodel', 'beam-environment' and 'latest' respectively. -Your task is going to be added to the queue and when it starts/finishes you receive a notification on your git user email. It may take 1-24 hours (or even more) for the task to get started. It depends on the NERSC workload. In your user home directory on NERSC you can find the output file of your task that looks like `slurm-.out`. The BEAM output directory is resides at `$SCRATCH/beam_runs/`. Also the output is uploaded to s3 if `s3Backup` is set to true. +The task is going to be added to the queue. It may take 1-24 hours (or even more) for the task to get started - it depends on the Lawrencium cluster workload. +The BEAM output directory is resides at '/global/scratch/users/$USER/out_beam_$DATETIME.$RANDOM_PART.$PARTITION.$QOS.$MEMORY_LIMIT'. +Also the output is uploaded to s3 if `s3Backup` is set to true. + + +There is a command to view the queue and history of jobs for a specific user :: + + ./gradlew lawrenciumQueue --PlawrenciumUser= -PlawrenciumPassword= -Potp= + +Here are parameters of the command: +* **fromDate** OPTIONAL: A date since which the queue and jobs history should be displayed. By-default one day before now. +* **forDays** OPTIONAL: An alternative way to specify a fromDate. Calculated as current day minus forDays. By-default 1. +* **queueUser** OPTIONAL: A user to retrieve queue and jobs history for, by-default has the same value as lawrenciumUser. -BEAM run on NERSC cluster -~~~~~~~~~~~~~~~~~~~~~~~~~ +BEAM run on NERSC +~~~~~~~~~~~~~~~~~ In order to run BEAM on NERSC one needs to get an `ssh key `_ that allows you to ssh to NERSC systems without further authentication until the key expires (24 hours). You also need to specify your user name on NERSC in the following property: **nerscUser**, i.e:: From 4a9097d62eb24d9c3126414475c46f76e26745ad Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 26 Jul 2023 17:52:19 +0300 Subject: [PATCH 17/32] using additional parameter to specify user --- lawrencium/build.gradle | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index 34f50dad423..b1213a98658 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -192,8 +192,12 @@ tasks.register("lawrenciumQueue") { ext.fromDate = yesterday.format("yyyy-MM-dd") } - def commandToGetJobsListForUser = "sacct -u $lawrenciumUser --format=$sacctFormat -S $fromDate" - def commandToGetQueueForUser = "squeue -u $lawrenciumUser" + if (!project.hasProperty("queueUser")) { + ext.queueUser = lawrenciumUser + } + + def commandToGetJobsListForUser = "sacct -u $queueUser --format=$sacctFormat -S $fromDate" + def commandToGetQueueForUser = "squeue -u $queueUser" executeLawrenciumSSH([ commandToGetJobsListForUser, From 27560837100e2d8c7608f2e029f720eb98f2a439 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 26 Jul 2023 18:12:54 +0300 Subject: [PATCH 18/32] debugging --- lawrencium/src/main/bash/slurm_job.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lawrencium/src/main/bash/slurm_job.sh b/lawrencium/src/main/bash/slurm_job.sh index 56fac5edcd0..61881150dd7 100644 --- a/lawrencium/src/main/bash/slurm_job.sh +++ b/lawrencium/src/main/bash/slurm_job.sh @@ -29,7 +29,9 @@ echo "Pulling docker image '$FULL_DOCKER_IMAGE_NAME' ..." singularity pull --force "$FULL_DOCKER_IMAGE_NAME" echo "Running singularity image '$SINGULARITY_IMAGE_NAME' ..." -singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" +# singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" + +singularity run hello-world_latest.sif echo "Removing a link to the job's log file." echo "The original job log file is in '$JOB_LOG_FILE_PATH'" From bf08d3f197cd17b04c8a3bc0986a3efb084a415f Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Fri, 28 Jul 2023 17:22:52 +0300 Subject: [PATCH 19/32] debugging --- lawrencium/src/main/bash/slurm_job.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job.sh b/lawrencium/src/main/bash/slurm_job.sh index 61881150dd7..cc8886eb7e9 100644 --- a/lawrencium/src/main/bash/slurm_job.sh +++ b/lawrencium/src/main/bash/slurm_job.sh @@ -28,9 +28,10 @@ export ENFORCE_HTTPS_FOR_DATA_REPOSITORY="true" echo "Pulling docker image '$FULL_DOCKER_IMAGE_NAME' ..." singularity pull --force "$FULL_DOCKER_IMAGE_NAME" -echo "Running singularity image '$SINGULARITY_IMAGE_NAME' ..." -# singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" +#echo "Running singularity image '$SINGULARITY_IMAGE_NAME' ..." +#singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" +echo "Running singularity run hello-world_latest.sif" singularity run hello-world_latest.sif echo "Removing a link to the job's log file." From 95e6f3e0014ae53dbbe4a5580c10d6d0ab94c801 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Fri, 28 Jul 2023 17:49:03 +0300 Subject: [PATCH 20/32] debugging --- lawrencium/src/main/bash/slurm_job.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job.sh b/lawrencium/src/main/bash/slurm_job.sh index cc8886eb7e9..7e65b9fdb23 100644 --- a/lawrencium/src/main/bash/slurm_job.sh +++ b/lawrencium/src/main/bash/slurm_job.sh @@ -31,8 +31,9 @@ singularity pull --force "$FULL_DOCKER_IMAGE_NAME" #echo "Running singularity image '$SINGULARITY_IMAGE_NAME' ..." #singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" -echo "Running singularity run hello-world_latest.sif" -singularity run hello-world_latest.sif +set -x +sudo singularity run hello-world_latest.sif +set +x echo "Removing a link to the job's log file." echo "The original job log file is in '$JOB_LOG_FILE_PATH'" From 7377bbe5f60092738bdeed68ffe75eb023c1472f Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Fri, 28 Jul 2023 19:47:31 +0300 Subject: [PATCH 21/32] debugging --- lawrencium/src/main/bash/debug_job.sh | 8 ++++++ lawrencium/src/main/bash/debug_job_runner.sh | 26 ++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 lawrencium/src/main/bash/debug_job.sh create mode 100644 lawrencium/src/main/bash/debug_job_runner.sh diff --git a/lawrencium/src/main/bash/debug_job.sh b/lawrencium/src/main/bash/debug_job.sh new file mode 100644 index 00000000000..3a0a019d52b --- /dev/null +++ b/lawrencium/src/main/bash/debug_job.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -x + +singularity pull --force "docker://hello-world:latest" +singularity run "hello-world_latest.sif" + +set +x diff --git a/lawrencium/src/main/bash/debug_job_runner.sh b/lawrencium/src/main/bash/debug_job_runner.sh new file mode 100644 index 00000000000..59f81d313c8 --- /dev/null +++ b/lawrencium/src/main/bash/debug_job_runner.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +RANDOM_PART="$(tr -dc A-Z0-9 Date: Sun, 30 Jul 2023 13:26:04 +0300 Subject: [PATCH 22/32] bugfix --- docs/developers.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/developers.rst b/docs/developers.rst index 5f89cbdde57..de552e30cd0 100755 --- a/docs/developers.rst +++ b/docs/developers.rst @@ -246,7 +246,7 @@ BEAM run on Lawrencium cluster In order to run BEAM on Lawrencium cluster one needs to get a user and password and configure OTP (one time password) generator for accessing the cluster. Each time one runs a command on a cluster a user name, user password and one time password should be specified :: - ./gradlew --PlawrenciumUser= -PlawrenciumPassword= -Potp= + ./gradlew -PlawrenciumUser= -PlawrenciumPassword= -Potp= You need to define the deploy properties that are similar to the ones for AWS deploy. Lawrencium-specific properties, such as lawrenciumPartition and lawrenciumQoS are a combination from 'sacctmgr show association -p user=$USER' command, @@ -254,7 +254,7 @@ lawrenciumMemoryLimit should have a value a bit less than the amount of memory t To deploy a beam simulation to the lawrencium cluster one needs to use 'deployToLawrencium' command with following parameters specified either directly or as parameters in gradle.deploy.properties file :: - ./gradlew deployToLawrencium --PlawrenciumUser= -PlawrenciumPassword= -Potp= + ./gradlew deployToLawrencium -PlawrenciumUser= -PlawrenciumPassword= -Potp= Here are parameters that are required or optional for deploing to lawrencium cluster: @@ -289,7 +289,7 @@ Also the output is uploaded to s3 if `s3Backup` is set to true. There is a command to view the queue and history of jobs for a specific user :: - ./gradlew lawrenciumQueue --PlawrenciumUser= -PlawrenciumPassword= -Potp= + ./gradlew lawrenciumQueue -PlawrenciumUser= -PlawrenciumPassword= -Potp= Here are parameters of the command: From 735ceebeee926c0320d3a381858d495149189399 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Sun, 30 Jul 2023 13:26:31 +0300 Subject: [PATCH 23/32] original scripts --- lawrencium/src/main/bash/slurm_job.sh | 9 +++------ lawrencium/src/main/bash/slurm_job_runner.sh | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/lawrencium/src/main/bash/slurm_job.sh b/lawrencium/src/main/bash/slurm_job.sh index 7e65b9fdb23..e4975f2f6fa 100644 --- a/lawrencium/src/main/bash/slurm_job.sh +++ b/lawrencium/src/main/bash/slurm_job.sh @@ -25,14 +25,11 @@ SINGULARITY_IMAGE_NAME="${DOCKER_IMAGE_NAME}_${DOCKER_IMAGE_TAG}.sif" # to use https for pulling data repository despite a url configured for it export ENFORCE_HTTPS_FOR_DATA_REPOSITORY="true" -echo "Pulling docker image '$FULL_DOCKER_IMAGE_NAME' ..." -singularity pull --force "$FULL_DOCKER_IMAGE_NAME" +set -x -#echo "Running singularity image '$SINGULARITY_IMAGE_NAME' ..." -#singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" +singularity pull --force "$FULL_DOCKER_IMAGE_NAME" +singularity run -B "$BEAM_DIR:/app/sources" "$SINGULARITY_IMAGE_NAME" -set -x -sudo singularity run hello-world_latest.sif set +x echo "Removing a link to the job's log file." diff --git a/lawrencium/src/main/bash/slurm_job_runner.sh b/lawrencium/src/main/bash/slurm_job_runner.sh index ed9ed37190c..eea4b417162 100644 --- a/lawrencium/src/main/bash/slurm_job_runner.sh +++ b/lawrencium/src/main/bash/slurm_job_runner.sh @@ -139,4 +139,4 @@ sbatch --partition="$PARTITION" \ --output="$JOB_LOG_FILE_PATH" \ --time="$EXPECTED_EXECUTION_DURATION" \ slurm_job.sh -set +x \ No newline at end of file +set +x From e3c5f12dbfb62c684684f80e48a0f836ddd2404f Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 2 Aug 2023 18:24:22 +0300 Subject: [PATCH 24/32] debugging lawrencuim --- lawrencium/build.gradle | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index b1213a98658..c85b53b56b1 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -52,13 +52,6 @@ remotes { } } -ssh { - settings { - authentications = ["keyboard-interactive"] - knownHosts = allowAnyHosts - } -} - ext.executeLawrenciumSSH = { List commandsList -> List requiredProps = ["lawrenciumUser", "lawrenciumPassword", "otp"] for (prop in requiredProps) { @@ -165,17 +158,24 @@ tasks.register("deployToLawrencium") { def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash" +// def job_name = "slurm_job.sh" +// def runner_name = "slurm_job_runner.sh" + + def job_name = "debug_job.sh" + def runner_name = "debug_job_runner.sh" + executeLawrenciumSSH([ - "rm -f slurm_job.sh", - "rm -f slurm_job_runner.sh", + "rm -f $job_name", + "rm -f $runner_name", // downloading files instead of sending them // because it is not allowed to push files to login node // file transfer node uses different type of authentication // and both nodes are not working together within one session for some reason - "wget $base_path/slurm_job.sh", - "wget $base_path/slurm_job_runner.sh", - "chmod +x slurm_job_runner.sh slurm_job.sh", - "./slurm_job_runner.sh " + parametersToDeploy, + "wget $base_path/$job_name", + "wget $base_path/$runner_name", + "chmod 777 $job_name", + "chmod 777 $runner_name", + "./$runner_name " + parametersToDeploy, "squeue -u $lawrenciumUser" ]) } @@ -215,3 +215,9 @@ tasks.register("lawrenciumNodes") { executeLawrenciumSSH([commandToGetNodesInfo]) } } + +tasks.register("lawrenciumTest") { + doLast { + executeLawrenciumSSH(["singularity run hello-world_latest.sif"]) + } +} From c345fa877c1646b1639548fec0a17bf9eae7a24e Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 9 Aug 2023 17:25:36 +0300 Subject: [PATCH 25/32] less ram --- lawrencium/src/main/bash/debug_job_runner.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lawrencium/src/main/bash/debug_job_runner.sh b/lawrencium/src/main/bash/debug_job_runner.sh index 59f81d313c8..067c651328b 100644 --- a/lawrencium/src/main/bash/debug_job_runner.sh +++ b/lawrencium/src/main/bash/debug_job_runner.sh @@ -6,7 +6,7 @@ JOB_NAME="$RANDOM_PART.$DATETIME" PARTITION="es1" QOS="es_normal" -MEMORY_LIMIT="480" +MEMORY_LIMIT="80" ACCOUNT="pc_beamcore" JOB_LOG_FILE_PATH="/global/scratch/users/$USER/test_log_${DATETIME}_${RANDOM_PART}.log" EXPECTED_EXECUTION_DURATION="0-01:00:00" From 09afff5cd425be3aaa4fcde2255ae3cc7a436cf8 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Wed, 9 Aug 2023 17:25:50 +0300 Subject: [PATCH 26/32] not needed --- lawrencium/build.gradle | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index c85b53b56b1..d97e714e685 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -216,8 +216,3 @@ tasks.register("lawrenciumNodes") { } } -tasks.register("lawrenciumTest") { - doLast { - executeLawrenciumSSH(["singularity run hello-world_latest.sif"]) - } -} From afd45dd040860f1fff1097d7d4b8d632849ffe9e Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Fri, 18 Aug 2023 17:33:35 +0300 Subject: [PATCH 27/32] using lr6 by-default --- gradle.deploy.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle.deploy.properties b/gradle.deploy.properties index 31a597ee7ec..ab0eccf259d 100644 --- a/gradle.deploy.properties +++ b/gradle.deploy.properties @@ -12,7 +12,7 @@ cloudPlatform=Google instanceType=n2d-standard-4 # Lawrencium cluster specifics -lawrenciumPartition=es1 +lawrenciumPartition=lr6 lawrenciumMemoryLimit=480 lawrenciumQoS=es_normal lawrenciumAccount=pc_beamcore From 9ad9fc084bb385f54b6a4b971ee7e3cadb504e98 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Fri, 18 Aug 2023 17:33:49 +0300 Subject: [PATCH 28/32] function to get available resources --- lawrencium/build.gradle | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index d97e714e685..851ca8babb3 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -216,3 +216,11 @@ tasks.register("lawrenciumNodes") { } } +tasks.register("lawrenciumResourcesAvailable") { + doLast { + if (!project.hasProperty("queueUser")) { + ext.queueUser = lawrenciumUser + } + executeLawrenciumSSH(["sacctmgr show association -p user=$queueUser"]) + } +} From 54fe6b28af89b452a8b4445fa5e3cb5a91a3ddc3 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Sat, 19 Aug 2023 14:26:39 +0300 Subject: [PATCH 29/32] reverting production links changes --- production/newyork | 2 +- production/sfbay | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/production/newyork b/production/newyork index 64e42468aab..31c64acc53e 160000 --- a/production/newyork +++ b/production/newyork @@ -1 +1 @@ -Subproject commit 64e42468aabf21f5d16f14184b06234f89756e09 +Subproject commit 31c64acc53ee235c53d304d2c8d1131398399b2c diff --git a/production/sfbay b/production/sfbay index 5fcac035282..aa5f24cb14f 160000 --- a/production/sfbay +++ b/production/sfbay @@ -1 +1 @@ -Subproject commit 5fcac035282b941643a7e6e7c07cb503c52914ec +Subproject commit aa5f24cb14f80ff535c20a28d64d7cbf239df6a3 From b33915a18f1c9024b5c5e8f0334a9e5752928dea Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 21 Aug 2023 22:16:57 +0300 Subject: [PATCH 30/32] task to cancel a job + general improvements --- lawrencium/build.gradle | 81 ++++++++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 14 deletions(-) diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle index 851ca8babb3..35a0eb3d3e1 100644 --- a/lawrencium/build.gradle +++ b/lawrencium/build.gradle @@ -45,10 +45,46 @@ ext.getParameterOrEnvironmentValue = { String paramName -> } } +ext.getNodeMemoryRequirement = { String partitionName -> + switch (partitionName) { + case "lr_bigmem": + return "144" // CPU cores: 32 + case "lr3": + return "480" // CPU cores: 32 + case "es1": + return "480" // CPU cores: 64; GPU instance + case "lr7": + return "240" // CPU cores: 56 + case "cm1": + return "224" // CPU cores: 48 + case "cf1": + return "180" // CPU cores: 64 + } +} + +ext.getQOS = { String partitionName -> + switch (partitionName) { + case "lr_bigmem": + return "lr_normal" // CPU cores: 32 + case "lr3": + return "lr_normal" // CPU cores: 32 + case "es1": + return "es_normal" // CPU cores: 64; GPU instance + case "lr7": + return "lr_normal" // CPU cores: 56 + case "cm1": + return "cm1_normal" // CPU cores: 48 + case "cf1": + return "cf_normal" // CPU cores: 64 + } +} + +String lawrenciumUser = getParameterValue('lawrenciumUser') ?: 'user-not-specified' + remotes { lawrenciumLoginNode { host = 'lrc-login.lbl.gov' - user = project.findProperty('lawrenciumUser') ?: 'user-not-specified' + user = lawrenciumUser } } @@ -60,11 +96,13 @@ ext.executeLawrenciumSSH = { List commandsList -> } } + String fullPassword = getParameterValue("lawrenciumPassword") + getParameterValue("otp") + ssh.run { settings { authentications = ["keyboard-interactive"] knownHosts = allowAnyHosts - password = getParameterValue("lawrenciumPassword") + getParameterValue("otp") + password = fullPassword } session(remotes.lawrenciumLoginNode) { @@ -90,17 +128,24 @@ tasks.register("deployToLawrencium") { def configs = getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs') // partition, memory limit and QoS should be changed together - def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1' - def lawrenciumMemoryLimit = getParameterValue('lawrenciumMemoryLimit') ?: '480' - def lawrenciumQoS = getParameterValue('lawrenciumQoS') ?: 'es_normal' + def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'lr6' + def lawrenciumMemoryLimit = getNodeMemoryRequirement(lawrenciumPartition) + def lawrenciumQoS = getQOS(lawrenciumPartition) // using memory limit for lawrencium cluster is usually fine - def max_ram = getParameterValue('forcedMaxRAM') ?: lawrenciumMemoryLimit + def defaultMemoryLimit = { -> + if (lawrenciumPartition == 'lr_bigmem') { + return (lawrenciumMemoryLimit.toInteger() * 1024).toString() + } else { + return lawrenciumMemoryLimit + } + } + def max_ram = getParameterValue('forcedMaxRAM') ?: defaultMemoryLimit def profiler_type = getParameterValue('profiler_type') ?: 'cpumem' def sentNotification = getParameterValue('sentNotification') ?: true - def slackHookWithToken = getParameterValue('slackHookWithToken') ?: 'not-set' - def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: 'not-set' + def slackHookWithToken = getParameterValue('slackHookWithToken') ?: '' + def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: '' def google_api_key = getParameterOrEnvironmentValue('GOOGLE_API_KEY') ?: 'not-set' @@ -158,11 +203,8 @@ tasks.register("deployToLawrencium") { def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash" -// def job_name = "slurm_job.sh" -// def runner_name = "slurm_job_runner.sh" - - def job_name = "debug_job.sh" - def runner_name = "debug_job_runner.sh" + def job_name = "slurm_job.sh" + def runner_name = "slurm_job_runner.sh" executeLawrenciumSSH([ "rm -f $job_name", @@ -187,7 +229,7 @@ tasks.register("lawrenciumQueue") { def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time" if (!project.hasProperty("fromDate")) { Date today = new Date() - Integer daysPast = (project.findProperty('forDays') ?: '1').toInteger() + Integer daysPast = (getParameterValue('forDays') ?: '1').toInteger() Date yesterday = today - daysPast ext.fromDate = yesterday.format("yyyy-MM-dd") } @@ -224,3 +266,14 @@ tasks.register("lawrenciumResourcesAvailable") { executeLawrenciumSSH(["sacctmgr show association -p user=$queueUser"]) } } + +tasks.register("lawrenciumJobCancel") { + doLast { + def jobId = getParameterValue("jobId") + + def commandToCancelJob = "scancel $jobId" + def commandToGetQueueForUser = "squeue -u $lawrenciumUser" + + executeLawrenciumSSH([commandToCancelJob, commandToGetQueueForUser]) + } +} From 085a6c628baf4defad052a855c6890d6cf0bfd23 Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 21 Aug 2023 22:17:25 +0300 Subject: [PATCH 31/32] separation of values to print into regular and secret --- docker/beam-environment/entrypoint.sh | 112 +++++++++++++------------- 1 file changed, 54 insertions(+), 58 deletions(-) diff --git a/docker/beam-environment/entrypoint.sh b/docker/beam-environment/entrypoint.sh index 4b057ed4f68..c52268a7466 100644 --- a/docker/beam-environment/entrypoint.sh +++ b/docker/beam-environment/entrypoint.sh @@ -2,7 +2,7 @@ echo "Starting entrypoint script, at $(date "+%Y-%m-%d-%H:%M:%S")" -function print_error(){ +function print_error() { echo "" echo "ERROR!" echo "ERROR!" @@ -14,9 +14,9 @@ function print_error(){ ## ## print variables that might be set only outside of image ## -required_variables_from_outside=( BEAM_CONFIG MAX_RAM ) +required_variables_from_outside=(BEAM_CONFIG MAX_RAM) echo "Following variables are required for BEAM to work (variable name -> 'current value'):" -for v in "${required_variables_from_outside[@]}" ; do +for v in "${required_variables_from_outside[@]}"; do vval1="$v" vval="${!vval1}" echo "$v -> '$vval'" @@ -27,7 +27,6 @@ for v in "${required_variables_from_outside[@]}" ; do fi done - ## ## print optional variables that might be set only outside of image ## @@ -36,24 +35,38 @@ optional_variables_from_outside=( BEAM_BRANCH_NAME BEAM_DATA_BRANCH_NAME PULL_CODE PULL_DATA ENFORCE_HTTPS_FOR_DATA_REPOSITORY - S3_PUBLISH AWS_SECRET_ACCESS_KEY AWS_ACCESS_KEY_ID S3_REGION + S3_PUBLISH PROFILER SIMULATION_HOST_LOG_FILE GRADLE_CACHE_PATH - NOTIFICATION_SEND - SIMULATIONS_SPREADSHEET_UPDATE_URL SLACK_HOOK_WITH_TOKEN + SEND_NOTIFICATION NOTIFICATION_TITLED NOTIFICATION_SHUTDOWN_WAIT NOTIFICATION_INSTANCE_ID NOTIFICATION_INSTANCE_TYPE NOTIFICATION_HOST_NAME NOTIFICATION_WEB_BROWSER NOTIFICATION_INSTANCE_REGION NOTIFICATION_SIGOPT_CLIENT_ID NOTIFICATION_SIGOPT_DEV_ID ) echo "Following variables are optional (variable name -> 'current value'):" -for v in "${optional_variables_from_outside[@]}" ; do +for v in "${optional_variables_from_outside[@]}"; do vval1="$v" vval="${!vval1}" echo "$v -> '$vval'" done +optional_secrets_from_outside=( + AWS_SECRET_ACCESS_KEY + AWS_ACCESS_KEY_ID + SIMULATIONS_SPREADSHEET_UPDATE_URL + SLACK_HOOK_WITH_TOKEN +) +for v in "${optional_secrets_from_outside[@]}"; do + vval1="$v" + vval="${!vval1}" + if [[ -z "$vval" ]]; then + echo "Variable '$v' is EMPTY, expected to be containing a secret!" + else + echo "Variable '$v' contains a secret inside." + fi +done function send_slack_notification() { if [[ -z "$SLACK_HOOK_WITH_TOKEN" ]]; then @@ -81,14 +94,12 @@ function send_json_to_spreadsheet() { fi } - ## ## location to project folder ## path_to_project_parent="/app/sources" cd "$path_to_project_parent" || echo "ERROR: The path '$path_to_project_parent' is not available" - ## ## either pull the code or use the code from mounted folder ## @@ -106,7 +117,7 @@ if [ "$PULL_CODE" = true ]; then if [[ $BEAM_COMMIT_SHA ]]; then echo "Resetting the code to commit '$BEAM_COMMIT_SHA'" - git reset --hard "$BEAM_COMMIT_SHA"; + git reset --hard "$BEAM_COMMIT_SHA" else BEAM_COMMIT_SHA=$(git log -1 --pretty=format:%H) echo "Using the latest commit in the branch '$BEAM_BRANCH_NAME' - '$BEAM_COMMIT_SHA'" @@ -118,13 +129,11 @@ else echo "Using the branch '$BEAM_BRANCH_NAME', commit '$BEAM_COMMIT_SHA'" fi - ## ## logging CPU | RAM usage during simulation ## cpu_ram_log="/app/sources/cpu_ram_usage.csv" -/app/write_cpu_ram_usage.sh > "$cpu_ram_log" & - +/app/write_cpu_ram_usage.sh >"$cpu_ram_log" & ## ## Remember the BEAM working folder location. @@ -133,7 +142,6 @@ cpu_ram_log="/app/sources/cpu_ram_usage.csv" beam_path=$(pwd) echo "Working from '$beam_path'" - ## ## Required to avoid configuring ssh keys because by-default all our git data submodules configured to use ssh. ## @@ -142,7 +150,6 @@ if [ "$ENFORCE_HTTPS_FOR_DATA_REPOSITORY" = true ]; then git config --global url."https://github.com/LBNL-UCB-STI".insteadOf "git@github.com:LBNL-UCB-STI" fi - ## ## pulling data from github if enabled or checking if data location was mounted separately ## @@ -154,24 +161,22 @@ if [ "$PULL_DATA" = true ]; then echo "Pulling the data from github (PULL_DATA set to '$PULL_DATA')." production_data_submodules=$(git submodule | awk '{ print $2 }') - for i in $production_data_submodules; - do - if [[ $BEAM_CONFIG == $i* ]]; - then - echo "Loading remote production data for $i" - git config "submodule.$i.branch" "$BEAM_DATA_BRANCH_NAME" - git submodule update --init --remote "$i" - cd "$i" || echo "ERROR: The path '$i' is not available" - if [[ $BEAM_DATA_COMMIT_SHA ]]; then - echo "Checking out the data commit '$BEAM_DATA_COMMIT_SHA'" - git checkout "$BEAM_DATA_COMMIT_SHA" - else - BEAM_DATA_COMMIT_SHA=$(git log -1 --pretty=format:%H) - echo "Latest commit is '$BEAM_DATA_COMMIT_SHA'" - fi - cd - || echo "ERROR: Can't move to the previous location" - fi; - done + for i in $production_data_submodules; do + if [[ $BEAM_CONFIG == $i* ]]; then + echo "Loading remote production data for $i" + git config "submodule.$i.branch" "$BEAM_DATA_BRANCH_NAME" + git submodule update --init --remote "$i" + cd "$i" || echo "ERROR: The path '$i' is not available" + if [[ $BEAM_DATA_COMMIT_SHA ]]; then + echo "Checking out the data commit '$BEAM_DATA_COMMIT_SHA'" + git checkout "$BEAM_DATA_COMMIT_SHA" + else + BEAM_DATA_COMMIT_SHA=$(git log -1 --pretty=format:%H) + echo "Latest commit is '$BEAM_DATA_COMMIT_SHA'" + fi + cd - || echo "ERROR: Can't move to the previous location" + fi + done echo "Doing lfs pull" git lfs pull @@ -192,11 +197,10 @@ else fi fi - ## ## notification ## -if [ "$NOTIFICATION_SEND" = true ]; then +if [ "$SEND_NOTIFICATION" = true ]; then send_slack_notification "Run Started Run Name $NOTIFICATION_TITLED Instance ID $NOTIFICATION_INSTANCE_ID @@ -225,10 +229,9 @@ if [ "$NOTIFICATION_SEND" = true ]; then \"sigopt_client_id\":\"$NOTIFICATION_SIGOPT_CLIENT_ID\", \"sigopt_dev_id\":\"$NOTIFICATION_SIGOPT_DEV_ID\"" else - echo "Sending notifications is disabled (NOTIFICATION_SEND set to '$NOTIFICATION_SEND')." + echo "Sending notifications is disabled (SEND_NOTIFICATION set to '$SEND_NOTIFICATION')." fi - ## ## calculating a location for gradle cache ## @@ -241,32 +244,29 @@ else mkdir -p "$GRADLE_CACHE_PATH" fi - ## ## we shouldn't use the gradle daemon on NERSC, it seems that it's somehow shared within different nodes ## and all the subsequent runs have output dir somewhere else. ## ./gradlew --no-daemon --gradle-user-home="$GRADLE_CACHE_PATH" clean :run -PappArgs="['--config', '$BEAM_CONFIG']" -PmaxRAM="$MAX_RAM" -Pprofiler_type="$PROFILER" - ## ## Calculate the final status of simulation ## log_file="$(find "$beam_path/output" -maxdepth 2 -mindepth 2 -type d -print -quit)/beamLog.out" if [[ ! -f $log_file ]]; then - echo "Unable to locate the beamLog.out file" - final_status="Unable to start" + echo "Unable to locate the beamLog.out file" + final_status="Unable to start" else last_line=$(tail "$log_file" -n 1) if [[ $last_line == *"Exiting BEAM"* ]]; then - final_status="Run Completed" + final_status="Run Completed" else - final_status="Run Failed" + final_status="Run Failed" fi fi echo "The final status of simulation is '$final_status'" - ## ## calculating the health of simulation ## @@ -280,21 +280,20 @@ python3 src/main/python/general_analysis/simulation_health_analysis.py $simulati while IFS="," read -r metric count; do export "$metric=$count" health_metrics="$health_metrics, $metric:$count" -done < $simulation_health_analysis_output_file +done <$simulation_health_analysis_output_file health_metrics="$(echo "$health_metrics" | cut -c3-)" echo "Health metrics: '$health_metrics'" - ## ## Working with output of simulation ## sleep 10s FINAL_PATH="" for file in output/*; do - for path2 in "$file"/*; do - FINAL_PATH="$path2"; - done; -done; + for path2 in "$file"/*; do + FINAL_PATH="$path2" + done +done echo "Found output dir: $FINAL_PATH" echo "Moving debug files to output folder." @@ -303,7 +302,7 @@ for file in "$beam_path"/*.jfr; do echo "Zipping $file" zip "$file.zip" "$file" mv "$file.zip" "$FINAL_PATH" -done; +done echo "Moving health metrics." mv "$simulation_health_analysis_output_file" "$FINAL_PATH" @@ -324,12 +323,11 @@ fi echo "Fixing permission issues related to access to files created from inside of image." chmod 777 -R "$FINAL_PATH" - ## ## uploading output to s3 if enabled ## if [ "$S3_PUBLISH" = true ]; then - aws --region "$S3_REGION" s3 cp "$FINAL_PATH" s3://beam-outputs/"$FINAL_PATH" --recursive; + aws --region "$S3_REGION" s3 cp "$FINAL_PATH" s3://beam-outputs/"$FINAL_PATH" --recursive --no-progress --only-show-errors s3output_url="https://s3.$S3_REGION.amazonaws.com/beam-outputs/index.html#$FINAL_PATH" simulation_output_link="$s3output_url" echo "Uploaded to $s3output_url" @@ -337,11 +335,10 @@ else echo "S3 publishing is disabled (S3_PUBLISH set to '$S3_PUBLISH')." fi - ## ## notification ## -if [ "$NOTIFICATION_SEND" = true ]; then +if [ "$SEND_NOTIFICATION" = true ]; then send_slack_notification "Run Completed Run Name $NOTIFICATION_TITLED Instance ID $NOTIFICATION_INSTANCE_ID @@ -378,10 +375,9 @@ if [ "$NOTIFICATION_SEND" = true ]; then \"sigopt_client_id\":\"$NOTIFICATION_SIGOPT_CLIENT_ID\", \"sigopt_dev_id\":\"$NOTIFICATION_SIGOPT_DEV_ID\"" else - echo "Sending notifications is disabled (NOTIFICATION_SEND set to '$NOTIFICATION_SEND')." + echo "Sending notifications is disabled (SEND_NOTIFICATION set to '$SEND_NOTIFICATION')." fi - echo "" echo "Completed at $(date "+%Y-%m-%d-%H:%M:%S")" echo "" From 5bcafbd0ce868ee0365a6ce1d7cabc9f4fd0814e Mon Sep 17 00:00:00 2001 From: Nikolay Ilyin Date: Mon, 21 Aug 2023 22:18:41 +0300 Subject: [PATCH 32/32] better default values --- gradle.deploy.properties | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gradle.deploy.properties b/gradle.deploy.properties index ab0eccf259d..cbd398b2e0a 100644 --- a/gradle.deploy.properties +++ b/gradle.deploy.properties @@ -12,9 +12,8 @@ cloudPlatform=Google instanceType=n2d-standard-4 # Lawrencium cluster specifics -lawrenciumPartition=lr6 -lawrenciumMemoryLimit=480 -lawrenciumQoS=es_normal +# lr_bigmem lr3 es1 lr7 cm1 cf1 +lawrenciumPartition=lr7 lawrenciumAccount=pc_beamcore # the name of the user in cluster lawrenciumUser= @@ -22,7 +21,7 @@ lawrenciumUser= # the gradle command also should include the one time password: -Potp= lawrenciumPassword= # expected duration of simulation to run on Lawrencium, should be less than maximum of 3 days. -expectedDuration='0-00:10:00' +expectedDuration='0-10:00:00' # shutdownBehaviour = stop | terminate shutdownBehaviour=terminate