diff --git a/gradle.deploy.properties b/gradle.deploy.properties index 98c76984c3f..cbd398b2e0a 100644 --- a/gradle.deploy.properties +++ b/gradle.deploy.properties @@ -11,6 +11,18 @@ beamConfigs=test/input/beamville/beam.conf cloudPlatform=Google instanceType=n2d-standard-4 +# Lawrencium cluster specifics +# available partitions: lr_bigmem lr3 es1 lr7 cm1 cf1 +lawrenciumPartition=lr7 +lawrenciumAccount=pc_beamcore +# the name of the user on the cluster +lawrenciumUser= +# the password of the user on the cluster, +# the gradle command should also include the one-time password: -Potp= +lawrenciumPassword= +# expected duration of the simulation run on Lawrencium, should not exceed the maximum of 3 days. +expectedDuration='0-10:00:00' + # shutdownBehaviour = stop | terminate shutdownBehaviour=terminate @@ -20,6 +32,13 @@ s3Backup=true # for example: helics/run_pydss_federate.sh or helics/run_site_power_controller.sh cosimulationShellScript= +# parameters for configuring notifications, +# so far required only for the Lawrencium cluster. 
+# should be set for notifications to work
+sentNotification=true
+# slackHookWithToken=
+# simulationsSpreadsheetUrl=
+
 # Run Jupyter Notebook together with BEAM
 runJupyter=false
 
diff --git a/lawrencium/build.gradle b/lawrencium/build.gradle
new file mode 100644
index 00000000000..35a0eb3d3e1
--- /dev/null
+++ b/lawrencium/build.gradle
@@ -0,0 +1,295 @@
+import com.amazonaws.auth.*
+import com.amazonaws.auth.profile.ProfileCredentialsProvider
+
+group = 'beam'
+version = '0.8.0'
+
+buildscript {
+    repositories {
+        mavenLocal()
+        mavenCentral()
+        maven { url "https://plugins.gradle.org/m2/" }
+        gradlePluginPortal()
+    }
+    dependencies {
+        classpath 'org.hidetake:gradle-ssh-plugin:2.10.1'
+        classpath 'com.amazonaws:aws-java-sdk:1.11.83'
+    }
+}
+
+apply plugin: 'org.hidetake.ssh'
+
+// Deployment parameters are read from gradle.deploy.properties in the root project,
+// another file can be selected with -PpropsFile=<path>.
+def propsFileName = "${project.rootDir}/gradle.deploy.properties"
+if (project.hasProperty('propsFile')) {
+    propsFileName = project.findProperty('propsFile')
+}
+
+def propsFile = new Properties()
+// withInputStream closes the stream after loading, also when loading fails
+project.file(propsFileName).withInputStream { stream -> propsFile.load(stream) }
+
+// Returns the project property (-P<paramName>=<value>) if it is set,
+// otherwise the value from the properties file (null if the key is absent).
+ext.getParameterValue = { String paramName ->
+    if (project.hasProperty(paramName)) {
+        return project.findProperty(paramName)
+    } else {
+        return propsFile.getProperty(paramName)
+    }
+}
+
+// Like getParameterValue, but an environment variable with the same name
+// is checked before falling back to the properties file.
+ext.getParameterOrEnvironmentValue = { String paramName ->
+    if (project.hasProperty(paramName)) {
+        return project.findProperty(paramName)
+    } else if (System.getenv(paramName)) {
+        return System.getenv(paramName)
+    } else {
+        return propsFile.getProperty(paramName)
+    }
+}
+
+// Partition specific settings: memory limit and QoS depend on the partition,
+// so they are kept in one table and are changed together.
+def partitionSettings = [
+        lr_bigmem: [memory: '144', qos: 'lr_normal'],  // CPU cores: 32
+        lr3      : [memory: '480', qos: 'lr_normal'],  // CPU cores: 32
+        es1      : [memory: '480', qos: 'es_normal'],  // CPU cores: 64; GPU instance
+        lr7      : [memory: '240', qos: 'lr_normal'],  // CPU cores: 56
+        cm1      : [memory: '224', qos: 'cm1_normal'], // CPU cores: 48
+        cf1      : [memory: '180', qos: 'cf_normal'],  // CPU cores: 64
+]
+
+// Returns the settings of the partition. Fails for a partition missing in the table,
+// so that no null value ends up in the parameters of a submitted job.
+def findPartitionSettings = { String partitionName ->
+    def settings = partitionSettings[partitionName]
+    if (settings == null) {
+        throw new GradleException("Unknown Lawrencium partition '$partitionName', " +
+                "supported partitions: ${partitionSettings.keySet().join(', ')}")
+    }
+    return settings
+}
+
+ext.getNodeMemoryRequirement = { String partitionName ->
+    return findPartitionSettings(partitionName).memory
+}
+
+ext.getQOS = { String partitionName ->
+    return findPartitionSettings(partitionName).qos
+}
+
+String lawrenciumUser = getParameterValue('lawrenciumUser') ?: 'user-not-specified'
+
+remotes {
+    lawrenciumLoginNode {
+        host = 'lrc-login.lbl.gov'
+        user = lawrenciumUser
+    }
+}
+
+// Hides the values of secret options, so a command can be printed to the build output.
+def maskSecrets = { command ->
+    return command.toString().replaceAll(
+            "--(AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|SLACK_HOOK_WITH_TOKEN|GOOGLE_API_KEY)='[^']*'",
+            '--$1=\'***\'')
+}
+
+// Executes the commands one by one on the Lawrencium login node.
+// The password for the session is lawrenciumPassword followed by the one time password (otp).
+ext.executeLawrenciumSSH = { List commandsList ->
+    List requiredProps = ["lawrenciumUser", "lawrenciumPassword", "otp"]
+    List missingProps = requiredProps.findAll { !getParameterValue(it) }
+    if (missingProps) {
+        // failing early instead of trying to connect with incomplete credentials
+        throw new GradleException("Required properties are not set: ${missingProps.join(', ')}")
+    }
+
+    String fullPassword = getParameterValue("lawrenciumPassword") + getParameterValue("otp")
+
+    ssh.run {
+        settings {
+            authentications = ["keyboard-interactive"]
+            // NOTE(review): allowAnyHosts disables the host key check of the login node
+            knownHosts = allowAnyHosts
+            password = fullPassword
+        }
+
+        session(remotes.lawrenciumLoginNode) {
+            for (command in commandsList) {
+                println maskSecrets(command)
+                execute command
+            }
+        }
+    }
+}
+
+
+tasks.register("deployToLawrencium") {
+    doLast {
+        def runName = getParameterValue('runName')
+
+        def branch = getParameterValue('beamBranch') ?: getCurrentGitBranch()
+        def dataBranch = getParameterValue('dataBranch') ?: 'develop'
+
+        def commit = getParameterValue('beamCommit') ?: 'HEAD'
+        def dataCommit = getParameterValue('dataCommit') ?: 'HEAD'
+
+        def configs = getParameterValue('beamConfigs') ?: getParameterValue(getCurrentGitBranch() + '.configs')
+
+        // partition, memory limit and QoS should be changed together;
+        // the default has to be a partition known to getNodeMemoryRequirement and getQOS
+        def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'lr7'
+        def lawrenciumMemoryLimit = getNodeMemoryRequirement(lawrenciumPartition)
+        def lawrenciumQoS = getQOS(lawrenciumPartition)
+
+        // using memory limit for lawrencium cluster is usually fine
+        def defaultMemoryLimit = lawrenciumPartition == 'lr_bigmem' ?
+                (lawrenciumMemoryLimit.toInteger() * 1024).toString() :
+                lawrenciumMemoryLimit
+        def max_ram = getParameterValue('forcedMaxRAM') ?: defaultMemoryLimit
+        def profiler_type = getParameterValue('profiler_type') ?: 'cpumem'
+
+        def sentNotification = getParameterValue('sentNotification') ?: true
+        def slackHookWithToken = getParameterValue('slackHookWithToken') ?: ''
+        def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: ''
+
+        def google_api_key = getParameterOrEnvironmentValue('GOOGLE_API_KEY') ?: 'not-set'
+
+        def s3_publish = getParameterValue('s3Backup') ?: true
+        def region = getParameterValue('region') ?: defaultRegion
+
+        // reading AWS credentials from various credentials providers
+        AWSCredentialsProvider creds = new AWSCredentialsProviderChain(
+                new EnvironmentVariableCredentialsProvider(),
+                new SystemPropertiesCredentialsProvider(),
+                new ProfileCredentialsProvider(),
+                new EC2ContainerCredentialsProviderWrapper()
+        )
+
+        // the credentials are resolved once, so both values come from the same provider
+        AWSCredentials awsCredentials = creds.getCredentials()
+        String aws_access_key_id = awsCredentials.AWSAccessKeyId
+        String aws_secret_access_key = awsCredentials.AWSSecretKey
+
+        // by-default maximum possible value is used
+        def expectedDuration = getParameterValue('expectedDuration') ?: '3-00:00:00'
+
+        // in case the simulation should be run under different lawrencium account
+        def lawrenciumAccount = getParameterValue('lawrenciumAccount') ?: 'pc_beamcore'
+
+        // which docker image to run
+        def dockerImageTag = getParameterValue('dockerImageTag') ?: 'latest'
+        def dockerImageName = getParameterValue('dockerImageName') ?: 'beam-environment'
+        def dockerImageNameSpace = getParameterValue('dockerImageNameSpace') ?: 'beammodel'
+
+        def parametersToDeploy = "--RUN_NAME='$runName' " +
+                "--BEAM_BRANCH_NAME='$branch' " +
+                "--BEAM_COMMIT_SHA='$commit' " +
+                "--BEAM_DATA_BRANCH_NAME='$dataBranch' " +
+                "--BEAM_DATA_COMMIT_SHA='$dataCommit' " +
+                "--BEAM_CONFIG='$configs' " +
+                "--MAX_RAM='$max_ram' " +
+                "--PROFILER='$profiler_type' " +
+                "--ACCOUNT='$lawrenciumAccount' " +
+                "--PARTITION='$lawrenciumPartition' " +
+                "--QOS='$lawrenciumQoS' " +
+                "--MEMORY_LIMIT='$lawrenciumMemoryLimit' " +
+                "--EXPECTED_EXECUTION_DURATION='$expectedDuration' " +
+                "--S3_REGION='$region' " +
+                "--S3_PUBLISH='$s3_publish' " +
+                "--AWS_SECRET_ACCESS_KEY='$aws_secret_access_key' " +
+                "--AWS_ACCESS_KEY_ID='$aws_access_key_id' " +
+                "--SEND_NOTIFICATION='$sentNotification' " +
+                "--SLACK_HOOK_WITH_TOKEN='$slackHookWithToken' " +
+                "--SIMULATIONS_SPREADSHEET_UPDATE_URL='$simulationsSpreadsheetUrl' " +
+                "--GOOGLE_API_KEY='$google_api_key' " +
+                "--PULL_CODE='true' " +
+                "--PULL_DATA='true' " +
+                "--DOCKER_IMAGE_NAMESPACE='$dockerImageNameSpace' " +
+                "--DOCKER_IMAGE_NAME='$dockerImageName' " +
+                "--DOCKER_IMAGE_TAG='$dockerImageTag' "
+
+        // the scripts are downloaded from this branch of the BEAM repository,
+        // it can be changed with -PlawrenciumScriptsBranch=<branch>
+        def scriptsBranch = getParameterValue('lawrenciumScriptsBranch') ?: 'inm/lawrencium-automatization'
+        def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/$scriptsBranch/lawrencium/src/main/bash"
+
+        def job_name = "slurm_job.sh"
+        def runner_name = "slurm_job_runner.sh"
+
+        executeLawrenciumSSH([
+                "rm -f $job_name",
+                "rm -f $runner_name",
+                // downloading files instead of sending them
+                // because it is not allowed to push files to login node
+                // file transfer node uses different type of authentication
+                // and both nodes are not working together within one session for some reason
+                "wget $base_path/$job_name",
+                "wget $base_path/$runner_name",
+                // the scripts only have to be executable, they should not be writable for everyone
+                "chmod 755 $job_name",
+                "chmod 755 $runner_name",
+                "./$runner_name " + parametersToDeploy,
+                "squeue -u $lawrenciumUser"
+        ])
+    }
+}
+
+
+tasks.register("lawrenciumQueue") {
+    doLast {
+        def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time"
+
+        // -PfromDate=<yyyy-MM-dd> wins, otherwise the date is 'forDays' days ago (one day by default)
+        def fromDate = project.findProperty("fromDate")
+        if (!project.hasProperty("fromDate")) {
+            Integer daysPast = (getParameterValue('forDays') ?: '1').toInteger()
+            fromDate = java.time.LocalDate.now().minusDays(daysPast).toString()
+        }
+
+        def queueUser = project.hasProperty("queueUser") ? project.findProperty("queueUser") : lawrenciumUser
+
+        def commandToGetJobsListForUser = "sacct -u $queueUser --format=$sacctFormat -S $fromDate"
+        def commandToGetQueueForUser = "squeue -u $queueUser"
+
+        executeLawrenciumSSH([
+                commandToGetJobsListForUser,
+                commandToGetQueueForUser
+        ])
+    }
+}
+
+
+tasks.register("lawrenciumNodes") {
+    doLast {
+        def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'es1'
+        def commandToGetNodesInfo = "sinfo -N --long --partition=$lawrenciumPartition"
+
+        executeLawrenciumSSH([commandToGetNodesInfo])
+    }
+}
+
+tasks.register("lawrenciumResourcesAvailable") {
+    doLast {
+        def queueUser = project.hasProperty("queueUser") ? project.findProperty("queueUser") : lawrenciumUser
+        executeLawrenciumSSH(["sacctmgr show association -p user=$queueUser"])
+    }
+}
+
+tasks.register("lawrenciumJobCancel") {
+    doLast {
+        def jobId = getParameterValue("jobId")
+        if (!jobId) {
+            throw new GradleException("Property 'jobId' is required and should be set: -PjobId=<id>")
+        }
+
+        def commandToCancelJob = "scancel $jobId"
+        def commandToGetQueueForUser = "squeue -u $lawrenciumUser"
+
+        executeLawrenciumSSH([commandToCancelJob, commandToGetQueueForUser])
+    }
+}
diff --git a/nersc/build.gradle b/nersc/build.gradle
index d59596862e4..1947164c358 100644
--- a/nersc/build.gradle
+++ b/nersc/build.gradle
@@ -30,34 +30,34 @@ remotes {
 import org.apache.tools.ant.filters.ReplaceTokens
 
 task deployToNersc {
-    def propsFileName = "${project.rootDir}/gradle.deploy.properties"
-    if (project.hasProperty('propsFile')) {
-        propsFileName = project.findProperty('propsFile')
-    }
+  def propsFileName = "${project.rootDir}/gradle.deploy.properties"
+  if (project.hasProperty('propsFile')) {
+    propsFileName = project.findProperty('propsFile')
+  }
 
-    def propsFile = new Properties()
-    propsFile.load(project.file(propsFileName).newDataInputStream())
+  def propsFile = new Properties()
+  propsFile.load(project.file(propsFileName).newDataInputStream())
 
-    ext.getParameterValue = { paramName ->
-        if (project.hasProperty(paramName)) {
-            return project.findProperty(paramName)
-        } else {
-            return propsFile.getProperty(paramName)
-        }
+  ext.getParameterValue = { paramName ->
+    if (project.hasProperty(paramName)) {
+      return project.findProperty(paramName)
+    } else {
+      return propsFile.getProperty(paramName)
     }
+  }
 
-    def runName = "${ext.getParameterValue('runName')}"
-    def git_user_email = "${getCurrentGitUserEmail()}"
-    def branch = "${ext.getParameterValue('beamBranch') ?: getCurrentGitBranch()}"
-    def dataBranch = "${ext.getParameterValue('dataBranch') ?: 'develop'}"
-    def commit = "${ext.getParameterValue('beamCommit') ?: 'HEAD'}"
-    def configs = "${ext.getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs')}"
-    def max_ram = '100g'
-    def google_api_key = "${System.getenv('GOOGLE_API_KEY') ?: 'not-set-in-env'}"
-    def s3_publish = "${ext.getParameterValue('s3Backup') ?: true}"
-    def aws_access_key_id = "${System.getenv('AWS_ACCESS_KEY_ID') ?: 'not-set-in-env'}"
-    def aws_secret_access_key = "${System.getenv('AWS_SECRET_ACCESS_KEY') ?: 'not-set-in-env'}"
-    def region = "${ext.getParameterValue('region') ?: defaultRegion}"
+  def runName = "${ext.getParameterValue('runName')}"
+  def git_user_email = "${getCurrentGitUserEmail()}"
+  def branch = "${ext.getParameterValue('beamBranch') ?: getCurrentGitBranch()}"
+  def dataBranch = "${ext.getParameterValue('dataBranch') ?: 'develop'}"
+  def commit = "${ext.getParameterValue('beamCommit') ?: 'HEAD'}"
+  def configs = "${ext.getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs')}"
+  def max_ram = '100g'
+  def google_api_key = "${System.getenv('GOOGLE_API_KEY') ?: 'not-set-in-env'}"
+  def s3_publish = "${ext.getParameterValue('s3Backup') ?: true}"
+  def aws_access_key_id = "${System.getenv('AWS_ACCESS_KEY_ID') ?: 'not-set-in-env'}"
+  def aws_secret_access_key = "${System.getenv('AWS_SECRET_ACCESS_KEY') ?: 'not-set-in-env'}"
+  def region = "${ext.getParameterValue('region') ?: defaultRegion}"
 
   doFirst {
     copy {
diff --git a/settings.gradle b/settings.gradle
index 718c770c3a8..329f255c9aa 100755
--- a/settings.gradle
+++ b/settings.gradle
@@ -2,6 +2,7 @@ include 'beam-gui'
 include 'aws'
 include 'gcp'
 include 'nersc'
+include 'lawrencium'
 include 'metrics2.0'
 include 'jupyter'
 