Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lawrencium: running BEAM on cluster with gradle command #3759

Draft
wants to merge 38 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
16ad593
split script into two pieces for simplicity
nikolayilyin Jul 5, 2023
15ead6d
initial functionality
nikolayilyin Jul 10, 2023
5b0ddac
bugfix
nikolayilyin Jul 10, 2023
dcf5ebd
fix for optional params
nikolayilyin Jul 10, 2023
fa19c28
optional param
nikolayilyin Jul 10, 2023
ca0d139
splitting full docker image name into pieces
nikolayilyin Jul 12, 2023
8b5c585
correct urls
nikolayilyin Jul 12, 2023
db6f24b
Merge remote-tracking branch 'origin/develop' into inm/lawrencium-aut…
nikolayilyin Jul 17, 2023
f111f21
gradle functions
nikolayilyin Jul 17, 2023
85b3c2b
changes in hash code of folders\submodules
nikolayilyin Jul 17, 2023
b7fd0e8
formatting
nikolayilyin Jul 19, 2023
a32759b
lawrencium parameters
nikolayilyin Jul 20, 2023
a65f746
gradle functions to run on cluster
nikolayilyin Jul 20, 2023
528381b
removing extra logging
nikolayilyin Jul 26, 2023
08f4e8b
refactored
nikolayilyin Jul 26, 2023
9b5b9c3
documentation of parameteres to deploy to lawrencium
nikolayilyin Jul 26, 2023
1138521
documentation of parameters for lawrencium gradle commands
nikolayilyin Jul 26, 2023
4a9097d
using additional parameter to specify user
nikolayilyin Jul 26, 2023
2756083
debugging
nikolayilyin Jul 26, 2023
bf08d3f
debugging
nikolayilyin Jul 28, 2023
95e6f3e
debugging
nikolayilyin Jul 28, 2023
7377bbe
debugging
nikolayilyin Jul 28, 2023
ca6b635
bugfix
nikolayilyin Jul 30, 2023
735ceeb
original scripts
nikolayilyin Jul 30, 2023
e3c5f12
debugging lawrencuim
nikolayilyin Aug 2, 2023
bc8b6d0
Merge branch 'inm/jupyter-notebooks-add-folders' into inm/lawrencium-…
nikolayilyin Aug 8, 2023
c819565
Merge branch 'inm/jupyter-notebooks-add-folders' into inm/lawrencium-…
nikolayilyin Aug 8, 2023
c345fa8
less ram
nikolayilyin Aug 9, 2023
09afff5
not needed
nikolayilyin Aug 9, 2023
4e0b761
Merge remote-tracking branch 'origin/develop' into inm/lawrencium-aut…
nikolayilyin Aug 10, 2023
afd45dd
using lr6 by-default
nikolayilyin Aug 18, 2023
9ad9fc0
function to get available resources
nikolayilyin Aug 18, 2023
54fe6b2
reverting production links changes
nikolayilyin Aug 19, 2023
2ccd3d8
Merge remote-tracking branch 'origin/develop' into inm/lawrencium-aut…
nikolayilyin Aug 19, 2023
b33915a
task to cancel a job + general improvements
nikolayilyin Aug 21, 2023
085a6c6
separation of values to print into regular and secret
nikolayilyin Aug 21, 2023
5bcafbd
better default values
nikolayilyin Aug 21, 2023
0513689
Merge branch 'develop' into inm/lawrencium-automatization
nikolayilyin Aug 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions gradle.deploy.properties
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,18 @@ beamConfigs=test/input/beamville/beam.conf
cloudPlatform=Google
instanceType=n2d-standard-4

# Lawrencium cluster specifics
# lr_bigmem lr3 es1 lr7 cm1 cf1
lawrenciumPartition=lr7
lawrenciumAccount=pc_beamcore
# the name of the user on the cluster
lawrenciumUser=
# the password of the user on the cluster;
# the gradle command must also include the one-time password: -Potp=<One Time Password>
lawrenciumPassword=
# expected duration of the simulation run on Lawrencium (format: D-HH:MM:SS); should be less than the maximum of 3 days.
expectedDuration='0-10:00:00'

# shutdownBehaviour = stop | terminate
shutdownBehaviour=terminate

Expand All @@ -20,6 +32,13 @@ s3Backup=true
# for example: helics/run_pydss_federate.sh or helics/run_site_power_controller.sh
cosimulationShellScript=

# Parameters for configuring notifications;
# so far they are required only for the Lawrencium cluster.
# They should be set for notifications to work.
sentNotification=true
# slackHookWithToken=
# simulationsSpreadsheetUrl=

# Run Jupyter Notebook together with BEAM
runJupyter=false

Expand Down
279 changes: 279 additions & 0 deletions lawrencium/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
import com.amazonaws.auth.*
import com.amazonaws.auth.profile.ProfileCredentialsProvider

// Coordinates (group and version) of this subproject.
group = 'beam'
version = '0.8.0'

// Classpath of the build script itself: the libraries listed here are available
// to the code in this file, not to the project's own sources.
buildscript {
repositories {
mavenLocal()
mavenCentral()
maven { url "https://plugins.gradle.org/m2/" }
gradlePluginPortal()
}
dependencies {
// SSH plugin; applied below under the id 'org.hidetake.ssh'.
classpath 'org.hidetake:gradle-ssh-plugin:2.10.1'
// AWS SDK; presumably the source of the com.amazonaws.auth.* classes imported at the top of this script.
classpath 'com.amazonaws:aws-java-sdk:1.11.83'
}
}

// NOTE(review): the `remotes { ... }` and `ssh.run { ... }` blocks used in this script
// presumably come from this plugin - confirm against the plugin documentation.
apply plugin: 'org.hidetake.ssh'

// Deployment parameters are read from gradle.deploy.properties in the root project directory,
// unless another file is selected on the command line with -PpropsFile=<path>.
def propsFileName = "${project.rootDir}/gradle.deploy.properties"
if (project.hasProperty('propsFile')) {
    propsFileName = project.findProperty('propsFile')
}

// Parsed content of the properties file; used as the fallback source by the
// parameter lookup closures defined below.
def propsFile = new Properties()
// withInputStream closes the stream after loading (also when loading fails),
// so the file handle is not leaked for the lifetime of the Gradle daemon.
project.file(propsFileName).withInputStream { stream ->
    propsFile.load(stream)
}

/**
 * Resolves a deployment parameter by name.
 *
 * A Gradle project property with that name wins; when the project has no such
 * property the value is taken from the loaded properties file (null if absent there too).
 */
ext.getParameterValue = { String paramName ->
    return project.hasProperty(paramName)
            ? project.findProperty(paramName)
            : propsFile.getProperty(paramName)
}

/**
 * Resolves a deployment parameter by name, also consulting the process environment.
 *
 * Lookup order: Gradle project property, then a non-empty environment variable
 * with the same name, then the loaded properties file (null if absent everywhere).
 */
ext.getParameterOrEnvironmentValue = { String paramName ->
    if (project.hasProperty(paramName)) {
        return project.findProperty(paramName)
    }

    String environmentValue = System.getenv(paramName)
    if (environmentValue) {
        return environmentValue
    }

    return propsFile.getProperty(paramName)
}

/**
 * Looks up the memory figure configured for a Lawrencium partition.
 *
 * Returns the figure as a String (unit presumably GB - TODO confirm), or null
 * when the partition is not listed in the table below.
 * NOTE(review): 'lr6', the default partition used by deployToLawrencium, is not
 * listed here, so the lookup yields null for it.
 */
ext.getNodeMemoryRequirement = { String partitionName ->
    Map<String, String> memoryByPartition = [
            "lr_bigmem": "144", // CPU cores: 32
            "lr3"      : "480", // CPU cores: 32
            "es1"      : "480", // CPU cores: 64; GPU instance
            "lr7"      : "240", // CPU cores: 56
            "cm1"      : "224", // CPU cores: 48
            "cf1"      : "180"  // CPU cores: 64
    ]
    return memoryByPartition[partitionName]
}

/**
 * Looks up the QoS name configured for a Lawrencium partition.
 *
 * Returns null when the partition is not listed in the table below.
 */
ext.getQOS = { String partitionName ->
    Map<String, String> qosByPartition = [
            "lr_bigmem": "lr_normal",  // CPU cores: 32
            "lr3"      : "lr_normal",  // CPU cores: 32
            "es1"      : "es_normal",  // CPU cores: 64; GPU instance
            "lr7"      : "lr_normal",  // CPU cores: 56
            "cm1"      : "cm1_normal", // CPU cores: 48
            "cf1"      : "cf_normal"   // CPU cores: 64
    ]
    return qosByPartition[partitionName]
}

// Cluster user name; falls back to a placeholder so that the remote below can
// always be configured. executeLawrenciumSSH separately checks that the
// 'lawrenciumUser' parameter is really set.
String lawrenciumUser = getParameterValue('lawrenciumUser') ?: 'user-not-specified'

// Remote host definition used for every SSH session opened by this script.
remotes {
lawrenciumLoginNode {
host = 'lrc-login.lbl.gov'
user = lawrenciumUser
}
}

/**
 * Runs the given shell commands, in order, within one SSH session on the
 * Lawrencium login node.
 *
 * Requires the parameters 'lawrenciumUser', 'lawrenciumPassword' and 'otp'.
 * The session password is the configured password immediately followed by the
 * one-time password.
 *
 * @param commandsList commands to execute on the login node
 * @throws GradleException when any of the required parameters is not set
 */
ext.executeLawrenciumSSH = { List<String> commandsList ->
    List<String> requiredProps = ["lawrenciumUser", "lawrenciumPassword", "otp"]
    List<String> missingProps = requiredProps.findAll { String prop -> !getParameterValue(prop) }
    if (missingProps) {
        missingProps.each { String prop ->
            println "Property '$prop' is required and should be set!"
        }
        // Fail before connecting: continuing would build the password from null
        // values (the literal text "null") and attempt a login with it.
        throw new GradleException("Required properties are not set: ${missingProps.join(', ')}")
    }

    String fullPassword = getParameterValue("lawrenciumPassword") + getParameterValue("otp")

    ssh.run {
        settings {
            authentications = ["keyboard-interactive"]
            // NOTE(review): host key verification is disabled here.
            knownHosts = allowAnyHosts
            password = fullPassword
        }

        session(remotes.lawrenciumLoginNode) {
            for (command in commandsList) {
                // NOTE(review): the command is echoed verbatim; the deploy command
                // carries AWS keys and the Slack hook, so they end up in the build output.
                println command
                execute command
            }
        }
    }
}


/**
 * Submits a BEAM run to the Lawrencium cluster.
 *
 * The task collects run parameters (project properties, the properties file,
 * and for GOOGLE_API_KEY also the environment), reads AWS credentials from the
 * AWS credentials provider chain, then over SSH downloads the job scripts on
 * the login node, starts the runner script with all parameters and finally
 * prints the user's queue.
 *
 * @throws GradleException when the selected partition has no memory limit or
 *         QoS configured, or when the SSH credentials are missing
 */
tasks.register("deployToLawrencium") {
    doLast {
        def runName = getParameterValue('runName')

        def branch = getParameterValue('beamBranch') ?: getCurrentGitBranch()
        def dataBranch = getParameterValue('dataBranch') ?: 'develop'

        def commit = getParameterValue('beamCommit') ?: 'HEAD'
        def dataCommit = getParameterValue('dataCommit') ?: 'HEAD'

        // Unqualified call, like every other lookup in this task: inside doLast
        // `ext` may resolve to the task's own extra properties, which do not
        // define getParameterValue.
        def configs = getParameterValue('beamConfigs') ?: getParameterValue(getCurrentGitBranch() + '.configs')

        // partition, memory limit and QoS should be changed together
        def lawrenciumPartition = getParameterValue('lawrenciumPartition') ?: 'lr6'
        def lawrenciumMemoryLimit = getNodeMemoryRequirement(lawrenciumPartition)
        def lawrenciumQoS = getQOS(lawrenciumPartition)

        // Both lookups return null for a partition they do not list; stop here
        // instead of handing the literal text 'null' to the runner script.
        if (lawrenciumMemoryLimit == null || lawrenciumQoS == null) {
            throw new GradleException(
                    "Partition '$lawrenciumPartition' has no memory limit or QoS configured; " +
                            "set 'lawrenciumPartition' to a partition known to getNodeMemoryRequirement and getQOS.")
        }

        // using memory limit for lawrencium cluster is usually fine
        // NOTE(review): for lr_bigmem the figure is multiplied by 1024 - reason not visible here, confirm.
        def defaultMemoryLimit = lawrenciumPartition == 'lr_bigmem'
                ? (lawrenciumMemoryLimit.toInteger() * 1024).toString()
                : lawrenciumMemoryLimit
        def max_ram = getParameterValue('forcedMaxRAM') ?: defaultMemoryLimit
        def profiler_type = getParameterValue('profiler_type') ?: 'cpumem'

        def sentNotification = getParameterValue('sentNotification') ?: true
        def slackHookWithToken = getParameterValue('slackHookWithToken') ?: ''
        def simulationsSpreadsheetUrl = getParameterValue('simulationsSpreadsheetUrl') ?: ''

        def google_api_key = getParameterOrEnvironmentValue('GOOGLE_API_KEY') ?: 'not-set'

        def s3_publish = getParameterValue('s3Backup') ?: true
        def region = getParameterValue('region') ?: defaultRegion

        // reading AWS credentials from various credentials providers
        AWSCredentialsProvider creds = new AWSCredentialsProviderChain(
                new EnvironmentVariableCredentialsProvider(),
                new SystemPropertiesCredentialsProvider(),
                new ProfileCredentialsProvider(),
                new EC2ContainerCredentialsProviderWrapper()
        )

        String aws_access_key_id = creds.getCredentials().AWSAccessKeyId
        String aws_secret_access_key = creds.getCredentials().AWSSecretKey

        // by-default maximum possible value is used
        def expectedDuration = getParameterValue('expectedDuration') ?: '3-00:00:00'

        // in case the simulation should be run under different lawrencium account
        def lawrenciumAccount = getParameterValue('lawrenciumAccount') ?: 'pc_beamcore'

        // which docker image to run
        def dockerImageTag = getParameterValue('dockerImageTag') ?: 'latest'
        def dockerImageName = getParameterValue('dockerImageName') ?: 'beam-environment'
        def dockerImageNameSpace = getParameterValue('dockerImageNameSpace') ?: 'beammodel'

        def parametersToDeploy = "--RUN_NAME='$runName' " +
                "--BEAM_BRANCH_NAME='$branch' " +
                "--BEAM_COMMIT_SHA='$commit' " +
                "--BEAM_DATA_BRANCH_NAME='$dataBranch' " +
                "--BEAM_DATA_COMMIT_SHA='$dataCommit' " +
                "--BEAM_CONFIG='$configs' " +
                "--MAX_RAM='$max_ram' " +
                "--PROFILER='$profiler_type' " +
                "--ACCOUNT='$lawrenciumAccount' " +
                "--PARTITION='$lawrenciumPartition' " +
                "--QOS='$lawrenciumQoS' " +
                "--MEMORY_LIMIT='$lawrenciumMemoryLimit' " +
                "--EXPECTED_EXECUTION_DURATION='$expectedDuration' " +
                "--S3_REGION='$region' " +
                "--S3_PUBLISH='$s3_publish' " +
                "--AWS_SECRET_ACCESS_KEY='$aws_secret_access_key' " +
                "--AWS_ACCESS_KEY_ID='$aws_access_key_id' " +
                "--SEND_NOTIFICATION='$sentNotification' " +
                "--SLACK_HOOK_WITH_TOKEN='$slackHookWithToken' " +
                "--SIMULATIONS_SPREADSHEET_UPDATE_URL='$simulationsSpreadsheetUrl' " +
                "--GOOGLE_API_KEY='$google_api_key' " +
                "--PULL_CODE='true' " +
                "--PULL_DATA='true' " +
                "--DOCKER_IMAGE_NAMESPACE='$dockerImageNameSpace' " +
                "--DOCKER_IMAGE_NAME='$dockerImageName' " +
                "--DOCKER_IMAGE_TAG='$dockerImageTag' "

        // NOTE(review): the scripts are always fetched from this fixed branch,
        // independently of the BEAM branch that is being deployed.
        def base_path = "https://raw.githubusercontent.com/LBNL-UCB-STI/beam/inm/lawrencium-automatization/lawrencium/src/main/bash"

        def job_name = "slurm_job.sh"
        def runner_name = "slurm_job_runner.sh"

        executeLawrenciumSSH([
                "rm -f $job_name",
                "rm -f $runner_name",
                // downloading files instead of sending them
                // because it is not allowed to push files to login node
                // file transfer node uses different type of authentication
                // and both nodes are not working together within one session for some reason
                "wget $base_path/$job_name",
                "wget $base_path/$runner_name",
                "chmod 777 $job_name",
                "chmod 777 $runner_name",
                "./$runner_name " + parametersToDeploy,
                "squeue -u $lawrenciumUser"
        ])
    }
}


/**
 * Prints accounting records (sacct) and the current queue (squeue) of a cluster user.
 *
 * Optional project properties:
 *   fromDate  - start date passed to `sacct -S`; when absent it is computed as
 *               today minus 'forDays' days (default 1), formatted yyyy-MM-dd
 *   queueUser - user to query; defaults to the configured lawrenciumUser
 */
tasks.register("lawrenciumQueue") {
    doLast {
        def sacctFormat = "JobID,JobName%30,state,start,end,elapsed,nnodes,ncpus,nodelist,user,partition,maxrss,maxvmsize,time"

        // Plain locals instead of task-level `ext` entries: the values are only
        // needed to build the two commands below.
        def fromDate
        if (project.hasProperty("fromDate")) {
            fromDate = project.findProperty("fromDate")
        } else {
            Integer daysPast = (getParameterValue('forDays') ?: '1').toInteger()
            Date startDate = new Date() - daysPast
            fromDate = startDate.format("yyyy-MM-dd")
        }

        def queueUser = project.hasProperty("queueUser") ? project.findProperty("queueUser") : lawrenciumUser

        def commandToGetJobsListForUser = "sacct -u $queueUser --format=$sacctFormat -S $fromDate"
        def commandToGetQueueForUser = "squeue -u $queueUser"

        executeLawrenciumSSH([
                commandToGetJobsListForUser,
                commandToGetQueueForUser
        ])
    }
}


/**
 * Prints the per-node `sinfo` listing for one partition.
 * The partition comes from the 'lawrenciumPartition' parameter, 'es1' when it is not set.
 */
tasks.register("lawrenciumNodes") {
    doLast {
        def partitionToInspect = getParameterValue('lawrenciumPartition') ?: 'es1'
        executeLawrenciumSSH(["sinfo -N --long --partition=$partitionToInspect"])
    }
}

/**
 * Prints the `sacctmgr show association` output for a cluster user.
 * The user is the 'queueUser' project property when given, otherwise the
 * configured lawrenciumUser.
 */
tasks.register("lawrenciumResourcesAvailable") {
    doLast {
        // Local variable instead of a task-level `ext` entry: the value is only
        // needed to build the command below.
        def queueUser = project.hasProperty("queueUser") ? project.findProperty("queueUser") : lawrenciumUser
        executeLawrenciumSSH(["sacctmgr show association -p user=$queueUser"])
    }
}

/**
 * Cancels a job on the cluster (scancel) and then prints the queue of the
 * configured lawrenciumUser.
 *
 * Requires the 'jobId' parameter.
 *
 * @throws GradleException when 'jobId' is not set
 */
tasks.register("lawrenciumJobCancel") {
    doLast {
        def jobId = getParameterValue("jobId")
        if (!jobId) {
            // Without this check the command sent to the cluster would be "scancel null".
            throw new GradleException("Property 'jobId' is required and should be set!")
        }

        def commandToCancelJob = "scancel $jobId"
        def commandToGetQueueForUser = "squeue -u $lawrenciumUser"

        executeLawrenciumSSH([commandToCancelJob, commandToGetQueueForUser])
    }
}
48 changes: 24 additions & 24 deletions nersc/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -30,34 +30,34 @@ remotes {
import org.apache.tools.ant.filters.ReplaceTokens

task deployToNersc {
def propsFileName = "${project.rootDir}/gradle.deploy.properties"
if (project.hasProperty('propsFile')) {
propsFileName = project.findProperty('propsFile')
}
def propsFileName = "${project.rootDir}/gradle.deploy.properties"
if (project.hasProperty('propsFile')) {
propsFileName = project.findProperty('propsFile')
}

def propsFile = new Properties()
propsFile.load(project.file(propsFileName).newDataInputStream())
def propsFile = new Properties()
propsFile.load(project.file(propsFileName).newDataInputStream())

ext.getParameterValue = { paramName ->
if (project.hasProperty(paramName)) {
return project.findProperty(paramName)
} else {
return propsFile.getProperty(paramName)
}
ext.getParameterValue = { paramName ->
if (project.hasProperty(paramName)) {
return project.findProperty(paramName)
} else {
return propsFile.getProperty(paramName)
}
}

def runName = "${ext.getParameterValue('runName')}"
def git_user_email = "${getCurrentGitUserEmail()}"
def branch = "${ext.getParameterValue('beamBranch') ?: getCurrentGitBranch()}"
def dataBranch = "${ext.getParameterValue('dataBranch') ?: 'develop'}"
def commit = "${ext.getParameterValue('beamCommit') ?: 'HEAD'}"
def configs = "${ext.getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs')}"
def max_ram = '100g'
def google_api_key = "${System.getenv('GOOGLE_API_KEY') ?: 'not-set-in-env'}"
def s3_publish = "${ext.getParameterValue('s3Backup') ?: true}"
def aws_access_key_id = "${System.getenv('AWS_ACCESS_KEY_ID') ?: 'not-set-in-env'}"
def aws_secret_access_key = "${System.getenv('AWS_SECRET_ACCESS_KEY') ?: 'not-set-in-env'}"
def region = "${ext.getParameterValue('region') ?: defaultRegion}"
def runName = "${ext.getParameterValue('runName')}"
def git_user_email = "${getCurrentGitUserEmail()}"
def branch = "${ext.getParameterValue('beamBranch') ?: getCurrentGitBranch()}"
def dataBranch = "${ext.getParameterValue('dataBranch') ?: 'develop'}"
def commit = "${ext.getParameterValue('beamCommit') ?: 'HEAD'}"
def configs = "${ext.getParameterValue('beamConfigs') ?: ext.getParameterValue(getCurrentGitBranch() + '.configs')}"
def max_ram = '100g'
def google_api_key = "${System.getenv('GOOGLE_API_KEY') ?: 'not-set-in-env'}"
def s3_publish = "${ext.getParameterValue('s3Backup') ?: true}"
def aws_access_key_id = "${System.getenv('AWS_ACCESS_KEY_ID') ?: 'not-set-in-env'}"
def aws_secret_access_key = "${System.getenv('AWS_SECRET_ACCESS_KEY') ?: 'not-set-in-env'}"
def region = "${ext.getParameterValue('region') ?: defaultRegion}"

doFirst {
copy {
Expand Down
1 change: 1 addition & 0 deletions settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ include 'beam-gui'
include 'aws'
include 'gcp'
include 'nersc'
include 'lawrencium'
include 'metrics2.0'
include 'jupyter'

Expand Down