RFC.5
RP is supposed to work out of the box, at least for a number of supported target resources. That is currently only possible by including configuration files for those resources in the RP release. Configuration options include details about resource access, hardware layout, batch system properties, environment settings, and pilot agent configuration. This RFC proposes a new configuration schema, with the intent to
- separate configuration for different software layers and modules (launcher, bootstrap, executor);
- avoid redundancy of config settings;
- support API level selection of certain config options; and
- cleanly support new bootstrapping and partitioning mechanisms.
Toward those points, the configuration is split into two parts: resource configuration and agent configuration. The resource configuration is concerned with all resource specific settings required to bootstrap into a well-defined Python virtualenv; the agent configuration covers all settings applied to the modules within that virtualenv. The latter are in principle resource independent, but in practice that is not always the case: for example, a software dependency may require a different `module load` on one machine than on another. Such elements are thus still stored in the resource config section and are then referenced from the agent config section. This applies specifically to ORTE-related configuration options.
# file: src/radical/pilot/config/resource_xsede.json
{
"stampede2": {
"resource" : {
"description" : "https://www.tacc.utexas.edu/stampede2/",
"lrms" : "SLURM",
"cores_per_node" : 68,
"gpus_per_node" : 1
},
"access" : {
"default_queue" : "normal",
"mandatory_args" : ["project"],
"schemas" : {
"default" : "gsissh",
"gsissh" : {
"job_manager_endpoint": "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
"filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
},
"ssh" : {
"job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
"filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
},
"go": {
"job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
"filesystem_endpoint" : "go://xsede#stampede2/"
}
}
},
"bootstrap" : {
"valid_roots" : ["/scratch", "$SCRATCH", "/work", "$WORK"],
"default_sandbox_base" : "$WORK",
"python_dist" : "default",
"virtenv_mode" : "create",
"rp_version" : "local",
"pre_bootstrap_0" : ["module load TACC",
"module load intel/17.0.4",
"module load python/2.7.13"],
"export_to_cu" : ["LMOD_CMD",
"LMOD_SYSTEM_DEFAULT_MODULES",
"LD_LIBRARY_PATH"],
"cu_pre_exec" : ["module restore"],
"options" : {
"orte" : {
"pre_bootstrap_2" : ["module use --append /home/amerzky/ompi/modules",
"module load openmpi/2017_02_17_6da4dbb"]
},
"ortelib" : {
"pre_bootstrap_2" : ["module use --append /home/amerzky/ompi/modules",
"module load openmpi/2017_02_17_6da4dbb"]
},
"spark" : {
"pre_bootstrap_2" : ["module load jdk64/1.8.0"]
}
}
}
},
... # other resources follow
}
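For illustration, this is how those settings would surface at the API level; a minimal sketch using the RP client API, where the resource label follows RP's `<domain>.<resource>` naming convention and the project allocation string is a placeholder:

# example: API level selection of resource config options (values illustrative)
import radical.pilot as rp

session = rp.Session()
pmgr    = rp.PilotManager(session=session)

pd = rp.ComputePilotDescription()
pd.resource      = 'xsede.stampede2'   # selects the config entry above
pd.access_schema = 'ssh'               # overrides the 'gsissh' default schema
pd.queue         = 'normal'            # matches 'default_queue'
pd.project       = 'TG-XYZ123456'      # required, per 'mandatory_args'
pd.cores         = 68                  # one full node (cores_per_node)
pd.runtime       = 60                  # minutes

pilot = pmgr.submit_pilots(pd)

Only the schema name is exposed to the application; the endpoint URLs and bootstrap settings behind it remain an implementation detail of the resource configuration.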
The agent configuration then covers everything inside the bootstrapped virtualenv: component and launch method selection, performance tuning, and the agent layout.
{
# `scheduler` : algorithm to place units on the agent resources
# `spawner` : method to spawn compute unit processes
# `task_launch_method` : method to reach compute nodes (for non-mpi tasks)
# `mpi_launch_method` : method to reach compute nodes (for mpi tasks)
"config" {
"default" : "ssh",
"ssh" : {
"scheduler" : "CONTINUOUS",
"spawner" : "POPEN",
"task_launch_method" : "SSH",
"mpi_launch_method" : "MPIRUN_RSH"
},
"aprun" : {
"scheduler" : "CONTINUOUS",
"spawner" : "POPEN",
"task_launch_method" : "APRUN",
"mpi_launch_method" : "APRUN"
},
"orte" : {
"scheduler" : "CONTINUOUS",
"spawner" : "POPEN",
"task_launch_method" : "ORTE",
"mpi_launch_method" : "ORTE"
},
"ortelib" : {
"scheduler" : "CONTINUOUS",
"spawner" : "ORTELIB",
"task_launch_method" : "ORTELIB",
"mpi_launch_method" : "ORTELIB"
}
},
# `max_io_loglength` : max number of cu out/err chars to push to db
# `bulk_collection_size` : max number of updates to put into a db bulk
# `bulk_collection_time` : max seconds to collect db notification bulks
# `db_poll_sleeptime` : seconds to sleep between database polls
"tuning" : {
"default" : "bulked",
"bulked" : {
"max_io_loglength" : 1024,
"bulk_collection_size" : 100,
"bulk_collection_time" : 1.0,
"db_poll_sleeptime" : 1.0
}
},
# `shared`: if true, use agent node also for CUs (not yet implemented)
# `target`: sub-agents hosting additional component instances can run on the
#           same node as `agent_0` ('local') or on other nodes ('node').
#           `agent_0` itself must always have target 'local'.
# The `UpdateWorker` component must live in agent_0, as only that agent is
# known to have connectivity toward the DB.
# All bridges must live in agent_0.
"layout" : {
"default" : "beowulf",
"beowulf" : {
"agent_0" : {
"target" : "local",
"shared" : false,
"components" : {
"UpdateWorker" : {"count" : 1},
"AgentStagingInputComponent" : {"count" : 1},
"AgentSchedulingComponent" : {"count" : 1},
"AgentExecutingComponent" : {"count" : 1},
"AgentStagingOutputComponent" : {"count" : 1}
},
"bridges" : {
"agent_staging_input_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_scheduling_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_executing_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_staging_output_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_unschedule_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_schedule_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"log_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"state_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"control_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0}
}
}
},
"cray" : {
"agent_0" : {
"target" : "local",
"shared" : false,
"components" : {
"UpdateWorker" : {"count" : 1},
},
"agent_1": {
"target" : "node",
"shared" : false,
"components": {
"AgentStagingInputComponent" : {"count" : 1},
"AgentSchedulingComponent" : {"count" : 1},
"AgentExecutingComponent" : {"count" : 1},
"AgentStagingOutputComponent" : {"count" : 1}
}
},
"bridges" : {
"agent_staging_input_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_scheduling_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_executing_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_staging_output_queue" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_unschedule_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"agent_schedule_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"log_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"state_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0},
"control_pubsub" : {"log_level" : "debug",
"stall_hwm" : 1,
"bulk_size" : 0}
}
}
}
}
}
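To make the `default` indirection concrete, the following sketch shows how an agent bootstrapper might resolve the named variants above; the `resolve` helper and the file name are hypothetical, and the `#` comment lines are assumed to be stripped before the file is parsed.

# example: resolving named config variants (helper and file name hypothetical)
import json

def resolve(section, select=None):
    # return the requested variant of a config section, falling back to
    # the section's own 'default' entry
    return section[select or section['default']]

with open('agent_cfg.json') as f:          # assumed comment-free JSON copy
    acfg = json.load(f)

methods = resolve(acfg['config'])          # -> the 'ssh' entry
tuning  = resolve(acfg['tuning'])          # -> the 'bulked' entry
layout  = resolve(acfg['layout'], 'cray')  # explicit override of the default

assert methods['task_launch_method'] == 'SSH'
assert 'UpdateWorker' in layout['agent_0']['components']

The same `default` plus named-variant pattern appears in the `schemas` section of the resource configuration, so a single resolver can serve both parts of the schema.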