-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #58 from ll7/ll7/issue57-Slurm-multithreading-setting-might-be-slow
Ll7/issue57-Slurm-multithreading-setting-might-be-slow
- Loading branch information
Showing
8 changed files
with
256 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
"""Train a robot in robot_sf on a SLURM server with resource tracking.""" | ||
|
||
import sys | ||
import psutil | ||
import GPUtil | ||
import os | ||
from loguru import logger | ||
from stable_baselines3 import PPO | ||
from stable_baselines3.common.env_util import make_vec_env | ||
from stable_baselines3.common.vec_env import SubprocVecEnv | ||
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList, BaseCallback | ||
from robot_sf.gym_env.robot_env import RobotEnv | ||
from robot_sf.gym_env.env_config import EnvSettings | ||
from robot_sf.feature_extractor import DynamicsExtractor | ||
from robot_sf.tb_logging import DrivingMetricsCallback | ||
|
||
class LogResourceUsageCallback(BaseCallback):
    """Record host CPU load and per-GPU load/memory figures on every step.

    All values are handed to ``self.logger.record``; where they end up
    (for example TensorBoard) depends on how the model's logger is set up.
    """

    def _on_step(self) -> bool:
        """Sample CPU and GPU utilisation and record both as percentages.

        Returns:
            Always ``True``.
        """
        cpu_percent = psutil.cpu_percent()
        devices = GPUtil.getGPUs()

        # GPUtil reports load and memory utilisation as fractions, so scale
        # them by 100. When no GPU is reported, a single placeholder pair of
        # zeros is recorded so the ``gpu_0_*`` keys are still written.
        if devices:
            readings = [(dev.load * 100, dev.memoryUtil * 100) for dev in devices]
        else:
            readings = [(0, 0)]

        self.logger.record('cpu_usage', cpu_percent)
        for index, (load_percent, memory_percent) in enumerate(readings):
            self.logger.record(f'gpu_{index}_usage', load_percent)
            self.logger.record(f'gpu_{index}_memory_util', memory_percent)

        return True
|
||
def training(
    n_envs: int = os.cpu_count(),
    ped_densities: list[float] = None,
    difficulty: int = 2
):
    """Train a PPO robot policy in robot_sf and save the resulting model.

    Checkpoints are written to ``./model/backup``, TensorBoard logs to
    ``./logs/ppo_logs/`` and the final model to ``./model/ppo_model``.

    Args:
        n_envs: Number of environments to run in parallel (one subprocess
            each). The default is ``os.cpu_count()`` evaluated at import
            time, which counts every CPU of the machine rather than the
            CPUs this process may use; pass an explicit value when the job
            is restricted to fewer cores (e.g. by a SLURM allocation).
            ``None`` (what ``os.cpu_count()`` yields when the count cannot
            be determined) falls back to a single environment.
        ped_densities: Pedestrian densities assigned to the simulation's
            ``ped_density_by_difficulty``. Defaults to
            ``[0.01, 0.02, 0.04, 0.08]``.
        difficulty: Difficulty of the simulation.

    Raises:
        ValueError: If ``n_envs`` is smaller than 1.
    """
    # Validate up front so a bad value fails with a clear message instead
    # of an obscure error from inside the vectorised environment setup.
    if n_envs is None:
        logger.warning("Could not determine the CPU count; using 1 environment")
        n_envs = 1
    if n_envs < 1:
        raise ValueError(f"n_envs must be at least 1, got {n_envs}")

    logger.info(f"Number of CPUs: {n_envs}")
    if ped_densities is None:
        ped_densities = [0.01, 0.02, 0.04, 0.08]

    def make_env():
        """Build one RobotEnv configured with the requested densities and difficulty."""
        config = EnvSettings()
        config.sim_config.ped_density_by_difficulty = ped_densities
        config.sim_config.difficulty = difficulty
        return RobotEnv(config)

    env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)

    # The vectorised environment owns worker subprocesses; make sure they
    # are shut down even when training or saving raises.
    try:
        policy_kwargs = {"features_extractor_class": DynamicsExtractor}
        model = PPO(
            "MultiInputPolicy",
            env,
            tensorboard_log="./logs/ppo_logs/",
            policy_kwargs=policy_kwargs
        )
        # The checkpoint frequency is divided by the number of environments;
        # keep it at 1 or more so it can never become zero.
        save_model_callback = CheckpointCallback(
            max(500_000 // n_envs, 1),
            "./model/backup",
            "ppo_model"
        )
        collect_metrics_callback = DrivingMetricsCallback(n_envs)
        combined_callback = CallbackList(
            [save_model_callback, collect_metrics_callback, LogResourceUsageCallback()]
        )

        logger.info("Start learning")

        model.learn(
            total_timesteps=1_000_000,
            progress_bar=True,
            callback=combined_callback
        )

        logger.info("Save model")
        model.save("./model/ppo_model")
    finally:
        env.close()
|
||
if __name__ == "__main__":
    # Log which interpreter executes the job before any work starts.
    logger.info(f"Python path: {sys.executable}")
    logger.info(f"Python version: {sys.version}")

    # Bracket the training run with start/end markers in the log.
    logger.info("Start training")
    training()
    logger.info("End training")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#! /bin/bash

# List the jobs queued in a SLURM partition using squeue's long format.
#
# Usage: <script> [partition]
#   partition  Partition to query; defaults to "epyc-gpu-test".
# Exit status: 1 when squeue is not available, otherwise squeue's own status.

# Timestamp the listing
echo "Partition queue at $(date)"

# Default partition name
PARTITION="epyc-gpu-test"

# Override the partition only when exactly one argument was supplied
if [ $# -eq 1 ]; then
    PARTITION="$1"
fi

# Check if squeue command exists
if ! command -v squeue &> /dev/null; then
    echo "Error: squeue command not found. Is SLURM installed?" >&2
    exit 1
fi

# Run squeue command and capture its exit status
squeue -p "$PARTITION" -l
exit_status=$?

# Report a failure on stderr and propagate squeue's exit status
if [ "$exit_status" -ne 0 ]; then
    echo "Error: squeue command failed with exit status $exit_status" >&2
    exit "$exit_status"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#! /bin/bash

# Show the expected start times (squeue --start) of the current user's jobs
# in a SLURM partition.
#
# Usage: [VERBOSE=true] <script> [partition]
#   partition  Partition to query; defaults to "epyc-gpu-test".
#   VERBOSE    Set to "true" to print a confirmation after a successful query.
# Exit status: 1 when squeue is not available, otherwise squeue's own status.

# Current date and time
echo "Partition queue start time at $(date)"

# Default partition name
PARTITION="epyc-gpu-test"

# Override the partition only when exactly one argument was supplied
if [ $# -eq 1 ]; then
    PARTITION="$1"
fi

# Check if squeue command exists
if ! command -v squeue &> /dev/null; then
    echo "Error: squeue command not found. Is SLURM installed?" >&2
    exit 1
fi

# Run squeue, capturing stdout and stderr together, and keep its exit status
output=$(squeue -p "$PARTITION" --start -u "$USER" 2>&1)
exit_status=$?

# On failure, report the status and whatever squeue printed, then propagate
if [ "$exit_status" -ne 0 ]; then
    echo "Error: squeue command failed with exit status $exit_status" >&2
    if [ -n "$output" ]; then
        echo "$output" >&2
    fi
    exit "$exit_status"
fi

# Display the output
echo "$output"

# Compare VERBOSE as a string instead of executing its content as a command;
# an unset VERBOSE means "not verbose".
if [ "${VERBOSE:-false}" = "true" ]; then
    echo "Query completed successfully."
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash
# Set SLURM_EMAIL to the given address and persist it in ~/.bashrc if it is
# not already configured.
#
# Usage: source <script> <email>   (or: <script> <email>)
# Note: the export at the end only reaches the calling shell when this file
# is sourced; when executed, it affects this script's own process only.

# Check for email argument
if [ -z "$1" ]; then
    echo "Usage: $0 <email>" >&2
    # `return` leaves a sourced script without closing the caller's shell;
    # it fails when the script is executed, in which case `exit` is used.
    return 1 2>/dev/null || exit 1
fi

# Check if SLURM_EMAIL is already set in the environment
if [ -z "$SLURM_EMAIL" ]; then
    # Check if SLURM_EMAIL is already in .bashrc (-s: stay quiet if the file
    # does not exist yet)
    if ! grep -qs "export SLURM_EMAIL" ~/.bashrc; then
        # Add SLURM_EMAIL to .bashrc, shell-quoting the value so characters
        # special to the shell cannot break or alter the line
        printf 'export SLURM_EMAIL=%q\n' "$1" >> ~/.bashrc
        echo "SLURM_EMAIL has been added to ~/.bashrc"
        source ~/.bashrc
        echo "We also sourced ~/.bashrc"
    else
        echo "SLURM_EMAIL is already in ~/.bashrc"
    fi

    # Set SLURM_EMAIL for the current session
    export SLURM_EMAIL="$1"
    echo "SLURM_EMAIL has been set to $SLURM_EMAIL for the current session"
else
    echo "SLURM_EMAIL is already set to $SLURM_EMAIL"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
#! /bin/bash

# List the invoking user's SLURM jobs: first in squeue's default format,
# then in its long format (-l).
# NOTE(review): this is a diff hunk; the two commands may be the old and the
# new version of a single line. Confirm whether both are meant to run.
squeue -u "$USER"
squeue -u "$USER" -l