Merge pull request #58 from ll7/ll7/issue57-Slurm-multithreading-setting-might-be-slow

Ll7/issue57-Slurm-multithreading-setting-might-be-slow
ll7 authored Sep 26, 2024
2 parents b61373b + ac407b7 commit b62e6b7
Showing 8 changed files with 256 additions and 7 deletions.
3 changes: 2 additions & 1 deletion .vscode/extensions.json
@@ -3,6 +3,7 @@

"ms-python.python",
"ms-python.pylint",
"github.vscode-pull-request-github"
"github.vscode-pull-request-github",
"ms-toolsai.tensorboard"
]
}
92 changes: 92 additions & 0 deletions SLURM/log_gpu_cpu_usage.py
@@ -0,0 +1,92 @@
"""Train a robot in robot_sf on a SLURM server with resource tracking."""

import sys
import psutil
import GPUtil
import os
from loguru import logger
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList, BaseCallback
from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.env_config import EnvSettings
from robot_sf.feature_extractor import DynamicsExtractor
from robot_sf.tb_logging import DrivingMetricsCallback

class LogResourceUsageCallback(BaseCallback):
"""Custom callback to log CPU and GPU usage to TensorBoard."""

def _on_step(self) -> bool:
"""Log CPU and GPU usage and memory utilization at each step."""
cpu_usage = psutil.cpu_percent()
gpus = GPUtil.getGPUs()
gpu_usage = [gpu.load * 100 for gpu in gpus] if gpus else [0]
gpu_memory_util = [gpu.memoryUtil * 100 for gpu in gpus] if gpus else [0]

# Log to TensorBoard
self.logger.record('cpu_usage', cpu_usage)
for idx, (usage, mem_util) in enumerate(zip(gpu_usage, gpu_memory_util)):
self.logger.record(f'gpu_{idx}_usage', usage)
self.logger.record(f'gpu_{idx}_memory_util', mem_util)

return True

def training(
n_envs: int = os.cpu_count(),
ped_densities: list[float] = None,
difficulty: int = 2
):
"""Train a robot in robot_sf.
Args:
n_envs: Number of environments to run in parallel.
ped_densities: List of pedestrian densities to use.
difficulty: Difficulty of the simulation.
"""
    logger.info(f"Number of parallel environments: {n_envs}")
if ped_densities is None:
ped_densities = [0.01, 0.02, 0.04, 0.08]
def make_env():
config = EnvSettings()
config.sim_config.ped_density_by_difficulty = ped_densities
config.sim_config.difficulty = difficulty
return RobotEnv(config)

env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)
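    # SubprocVecEnv runs each environment in its own subprocess; this is the
    # parallelism that the OMP_NUM_THREADS=1 setting in slurm_train.sl complements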

policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
model = PPO(
"MultiInputPolicy",
env,
tensorboard_log="./logs/ppo_logs/",
policy_kwargs=policy_kwargs
)
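    # Checkpoint roughly every 500k total environment steps: save_freq counts
    # vec-env steps, and each vec-env step advances n_envs environments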
save_model_callback = CheckpointCallback(
500_000 // n_envs,
"./model/backup",
"ppo_model"
)
collect_metrics_callback = DrivingMetricsCallback(n_envs)
combined_callback = CallbackList(
[save_model_callback, collect_metrics_callback, LogResourceUsageCallback()]
)

logger.info("Start learning")

model.learn(
total_timesteps=1_000_000,
progress_bar=True,
callback=combined_callback
)


logger.info("Save model")
model.save("./model/ppo_model")

if __name__ == '__main__':
logger.info(f"Python path: {sys.executable}")
logger.info(f"Python version: {sys.version}")

logger.info("Start training")
training()
logger.info("End training")
30 changes: 30 additions & 0 deletions SLURM/partition_queue.sh
@@ -0,0 +1,30 @@
#! /bin/bash

# Script to query jobs in a specified SLURM partition

# date and time
echo "Partition queue at $(date)"

# Default partition name
PARTITION="epyc-gpu-test"

# Allow overriding the partition name via command line argument
if [ $# -eq 1 ]; then
PARTITION="$1"
fi

# Check if squeue command exists
if ! command -v squeue &> /dev/null; then
echo "Error: squeue command not found. Is SLURM installed?"
exit 1
fi

# Run squeue command and capture its exit status
squeue -p "$PARTITION" -l
exit_status=$?

# Check if squeue command was successful
if [ $exit_status -ne 0 ]; then
echo "Error: squeue command failed with exit status $exit_status"
exit $exit_status
fi
37 changes: 37 additions & 0 deletions SLURM/partition_queue_start_time.sh
@@ -0,0 +1,37 @@
#! /bin/bash

# Script to query the expected start times of the current user's jobs in a specified SLURM partition

# current date and time

echo "Partition queue start time at $(date)"

# Default partition name
PARTITION="epyc-gpu-test"

# Allow overriding the partition name via command line argument
if [ $# -eq 1 ]; then
PARTITION="$1"
fi

# Check if squeue command exists
if ! command -v squeue &> /dev/null; then
echo "Error: squeue command not found. Is SLURM installed?"
exit 1
fi

# Run squeue command and capture its exit status
output=$(squeue -p "$PARTITION" --start -u $USER 2>&1)
exit_status=$?

# Check if squeue command was successful
if [ $exit_status -ne 0 ]; then
echo "Error: squeue command failed with exit status $exit_status"
exit $exit_status
fi
# Display the output
echo "$output"

# Optional verbose output: run with VERBOSE=true to enable
if [ "${VERBOSE:-false}" = "true" ]; then
    echo "Query completed successfully."
fi
40 changes: 40 additions & 0 deletions SLURM/readme.md
@@ -3,3 +3,43 @@
```bash
sbatch slurm_train.sl
```

## Track CPU and GPU usage

Activate the conda environment (named `conda_env` in `slurm_train.sl`):

```bash
conda activate conda_env
```

Manually install the monitoring packages used by the logging callback:

```bash
pip install psutil gputil
```
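
As a quick sanity check (a minimal one-liner, run on a GPU node) you can confirm that both packages import and return readings before submitting a full job:

```bash
# print the current CPU load and the load of each visible GPU
python -c "import psutil, GPUtil; print(psutil.cpu_percent(interval=1), [g.load for g in GPUtil.getGPUs()])"
```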

Modify the `slurm_train.sl` file so that it runs the training script with the resource-usage callback:

```bash
python log_gpu_cpu_usage.py
```
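
In `slurm_train.sl` this corresponds to the `srun python3 log_gpu_cpu_usage.py` line.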

Run the script:

```bash
sbatch slurm_train.sl
```
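
You can then check whether the job is queued or running with the helper scripts next to this readme (the partition name is only an example):

```bash
# run from the SLURM/ directory; replace the partition name with your own
bash user_queue.sh
bash partition_queue.sh epyc-gpu-test
```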

To view the CPU and GPU usage results, follow these steps:

1. The tracking results are written to a log file. The exact location depends on the configuration in `slurm_train.sl` and `log_gpu_cpu_usage.py`; in this script the TensorBoard logs go to `./logs/ppo_logs/` (see `tensorboard_log` in the PPO constructor).

2. The results are written as TensorBoard event files. View them by running:

```bash
tensorboard --logdir=<path_to_log_directory>
```

Make sure to adjust the instructions based on your specific setup and requirements.
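
If training runs on a remote cluster, one possible workflow (a sketch; the host name and port are placeholders) is to start TensorBoard on the cluster and forward its port over SSH:

```bash
# on the cluster, in the directory that contains the logs
tensorboard --logdir=./logs/ppo_logs/ --port 6006
# on your local machine (host name is a placeholder), then open http://localhost:6006
ssh -L 6006:localhost:6006 your-user@login-node
```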


28 changes: 28 additions & 0 deletions SLURM/set_email_env_variable.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# Set the email as an environment variable in .bashrc if not already set

# Check for email argument
if [ -z "$1" ]; then
echo "Usage: $0 <email>"
exit 1
fi

# Check if SLURM_EMAIL is already set in the environment
if [ -z "$SLURM_EMAIL" ]; then
# Check if SLURM_EMAIL is already in .bashrc
if ! grep -q "export SLURM_EMAIL" ~/.bashrc; then
# Add SLURM_EMAIL to .bashrc
echo "export SLURM_EMAIL=$1" >> ~/.bashrc
echo "SLURM_EMAIL has been added to ~/.bashrc"
        # Note: sourcing here only affects this script's shell, not the calling shell
        source ~/.bashrc
        echo "Sourced ~/.bashrc inside this script (the calling shell is not affected)"
else
echo "SLURM_EMAIL is already in ~/.bashrc"
fi

    # Set SLURM_EMAIL for the remainder of this script
    # (an exported variable does not propagate to the calling shell; open a new
    # shell or source ~/.bashrc there to make it available in your session)
    export SLURM_EMAIL="$1"
    echo "SLURM_EMAIL has been set to $SLURM_EMAIL for this script"
else
echo "SLURM_EMAIL is already set to $SLURM_EMAIL"
fi
31 changes: 26 additions & 5 deletions SLURM/slurm_train.sl
@@ -1,16 +1,33 @@
#!/usr/bin/env bash

#SBATCH --job-name=robot-sf
#SBATCH --partition=epyc-gpu
#SBATCH --time=10:00:00
#SBATCH --partition=epyc-gpu-test
#SBATCH --time=2:00:00

# Request memory per CPU
#SBATCH --mem-per-cpu=2G
# Request n CPUs for your task.
#SBATCH --cpus-per-task=64
# Request GPU resources (model:number)
#SBATCH --gpus=a100:1


# Check if SLURM_EMAIL is set
# Note: SLURM only parses #SBATCH directives before the first executable command
# and does not expand environment variables in them, so mail options placed here
# would have no effect. Pass them on the command line instead, e.g.:
#   sbatch --mail-user="$SLURM_EMAIL" --mail-type=END,FAIL slurm_train.sl
if [ -z "$SLURM_EMAIL" ]; then
    echo "SLURM_EMAIL is not set. Please set it before running the script."
else
    echo "SLURM_EMAIL is set to $SLURM_EMAIL"
fi


# echo date and time
echo "Starting script at $(date)"

# Experiment description
echo "Run experiment with OMP_NUM_THREADS=1 because parallelism is already handled by SB3's vectorized environments"

# Clear all interactively loaded modules
module purge

@@ -25,6 +42,10 @@ conda activate conda_env
# if you are adding your own level of parallelization, you
# probably want to set OMP_NUM_THREADS=1 instead, in order
# to prevent the creation of too many threads (massive slowdown!)
export OMP_NUM_THREADS=1

# No need to pass number of tasks to srun
srun python3 slurm_PPO_robot_sf.py
srun python3 log_gpu_cpu_usage.py

# echo date and time
# echo "Ending script at $(date)"
2 changes: 1 addition & 1 deletion SLURM/user_queue.sh
@@ -1,3 +1,3 @@
#! /bin/bash

squeue -u $USER
squeue -u $USER -l
