Merge pull request #58 from ll7/ll7/issue57-Slurm-multithreading-setting-might-be-slow

Ll7/issue57-Slurm-multithreading-setting-might-be-slow
ll7 authored Sep 26, 2024
2 parents b61373b + ac407b7 commit b62e6b7
Showing 8 changed files with 256 additions and 7 deletions.
3 changes: 2 additions & 1 deletion .vscode/extensions.json
@@ -3,6 +3,7 @@

"ms-python.python",
"ms-python.pylint",
"github.vscode-pull-request-github"
"github.vscode-pull-request-github",
"ms-toolsai.tensorboard"
]
}
92 changes: 92 additions & 0 deletions SLURM/log_gpu_cpu_usage.py
@@ -0,0 +1,92 @@
"""Train a robot in robot_sf on a SLURM server with resource tracking."""

import sys
import psutil
import GPUtil
import os
from loguru import logger
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList, BaseCallback
from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.env_config import EnvSettings
from robot_sf.feature_extractor import DynamicsExtractor
from robot_sf.tb_logging import DrivingMetricsCallback

class LogResourceUsageCallback(BaseCallback):
"""Custom callback to log CPU and GPU usage to TensorBoard."""

def _on_step(self) -> bool:
"""Log CPU and GPU usage and memory utilization at each step."""
cpu_usage = psutil.cpu_percent()
gpus = GPUtil.getGPUs()
gpu_usage = [gpu.load * 100 for gpu in gpus] if gpus else [0]
gpu_memory_util = [gpu.memoryUtil * 100 for gpu in gpus] if gpus else [0]

# Log to TensorBoard
self.logger.record('cpu_usage', cpu_usage)
for idx, (usage, mem_util) in enumerate(zip(gpu_usage, gpu_memory_util)):
self.logger.record(f'gpu_{idx}_usage', usage)
self.logger.record(f'gpu_{idx}_memory_util', mem_util)

return True

def training(
n_envs: int = os.cpu_count(),
ped_densities: list[float] = None,
difficulty: int = 2
):
"""Train a robot in robot_sf.
Args:
n_envs: Number of environments to run in parallel.
ped_densities: List of pedestrian densities to use.
difficulty: Difficulty of the simulation.
"""
    logger.info(f"Number of parallel environments: {n_envs}")
if ped_densities is None:
ped_densities = [0.01, 0.02, 0.04, 0.08]
def make_env():
config = EnvSettings()
config.sim_config.ped_density_by_difficulty = ped_densities
config.sim_config.difficulty = difficulty
return RobotEnv(config)

env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)
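    # SubprocVecEnv runs each environment in its own subprocess; this is the
    # parallelism that the OMP_NUM_THREADS=1 setting in slurm_train.sl complements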

policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
model = PPO(
"MultiInputPolicy",
env,
tensorboard_log="./logs/ppo_logs/",
policy_kwargs=policy_kwargs
)
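    # Checkpoint roughly every 500k total environment steps: save_freq counts
    # vec-env steps, and each vec-env step advances n_envs environments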
save_model_callback = CheckpointCallback(
500_000 // n_envs,
"./model/backup",
"ppo_model"
)
collect_metrics_callback = DrivingMetricsCallback(n_envs)
combined_callback = CallbackList(
[save_model_callback, collect_metrics_callback, LogResourceUsageCallback()]
)

logger.info("Start learning")

model.learn(
total_timesteps=1_000_000,
progress_bar=True,
callback=combined_callback
)


logger.info("Save model")
model.save("./model/ppo_model")

if __name__ == '__main__':
logger.info(f"Python path: {sys.executable}")
logger.info(f"Python version: {sys.version}")

logger.info("Start training")
training()
logger.info("End training")
30 changes: 30 additions & 0 deletions SLURM/partition_queue.sh
@@ -0,0 +1,30 @@
#! /bin/bash

# Script to query jobs in a specified SLURM partition

# date and time
echo "Partition queue at $(date)"

# Default partition name
PARTITION="epyc-gpu-test"

# Allow overriding the partition name via command line argument
if [ $# -eq 1 ]; then
PARTITION="$1"
fi

# Check if squeue command exists
if ! command -v squeue &> /dev/null; then
echo "Error: squeue command not found. Is SLURM installed?"
exit 1
fi

# Run squeue command and capture its exit status
squeue -p "$PARTITION" -l
exit_status=$?

# Check if squeue command was successful
if [ $exit_status -ne 0 ]; then
echo "Error: squeue command failed with exit status $exit_status"
exit $exit_status
fi
37 changes: 37 additions & 0 deletions SLURM/partition_queue_start_time.sh
@@ -0,0 +1,37 @@
#! /bin/bash

# Script to query the expected start times of the current user's jobs in a specified SLURM partition

# current date and time

echo "Partition queue start time at $(date)"

# Default partition name
PARTITION="epyc-gpu-test"

# Allow overriding the partition name via command line argument
if [ $# -eq 1 ]; then
PARTITION="$1"
fi

# Check if squeue command exists
if ! command -v squeue &> /dev/null; then
echo "Error: squeue command not found. Is SLURM installed?"
exit 1
fi

# Run squeue command and capture its exit status
output=$(squeue -p "$PARTITION" --start -u $USER 2>&1)
exit_status=$?

# Check if squeue command was successful
if [ $exit_status -ne 0 ]; then
echo "Error: squeue command failed with exit status $exit_status"
exit $exit_status
fi
# Display the output
echo "$output"

# Optional verbose output: run with VERBOSE=true to enable
if [ "${VERBOSE:-false}" = "true" ]; then
    echo "Query completed successfully."
fi
40 changes: 40 additions & 0 deletions SLURM/readme.md
@@ -3,3 +3,43 @@
```bash
sbatch slurm_train.sl
```

## Track CPU and GPU usage

Activate the conda environment (named `conda_env` in `slurm_train.sl`):

```bash
conda activate conda_env
```

Manually install the monitoring packages used by the logging callback:

```bash
pip install psutil gputil
```
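
As a quick sanity check (a minimal one-liner, run on a GPU node) you can confirm that both packages import and return readings before submitting a full job:

```bash
# print the current CPU load and the load of each visible GPU
python -c "import psutil, GPUtil; print(psutil.cpu_percent(interval=1), [g.load for g in GPUtil.getGPUs()])"
```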

Modify the `slurm_train.sl` file so that it runs the training script with the resource-usage callback:

```bash
python log_gpu_cpu_usage.py
```
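
In `slurm_train.sl` this corresponds to the `srun python3 log_gpu_cpu_usage.py` line.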

Run the script:

```bash
sbatch slurm_train.sl
```
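
You can then check whether the job is queued or running with the helper scripts next to this readme (the partition name is only an example):

```bash
# run from the SLURM/ directory; replace the partition name with your own
bash user_queue.sh
bash partition_queue.sh epyc-gpu-test
```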

To view the CPU and GPU usage results, follow these steps:

1. The tracking results are written to a log file. The exact location depends on the configuration in `slurm_train.sl` and `log_gpu_cpu_usage.py`; in this script the TensorBoard logs go to `./logs/ppo_logs/` (see `tensorboard_log` in the PPO constructor).

2. The results are written as TensorBoard event files. View them by running:

```bash
tensorboard --logdir=<path_to_log_directory>
```

Make sure to adjust the instructions based on your specific setup and requirements.
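
If training runs on a remote cluster, one possible workflow (a sketch; the host name and port are placeholders) is to start TensorBoard on the cluster and forward its port over SSH:

```bash
# on the cluster, in the directory that contains the logs
tensorboard --logdir=./logs/ppo_logs/ --port 6006
# on your local machine (host name is a placeholder), then open http://localhost:6006
ssh -L 6006:localhost:6006 your-user@login-node
```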


28 changes: 28 additions & 0 deletions SLURM/set_email_env_variable.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# Set the email as an environment variable in .bashrc if not already set

# Check for email argument
if [ -z "$1" ]; then
echo "Usage: $0 <email>"
exit 1
fi

# Check if SLURM_EMAIL is already set in the environment
if [ -z "$SLURM_EMAIL" ]; then
# Check if SLURM_EMAIL is already in .bashrc
if ! grep -q "export SLURM_EMAIL" ~/.bashrc; then
# Add SLURM_EMAIL to .bashrc
echo "export SLURM_EMAIL=$1" >> ~/.bashrc
echo "SLURM_EMAIL has been added to ~/.bashrc"
        # Note: sourcing here only affects this script's shell, not the calling shell
        source ~/.bashrc
        echo "Sourced ~/.bashrc inside this script (the calling shell is not affected)"
else
echo "SLURM_EMAIL is already in ~/.bashrc"
fi

    # Set SLURM_EMAIL for the remainder of this script
    # (an exported variable does not propagate to the calling shell; open a new
    # shell or source ~/.bashrc there to make it available in your session)
    export SLURM_EMAIL="$1"
    echo "SLURM_EMAIL has been set to $SLURM_EMAIL for this script"
else
echo "SLURM_EMAIL is already set to $SLURM_EMAIL"
fi
31 changes: 26 additions & 5 deletions SLURM/slurm_train.sl
@@ -1,16 +1,33 @@
#!/usr/bin/env bash

#SBATCH --job-name=robot-sf
#SBATCH --partition=epyc-gpu
#SBATCH --time=10:00:00
#SBATCH --partition=epyc-gpu-test
#SBATCH --time=2:00:00

# Request memory per CPU
#SBATCH --mem-per-cpu=2G
# Request n CPUs for your task.
#SBATCH --cpus-per-task=64
# Request GPU resources (model:number)
#SBATCH --gpus=a100:1


# Check if SLURM_EMAIL is set
# Note: SLURM only parses #SBATCH directives before the first executable command
# and does not expand environment variables in them, so mail options placed here
# would have no effect. Pass them on the command line instead, e.g.:
#   sbatch --mail-user="$SLURM_EMAIL" --mail-type=END,FAIL slurm_train.sl
if [ -z "$SLURM_EMAIL" ]; then
    echo "SLURM_EMAIL is not set. Please set it before running the script."
else
    echo "SLURM_EMAIL is set to $SLURM_EMAIL"
fi


# echo date and time
echo "Starting script at $(date)"

# Experiment description
echo "Run experiment with OMP_NUM_THREADS=1 because parallelism is already handled by SB3's vectorized environments"

# Clear all interactively loaded modules
module purge

@@ -25,6 +42,10 @@ conda activate conda_env
# if you are adding your own level of parallelization, you
# probably want to set OMP_NUM_THREADS=1 instead, in order
# to prevent the creation of too many threads (massive slowdown!)
export OMP_NUM_THREADS=1

# No need to pass number of tasks to srun
srun python3 slurm_PPO_robot_sf.py
srun python3 log_gpu_cpu_usage.py

# echo date and time
# echo "Ending script at $(date)"
2 changes: 1 addition & 1 deletion SLURM/user_queue.sh
@@ -1,3 +1,3 @@
#! /bin/bash

squeue -u $USER
squeue -u $USER -l
