From a585fd73c67b5e408583f40896b62d1c8d108352 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Tue, 24 Sep 2024 16:59:03 +0200
Subject: [PATCH 01/18] Refactor GitHub issue branch title format

---
 .vscode/settings.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 05cb754..a43f428 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
-    "githubIssues.issueBranchTitle": "${user}/issue${issueNumber}-${sanitizedIssueTitle}"
+    "githubIssues.issueBranchTitle": "${issueNumber}-${sanitizedIssueTitle}"
 }
\ No newline at end of file

From 6b98aba95750c6387adac6386cca064057bc6f7f Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Tue, 24 Sep 2024 17:25:03 +0200
Subject: [PATCH 02/18] chore: Add CPU and GPU usage tracking to SLURM readme

---
 SLURM/readme.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/SLURM/readme.md b/SLURM/readme.md
index d4c5c89..bf96f5c 100644
--- a/SLURM/readme.md
+++ b/SLURM/readme.md
@@ -3,3 +3,30 @@
 ```bash
 sbatch slurm_train.sl
 ```
+
+## Track CPU and GPU usage
+
+Activate the conda environment:
+
+```bash
+conda activate conda_env
+```
+
+Manually install the packages:
+
+```bash
+pip install psutil gputil
+```
+
+Modify the `slurm_train.sl` file to run the training with util callback:
+
+```bash
+python log_gpu_cpu_usage.py
+```
+
+Run the script:
+
+```bash
+sbatch slurm_train.sl
+```
+

From 084adbbdb224605f3c457ab14346e0237c78e34b Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Tue, 24 Sep 2024 17:25:13 +0200
Subject: [PATCH 03/18] chore: Update SLURM user_queue.sh script to include detailed job information

The SLURM user_queue.sh script has been updated to include the "-l" flag
when querying the job queue. This provides more detailed information about
the jobs owned by the user. This change improves the usability of the
script by providing additional context for each job in the queue.
---
 SLURM/user_queue.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SLURM/user_queue.sh b/SLURM/user_queue.sh
index 08fe4f1..2e43c40 100755
--- a/SLURM/user_queue.sh
+++ b/SLURM/user_queue.sh
@@ -1,3 +1,3 @@
 #! /bin/bash
 
-squeue -u $USER
+squeue -u $USER -l

From 04a4dc5b9c79defdf43b3fb031e9921b98472c4f Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Tue, 24 Sep 2024 17:25:23 +0200
Subject: [PATCH 04/18] Update SLURM training script for robot_sf

---
 SLURM/slurm_train.sl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SLURM/slurm_train.sl b/SLURM/slurm_train.sl
index 87800c5..1d54b99 100644
--- a/SLURM/slurm_train.sl
+++ b/SLURM/slurm_train.sl
@@ -1,8 +1,8 @@
 #!/usr/bin/env bash
 
 #SBATCH --job-name=robot-sf
-#SBATCH --partition=epyc-gpu
-#SBATCH --time=10:00:00
+#SBATCH --partition=epyc-gpu-test
+#SBATCH --time=2:00:00
 
 # Request memory per CPU
 #SBATCH --mem-per-cpu=2G
@@ -27,4 +27,4 @@ conda activate conda_env
 # to prevent the creation of too many threads (massive slowdown!)
 
 # No need to pass number of tasks to srun
-srun python3 slurm_PPO_robot_sf.py
+srun python3 log_gpu_cpu_usage.py

From 431393d1436157a555c009069810e5dbd241c79c Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Tue, 24 Sep 2024 17:25:29 +0200
Subject: [PATCH 05/18] chore: Add resource usage tracking to SLURM training script

---
 SLURM/log_gpu_cpu_usage.py | 82 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 SLURM/log_gpu_cpu_usage.py

diff --git a/SLURM/log_gpu_cpu_usage.py b/SLURM/log_gpu_cpu_usage.py
new file mode 100644
index 0000000..22f4845
--- /dev/null
+++ b/SLURM/log_gpu_cpu_usage.py
@@ -0,0 +1,82 @@
+"""Train a robot in robot_sf on a SLURM server with resource tracking."""
+
+import sys
+import psutil
+import GPUtil
+from loguru import logger
+from stable_baselines3 import PPO
+from stable_baselines3.common.env_util import make_vec_env
+from stable_baselines3.common.vec_env import SubprocVecEnv
+from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList, BaseCallback
+from robot_sf.gym_env.robot_env import RobotEnv
+from robot_sf.gym_env.env_config import EnvSettings
+from robot_sf.feature_extractor import DynamicsExtractor
+from robot_sf.tb_logging import DrivingMetricsCallback
+
+class LogResourceUsageCallback(BaseCallback):
+    """Custom callback to log CPU and GPU usage to TensorBoard."""
+
+    def __init__(self, verbose=0):
+        super(LogResourceUsageCallback, self).__init__(verbose)
+
+    def _on_step(self) -> bool:
+        """Log CPU and GPU usage at each step."""
+        cpu_usage = psutil.cpu_percent()
+        gpus = GPUtil.getGPUs()
+        gpu_usage = gpus[0].load * 100 if gpus else 0  # Assuming using the first GPU
+
+        # Log to TensorBoard
+        self.logger.record('cpu_usage', cpu_usage)
+        self.logger.record('gpu_usage', gpu_usage)
+
+        return True
+
+def training():
+    n_envs = 64
+    ped_densities = [0.01, 0.02, 0.04, 0.08]
+    difficulty = 2
+
+    def make_env():
+        config = EnvSettings()
+        config.sim_config.ped_density_by_difficulty = ped_densities
+        config.sim_config.difficulty = difficulty
+        return RobotEnv(config)
+
+    env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)
+
+    policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
+    model = PPO(
+        "MultiInputPolicy",
+        env,
+        tensorboard_log="./logs/ppo_logs/",
+        policy_kwargs=policy_kwargs
+    )
+    save_model_callback = CheckpointCallback(
+        500_000 // n_envs,
+        "./model/backup",
+        "ppo_model"
+    )
+    collect_metrics_callback = DrivingMetricsCallback(n_envs)
+    combined_callback = CallbackList(
+        [save_model_callback, collect_metrics_callback, LogResourceUsageCallback()]
+    )
+
+    logger.info("Start learning")
+
+    model.learn(
+        total_timesteps=1_000_000,
+        progress_bar=True,
+        callback=combined_callback
+    )
+
+
+    logger.info("Save model")
+    model.save("./model/ppo_model")
+
+if __name__ == '__main__':
+    logger.info(f"Python path: {sys.executable}")
+    logger.info(f"Python version: {sys.version}")
+
+    logger.info("Start training")
+    training()
+    logger.info("End training")

From f9efdad5bbed84190e77c67d9dc69dc77c260876 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Tue, 24 Sep 2024 17:25:40 +0200
Subject: [PATCH 06/18] chore: Add SLURM partition_queue.sh script for querying job queue

A new script, `partition_queue.sh`, has been added to the SLURM directory.
This script allows users to query the job queue for the `epyc-gpu-test`
partition. It provides a convenient way to check the status of jobs in this
specific partition.
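
A minimal usage sketch (assuming the script is executable and called from
the SLURM directory; the partition name is hard-coded at this stage):

    # list jobs in the epyc-gpu-test partition in long format
    ./partition_queue.sh
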
This addition enhances the functionality of the SLURM scripts and improves
the user experience.
---
 SLURM/partition_queue.sh | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100755 SLURM/partition_queue.sh

diff --git a/SLURM/partition_queue.sh b/SLURM/partition_queue.sh
new file mode 100755
index 0000000..dd0a19a
--- /dev/null
+++ b/SLURM/partition_queue.sh
@@ -0,0 +1,3 @@
+#! /bin/bash
+
+squeue -p epyc-gpu-test -l

From 8a19d5a854f39f17c81f79954f1493c617a08077 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 09:10:53 +0200
Subject: [PATCH 07/18] chore: multi gpu logging

---
 SLURM/log_gpu_cpu_usage.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/SLURM/log_gpu_cpu_usage.py b/SLURM/log_gpu_cpu_usage.py
index 22f4845..690ba0b 100644
--- a/SLURM/log_gpu_cpu_usage.py
+++ b/SLURM/log_gpu_cpu_usage.py
@@ -23,19 +23,28 @@ def _on_step(self) -> bool:
         """Log CPU and GPU usage at each step."""
         cpu_usage = psutil.cpu_percent()
         gpus = GPUtil.getGPUs()
-        gpu_usage = gpus[0].load * 100 if gpus else 0  # Assuming using the first GPU
+        gpu_usage = [gpu.load * 100 for gpu in gpus] if gpus else [0]
 
         # Log to TensorBoard
         self.logger.record('cpu_usage', cpu_usage)
-        self.logger.record('gpu_usage', gpu_usage)
+        for idx, usage in enumerate(gpu_usage):
+            self.logger.record(f'gpu_{idx}_usage', usage)
 
         return True
 
-def training():
-    n_envs = 64
-    ped_densities = [0.01, 0.02, 0.04, 0.08]
-    difficulty = 2
-
+def training(
+        n_envs: int = 64,
+        ped_densities: list[float] = None,
+        difficulty: int = 2
+    ):
+    """Train a robot in robot_sf.
+    Args:
+        n_envs: Number of environments to run in parallel.
+        ped_densities: List of pedestrian densities to use.
+        difficulty: Difficulty of the simulation.
+    """
+    if ped_densities is None:
+        ped_densities = [0.01, 0.02, 0.04, 0.08]
     def make_env():
         config = EnvSettings()

From b0915d7b8b4792454c83182593e544243c54ab90 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 09:11:39 +0200
Subject: [PATCH 08/18] chore: Update SLURM partition_queue.sh script to allow overriding the partition name

The SLURM partition_queue.sh script has been updated to allow overriding
the default partition name via a command line argument. This enhancement
provides flexibility for users to query job queues in different partitions
without modifying the script. It improves the usability of the script and
enhances the user experience.
---
 SLURM/partition_queue.sh | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/SLURM/partition_queue.sh b/SLURM/partition_queue.sh
index dd0a19a..957318f 100755
--- a/SLURM/partition_queue.sh
+++ b/SLURM/partition_queue.sh
@@ -1,3 +1,27 @@
 #! /bin/bash
 
-squeue -p epyc-gpu-test -l
+# Script to query jobs in a specified SLURM partition
+
+# Default partition name
+PARTITION="epyc-gpu-test"
+
+# Allow overriding the partition name via command line argument
+if [ $# -eq 1 ]; then
+    PARTITION="$1"
+fi
+
+# Check if squeue command exists
+if ! command -v squeue &> /dev/null; then
+    echo "Error: squeue command not found. Is SLURM installed?"
+    exit 1
+fi
+
+# Run squeue command and capture its exit status
+squeue -p "$PARTITION" -l
+exit_status=$?
+
+# Check if squeue command was successful
+if [ $exit_status -ne 0 ]; then
+    echo "Error: squeue command failed with exit status $exit_status"
+    exit $exit_status
+fi

From e6923540a9d61a6adbaae7555e0271f921708f09 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 09:11:51 +0200
Subject: [PATCH 09/18] chore: Add instructions for viewing CPU and GPU usage results

---
 SLURM/readme.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/SLURM/readme.md b/SLURM/readme.md
index bf96f5c..5d47eb6 100644
--- a/SLURM/readme.md
+++ b/SLURM/readme.md
@@ -30,3 +30,16 @@ Run the script:
 sbatch slurm_train.sl
 ```
 
+To view the CPU and GPU usage results, follow these steps:
+
+1. The tracking results are saved in a log file. The exact location of the log file will depend on the configuration in your `slurm_train.sl` and `log_gpu_cpu_usage.py` script. Please refer to the script to find the specific directory or file name where the results are stored.
+
+2. The results should be saved in a tensorboard log file. You can view the results by running the following command:
+
+```bash
+tensorboard --logdir=
+```
+
+Make sure to adjust the instructions based on your specific setup and requirements.
+
+

From c79bb3f52f6e3ae6a2baeab1d5daae24217589d9 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 10:03:43 +0200
Subject: [PATCH 10/18] chore: Add script to set email as environment variable

A new script, `set_email_env_variable.sh`, has been added to the SLURM
directory. This script sets the email as an environment variable in the
`.bashrc` file if it is not already set. It provides a convenient way for
users to ensure that the SLURM_EMAIL variable is properly configured. This
addition improves the usability of the SLURM scripts and enhances the user
experience.
---
 SLURM/set_email_env_variable.sh | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100755 SLURM/set_email_env_variable.sh

diff --git a/SLURM/set_email_env_variable.sh b/SLURM/set_email_env_variable.sh
new file mode 100755
index 0000000..6b29f27
--- /dev/null
+++ b/SLURM/set_email_env_variable.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Set the email as an environment variable in .bashrc if not already set
+
+# Check for email argument
+if [ -z "$1" ]; then
+    echo "Usage: $0 "
+    exit 1
+fi
+
+# Check if SLURM_EMAIL is already set in the environment
+if [ -z "$SLURM_EMAIL" ]; then
+    # Check if SLURM_EMAIL is already in .bashrc
+    if ! grep -q "export SLURM_EMAIL" ~/.bashrc; then
+        # Add SLURM_EMAIL to .bashrc
+        echo "export SLURM_EMAIL=$1" >> ~/.bashrc
+        echo "SLURM_EMAIL has been added to ~/.bashrc"
+        source ~/.bashrc
+        echo "We also sourced ~/.bashrc"
+    else
+        echo "SLURM_EMAIL is already in ~/.bashrc"
+    fi
+
+    # Set SLURM_EMAIL for the current session
+    export SLURM_EMAIL="$1"
+    echo "SLURM_EMAIL has been set to $SLURM_EMAIL for the current session"
+else
+    echo "SLURM_EMAIL is already set to $SLURM_EMAIL"
+fi

From 13049cdc3e62492886dd93a3d562ed3a1c54a0d0 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 10:07:23 +0200
Subject: [PATCH 11/18] chore: Add SLURM_EMAIL check and notification to slurm_train.sl

The `slurm_train.sl` script has been updated to include a check for the
`SLURM_EMAIL` environment variable. If the variable is not set, a message
is displayed prompting the user to set it before running the script.
Additionally, email notification options have been added to the script
using the `SBATCH --mail-user` and `SBATCH --mail-type` directives.
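
Note that sbatch parses `#SBATCH` directives only up to the first
executable command, so a hedged alternative sketch (assuming SLURM_EMAIL is
exported in the submitting shell) is to pass the mail options at submission
time instead:

    # enable mail notifications for this particular submission
    sbatch --mail-user="$SLURM_EMAIL" --mail-type=END,FAIL slurm_train.sl
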
This change improves the usability of the script by ensuring that the user
is notified of job completion or failure via email.
---
 SLURM/slurm_train.sl | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/SLURM/slurm_train.sl b/SLURM/slurm_train.sl
index 1d54b99..e9e59f9 100644
--- a/SLURM/slurm_train.sl
+++ b/SLURM/slurm_train.sl
@@ -10,7 +10,17 @@
 #SBATCH --cpus-per-task=64
 # Request GPU Ressources (model:number)
 #SBATCH --gpus=a100:1
-
+
+# Check if SLURM_EMAIL is set
+if [ -z "$SLURM_EMAIL" ]; then
+    echo "SLURM_EMAIL is not set. Please set it before running the script."
+else
+    # Add email notification
+    #SBATCH --mail-user=$SLURM_EMAIL
+    #SBATCH --mail-type=END,FAIL
+    echo "SLURM_EMAIL is set to $SLURM_EMAIL"
+fi
+
 # Clear all interactively loaded modules
 module purge

From 835ac18d8d5682768ed94e26e8a507ec51d8c4e8 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 10:17:11 +0200
Subject: [PATCH 12/18] chore: Update SLURM training script to include experiment description and set OMP_NUM_THREADS=1

---
 SLURM/slurm_train.sl | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/SLURM/slurm_train.sl b/SLURM/slurm_train.sl
index e9e59f9..957b478 100644
--- a/SLURM/slurm_train.sl
+++ b/SLURM/slurm_train.sl
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-
+
 #SBATCH --job-name=robot-sf
 #SBATCH --partition=epyc-gpu-test
 #SBATCH --time=2:00:00
@@ -21,6 +21,13 @@ else
     echo "SLURM_EMAIL is set to $SLURM_EMAIL"
 fi
+
+# # echo date and time
+echo "Starting script at $(date)"
+
+# # Create experiment description
+echo "Run experiment with OMP_NUM_THREADS=1 because multithreading is in sb3"
+
 # Clear all interactively loaded modules
 module purge
@@ -35,6 +42,10 @@ conda activate conda_env
 # if you are adding your own level of parallelzation, you
 # probably want to set OMP_NUM_THREADS=1 instead, in order
 # to prevent the creation of too many threads (massive slowdown!)
+export OMP_NUM_THREADS=1
 
 # No need to pass number of tasks to srun
 srun python3 log_gpu_cpu_usage.py
+
+# echo date and time
+# echo "Ending script at $(date)"

From 7d3b4236555d55a5128a7ac8ea18d09e8a8f7d60 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 11:57:28 +0200
Subject: [PATCH 13/18] chore: Add SLURM partition_queue_start_time.sh script for querying job queue start time

---
 SLURM/partition_queue_start_time.sh | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100755 SLURM/partition_queue_start_time.sh

diff --git a/SLURM/partition_queue_start_time.sh b/SLURM/partition_queue_start_time.sh
new file mode 100755
index 0000000..d6382d8
--- /dev/null
+++ b/SLURM/partition_queue_start_time.sh
@@ -0,0 +1,27 @@
+#! /bin/bash
+
+# Script to query jobs in a specified SLURM partition
+
+# Default partition name
+PARTITION="epyc-gpu-test"
+
+# Allow overriding the partition name via command line argument
+if [ $# -eq 1 ]; then
+    PARTITION="$1"
+fi
+
+# Check if squeue command exists
+if ! command -v squeue &> /dev/null; then
+    echo "Error: squeue command not found. Is SLURM installed?"
+    exit 1
+fi
+
+# Run squeue command and capture its exit status
+squeue -p "$PARTITION" --start
+exit_status=$?
+
+# Check if squeue command was successful
+if [ $exit_status -ne 0 ]; then
+    echo "Error: squeue command failed with exit status $exit_status"
+    exit $exit_status
+fi

From ac982abeeba806442c2efaff14be8587dea9b699 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 11:59:06 +0200
Subject: [PATCH 14/18] chore: Update SLURM partition_queue_start_time.sh script to include user filter for querying job queue start time

---
 SLURM/partition_queue_start_time.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SLURM/partition_queue_start_time.sh b/SLURM/partition_queue_start_time.sh
index d6382d8..87d0d19 100755
--- a/SLURM/partition_queue_start_time.sh
+++ b/SLURM/partition_queue_start_time.sh
@@ -17,7 +17,7 @@ if ! command -v squeue &> /dev/null; then
 fi
 
 # Run squeue command and capture its exit status
-squeue -p "$PARTITION" --start
+squeue -p "$PARTITION" --start -u $USER
 exit_status=$?
 
 # Check if squeue command was successful

From 53ae653f92f43857eb6d21be6f09ddd1f04ba407 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 13:27:45 +0200
Subject: [PATCH 15/18] chore: Update SLURM partition_queue_start_time.sh script to include user filter for querying job queue start time

---
 SLURM/partition_queue_start_time.sh | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/SLURM/partition_queue_start_time.sh b/SLURM/partition_queue_start_time.sh
index 87d0d19..ce63eb0 100755
--- a/SLURM/partition_queue_start_time.sh
+++ b/SLURM/partition_queue_start_time.sh
@@ -2,6 +2,10 @@
 
 # Script to query jobs in a specified SLURM partition
 
+# current date and time
+
+echo "Partition queue start time at $(date)"
+
 # Default partition name
 PARTITION="epyc-gpu-test"
 
@@ -17,7 +21,7 @@ if ! command -v squeue &> /dev/null; then
 fi
 
 # Run squeue command and capture its exit status
-squeue -p "$PARTITION" --start -u $USER
+output=$(squeue -p "$PARTITION" --start -u $USER 2>&1)
 exit_status=$?
 
 # Check if squeue command was successful
@@ -25,3 +29,9 @@ if [ $exit_status -ne 0 ]; then
     echo "Error: squeue command failed with exit status $exit_status"
     exit $exit_status
 fi
+# Display the output
+echo "$output"
+
+if $VERBOSE; then
+    echo "Query completed successfully."
+fi

From cb140774ce27bf1d58b01fe878af1cb21a269e76 Mon Sep 17 00:00:00 2001
From: Lennart Luttkus
Date: Wed, 25 Sep 2024 13:27:51 +0200
Subject: [PATCH 16/18] chore: Update SLURM partition_queue.sh script to include date and time in output

---
 SLURM/partition_queue.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/SLURM/partition_queue.sh b/SLURM/partition_queue.sh
index 957318f..8e16568 100755
--- a/SLURM/partition_queue.sh
+++ b/SLURM/partition_queue.sh
@@ -2,6 +2,9 @@
 
 # Script to query jobs in a specified SLURM partition
 
+# date and time
+echo "Partition queue at $(date)"
+
 # Default partition name
 PARTITION="epyc-gpu-test"

From c2dcd888464e29c9e4856dbc1631bf1b38286e5d Mon Sep 17 00:00:00 2001
From: ll7
Date: Thu, 26 Sep 2024 10:19:45 +0200
Subject: [PATCH 17/18] chore: Add ms-toolsai.tensorboard extension to VS Code

---
 .vscode/extensions.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.vscode/extensions.json b/.vscode/extensions.json
index 51a6fc7..1f9d7b0 100644
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -3,6 +3,7 @@
         "ms-python.python",
         "ms-python.pylint",
-        "github.vscode-pull-request-github"
+        "github.vscode-pull-request-github",
+        "ms-toolsai.tensorboard"
     ]
 }
\ No newline at end of file

From ac407b728a3733ae1b7f64de9dcb3271b49e9922 Mon Sep 17 00:00:00 2001
From: ll7
Date: Thu, 26 Sep 2024 10:20:09 +0200
Subject: [PATCH 18/18] chore: Update SLURM log_gpu_cpu_usage.py to log GPU memory utilization

This commit updates the `log_gpu_cpu_usage.py` script in the SLURM
directory to include logging of GPU memory utilization in addition to CPU
and GPU usage. The `LogResourceUsageCallback` class now records the memory
utilization of each GPU in the `gpu_{idx}_memory_util` field. This
enhancement provides more comprehensive monitoring of resource usage during
training.
---
 SLURM/log_gpu_cpu_usage.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 SLURM/log_gpu_cpu_usage.py

diff --git a/SLURM/log_gpu_cpu_usage.py b/SLURM/log_gpu_cpu_usage.py
old mode 100644
new mode 100755
index 690ba0b..08d5c2a
--- a/SLURM/log_gpu_cpu_usage.py
+++ b/SLURM/log_gpu_cpu_usage.py
@@ -3,6 +3,7 @@
 import sys
 import psutil
 import GPUtil
+import os
 from loguru import logger
 from stable_baselines3 import PPO
 from stable_baselines3.common.env_util import make_vec_env
@@ -16,24 +17,23 @@ class LogResourceUsageCallback(BaseCallback):
     """Custom callback to log CPU and GPU usage to TensorBoard."""
 
-    def __init__(self, verbose=0):
-        super(LogResourceUsageCallback, self).__init__(verbose)
-
     def _on_step(self) -> bool:
-        """Log CPU and GPU usage at each step."""
+        """Log CPU and GPU usage and memory utilization at each step."""
         cpu_usage = psutil.cpu_percent()
         gpus = GPUtil.getGPUs()
         gpu_usage = [gpu.load * 100 for gpu in gpus] if gpus else [0]
+        gpu_memory_util = [gpu.memoryUtil * 100 for gpu in gpus] if gpus else [0]
 
         # Log to TensorBoard
         self.logger.record('cpu_usage', cpu_usage)
-        for idx, usage in enumerate(gpu_usage):
+        for idx, (usage, mem_util) in enumerate(zip(gpu_usage, gpu_memory_util)):
             self.logger.record(f'gpu_{idx}_usage', usage)
+            self.logger.record(f'gpu_{idx}_memory_util', mem_util)
 
         return True
 
 def training(
-        n_envs: int = 64,
+        n_envs: int = os.cpu_count(),
         ped_densities: list[float] = None,
         difficulty: int = 2
     ):
@@ -43,6 +43,7 @@ def training(
         n_envs: Number of environments to run in parallel.
         ped_densities: List of pedestrian densities to use.
         difficulty: Difficulty of the simulation.
""" + logger.info(f"Number of CPUs: {n_envs}") if ped_densities is None: ped_densities = [0.01, 0.02, 0.04, 0.08] def make_env():