Merge pull request #54 from ll7/ll7/issue48-Retrain-the-robot-with-a-reward-that-penalizes-quick-action-changes

Ll7/issue48 retrain the robot with a reward that penalizes quick action changes
ll7 authored Sep 23, 2024
2 parents 85daad9 + f4f0ce0 commit fa11313
Showing 15 changed files with 247 additions and 33 deletions.
1 change: 1 addition & 0 deletions .cursorrules
@@ -0,0 +1 @@
You are an AI assistant specialized in Python development. Your approach emphasizes: Clear project structure with separate directories for source code, tests, docs, and config. Modular design with distinct files for models, services, controllers, and utilities. Configuration management using environment variables. Robust error handling and logging, including context capture. Comprehensive testing with pytest. Detailed documentation using docstrings and README files. Dependency management via requirements.txt and virtual environments. CI/CD implementation with GitHub Actions. AI-friendly coding practices: You provide code snippets and explanations tailored to these principles, optimizing for clarity and AI-assisted development. Follow the following rules: For any python file, be sure to ALWAYS add typing annotations to each function or class. Be sure to include return types when necessary. Add descriptive docstrings to all python functions and classes as well. Please use pep257 convention. Update existing docstrings if need be. Make sure you keep any comments that exist in a file. When writing tests, make sure that you ONLY use pytest or pytest plugins, do NOT use the unittest module. All tests should have typing annotations as well. All tests should be in ./tests. Be sure to create all necessary files and folders. If you are creating files inside of ./tests, be sure to make an __init__.py file if one does not exist. All tests should be fully annotated and should contain docstrings.
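For illustration, a test module following these conventions (typing annotations, PEP 257 docstrings, pytest only, placed under ./tests) might look like the sketch below; the file name and functions are hypothetical and not part of this commit.

```python
"""Hypothetical ./tests/test_example.py illustrating the conventions above."""


def add(a: int, b: int) -> int:
    """Return the sum of two integers."""
    return a + b


def test_add_returns_sum() -> None:
    """add() should return the arithmetic sum of its arguments."""
    assert add(2, 3) == 5
```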
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,7 +11,8 @@ build
file.log
images
logs
model
# model
SLURM/model
profile.json
profiles
pysf_tests
1 change: 1 addition & 0 deletions SLURM/.gitignore
@@ -0,0 +1 @@
*.out
8 changes: 8 additions & 0 deletions SLURM/load_module.sh
@@ -0,0 +1,8 @@
#! /bin/bash

module purge

module load anaconda cuda

conda activate conda_env

5 changes: 5 additions & 0 deletions SLURM/readme.md
@@ -0,0 +1,5 @@
# SLURM

```bash
sbatch slurm_train.sl
```
65 changes: 65 additions & 0 deletions SLURM/slurm_PPO_robot_sf.py
@@ -0,0 +1,65 @@
"""train a robot in robot_sf on a slurm server"""

import sys

from loguru import logger

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList

from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.env_config import EnvSettings
from robot_sf.feature_extractor import DynamicsExtractor
from robot_sf.tb_logging import DrivingMetricsCallback


def training():
    n_envs = 64
    ped_densities = [0.01, 0.02, 0.04, 0.08]
    difficulty = 2


    def make_env():
        config = EnvSettings()
        config.sim_config.ped_density_by_difficulty = ped_densities
        config.sim_config.difficulty = difficulty
        return RobotEnv(config)

    env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)

    policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
    model = PPO(
        "MultiInputPolicy",
        env,
        tensorboard_log="./logs/ppo_logs/",
        policy_kwargs=policy_kwargs
    )
    save_model_callback = CheckpointCallback(
        500_000 // n_envs,
        "./model/backup",
        "ppo_model"
    )
    collect_metrics_callback = DrivingMetricsCallback(n_envs)
    combined_callback = CallbackList(
        [save_model_callback, collect_metrics_callback]
    )

    logger.info("start learning")
    model.learn(
        total_timesteps=10_000_000,
        progress_bar=True,
        callback=combined_callback
    )
    logger.info("save model")
    model.save("./model/ppo_model")


if __name__ == '__main__':
    logger.info(f"python path: {sys.executable}")
    logger.info(f"python version: {sys.version}")

    logger.info("start training")
    training()
    logger.info("end training")
30 changes: 30 additions & 0 deletions SLURM/slurm_train.sl
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

#SBATCH --job-name=robot-sf
#SBATCH --partition=epyc-gpu
#SBATCH --time=10:00:00

# Request memory per CPU
#SBATCH --mem-per-cpu=2G
# Request n CPUs for your task.
#SBATCH --cpus-per-task=64
# Request GPU Resources (model:number)
#SBATCH --gpus=a100:1

# Clear all interactively loaded modules
module purge

# Load a python package manager
module load cuda anaconda # or micromamba or condaforge

# Activate a certain environment
conda activate conda_env

# set number of OpenMP threads (i.e. for numpy, etc...)
# export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# if you are adding your own level of parallelization, you
# probably want to set OMP_NUM_THREADS=1 instead, in order
# to prevent the creation of too many threads (massive slowdown!)

# No need to pass number of tasks to srun
srun python3 slurm_PPO_robot_sf.py
3 changes: 3 additions & 0 deletions SLURM/user_queue.sh
@@ -0,0 +1,3 @@
#! /bin/bash

squeue -u $USER
4 changes: 4 additions & 0 deletions git_submodule.sh
@@ -0,0 +1,4 @@
#!/bin/bash
# if you forgot to clone recursively, run this script

git submodule update --init --recursive
Binary file added model/ppo_model_retrained_10m_2024-09-17.zip
48 changes: 48 additions & 0 deletions robot_sf/gym_env/reward.py
@@ -1,3 +1,9 @@
"""
This module defines the reward function for the robot environment.
"""

import numpy as np

def simple_reward(
    meta: dict,
    max_episode_step_discount: float=-0.1,
@@ -34,3 +40,45 @@ def simple_reward(
    reward += reach_waypoint_reward

    return reward

def punish_action_reward(
    meta: dict,
    max_episode_step_discount: float=-0.1,
    ped_coll_penalty: float=-5,
    obst_coll_penalty: float=-2,
    reach_waypoint_reward: float=1,
    punish_action: bool=True,
    punish_action_penalty: float=-0.1
) -> float:
    """
    Calculate the reward for the robot's current state, with an additional
    penalty for changing the action between consecutive steps.

    Parameters:
        meta (dict): Metadata containing information about the robot's current state.
        max_episode_step_discount (float): Discount factor for each step in the episode.
        ped_coll_penalty (float): Penalty for colliding with a pedestrian.
        obst_coll_penalty (float): Penalty for colliding with an obstacle.
        reach_waypoint_reward (float): Reward for reaching a waypoint.
        punish_action (bool): Whether to penalize changes relative to the last action.
        punish_action_penalty (float): Penalty scale applied per unit of action change.

    Returns:
        float: The calculated reward.
    """

    # Start from the base reward (step discount, collision penalties, waypoint reward)
    reward = simple_reward(
        meta,
        max_episode_step_discount,
        ped_coll_penalty,
        obst_coll_penalty,
        reach_waypoint_reward
    )

    # Penalize the robot for taking a different action than the last action
    if punish_action and meta["last_action"] is not None:
        action_diff = np.linalg.norm(np.array(meta["action"]) - np.array(meta["last_action"]))
        if action_diff > 0:
            reward += punish_action_penalty * action_diff

    return reward
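The new term scales with the Euclidean distance between the current and the previous action vector, so abrupt changes are penalized proportionally more than smooth ones. A minimal numeric sketch of just that term (the action values are made up):

```python
import numpy as np

punish_action_penalty = -0.1

last_action = np.array([0.5, 0.0])  # e.g. previous (linear, angular) command
action = np.array([0.5, 0.4])       # current command with an abrupt turn

action_diff = np.linalg.norm(action - last_action)  # 0.4
penalty = punish_action_penalty * action_diff       # -0.04 added on top of simple_reward
print(penalty)
```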
49 changes: 20 additions & 29 deletions robot_sf/gym_env/robot_env.py
@@ -141,41 +141,53 @@ def step(self, action):
"""
# Process the action through the simulator
action = self.simulator.robots[0].parse_action(action)
self.last_action = action
# Perform simulation step
self.simulator.step_once([action])
# Get updated observation
obs = self.state.step()
# Fetch metadata about the current state
meta = self.state.meta_dict()
reward_dict = self.state.meta_dict()
# add the action space to dict
reward_dict["action_space"] = self.action_space
# add action to dict
reward_dict["action"] = action
# Add last_action to reward_dict
reward_dict["last_action"] = self.last_action
# Determine if the episode has reached terminal state
term = self.state.is_terminal
# Compute the reward using the provided reward function
reward = self.reward_func(meta)
reward = self.reward_func(reward_dict)
# Update last_action for next step
self.last_action = action

# if recording is enabled, record the state
if self.recording_enabled:
self.record()

return obs, reward, term, {"step": meta["step"], "meta": meta}
# observation, reward, terminal, truncated,info
return obs, reward, term, False, {"step": reward_dict["step"], "meta": reward_dict}

    def reset(self):
    def reset(self, seed=None, options=None):
        """
        Reset the environment state to start a new episode.
        Returns:
        - obs: The initial observation after resetting the environment.
        """
        super().reset(seed=seed, options=options)
        # Reset last_action
        self.last_action = None
        # Reset internal simulator state
        self.simulator.reset_state()
        # Reset the environment's state and return the initial observation
        obs = self.state.reset()

        # if recording is enabled, save the recording and reset the state list
        if self.recording_enabled:
            self.save_recording()

        return obs

        # info is necessary for the gym environment, but useless at the moment
        info = {"info": "test"}
        return obs, info

    def _prepare_visualizable_state(self):
        # Prepare action visualization, if any action was executed
@@ -263,27 +275,6 @@ def save_recording(self, filename: str = None):
logger.info("Reset state list")
self.recorded_states = []

def seed(self, seed=None):
"""
Set the seed for this env's random number generator(s).
Note:
Some environments use multiple pseudorandom number generators.
We want to capture all such seeds used in order to ensure that
there aren't accidental correlations between multiple generators.
Returns:
list<bigint>: Returns the list of seeds used in this env's random
number generators. The first value in the list should be the
"main" seed, or the value which a reproducer should pass to
'seed'. Often, the main seed equals the provided 'seed', but
this won't be true if seed=None, for example.
TODO: validate this method
"""
self.np_random, seed = seeding.np_random(seed)
return [seed]

def exit(self):
"""
Clean up and exit the simulation UI, if it exists.
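With reset() now returning (obs, info) and step() returning the Gymnasium-style 5-tuple, a typical interaction loop against the updated environment looks roughly like the sketch below. The episode length and the use of a default config are illustrative; the tests suggest RobotEnv can be constructed without an explicit config.

```python
from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.reward import punish_action_reward

env = RobotEnv(reward_func=punish_action_reward)  # default EnvSettings assumed

obs, info = env.reset()  # Gymnasium-style reset returns (observation, info)
for _ in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.exit()
```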
56 changes: 56 additions & 0 deletions scripts/PPO_training/train_ppo_punish_action.py
@@ -0,0 +1,56 @@
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList

from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.env_config import EnvSettings
from robot_sf.feature_extractor import DynamicsExtractor
from robot_sf.tb_logging import DrivingMetricsCallback
from robot_sf.gym_env.reward import punish_action_reward


def training():
    n_envs = 32
    ped_densities = [0.01, 0.02, 0.04, 0.08]
    difficulty = 2


    def make_env():
        config = EnvSettings()
        config.sim_config.ped_density_by_difficulty = ped_densities
        config.sim_config.difficulty = difficulty
        return RobotEnv(
            config,
            reward_func=punish_action_reward
        )

    env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)

    policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
    model = PPO(
        "MultiInputPolicy",
        env,
        tensorboard_log="./logs/ppo_logs/",
        policy_kwargs=policy_kwargs
    )
    save_model_callback = CheckpointCallback(
        500_000 // n_envs,
        "./model/backup",
        "ppo_model"
    )
    collect_metrics_callback = DrivingMetricsCallback(n_envs)
    combined_callback = CallbackList(
        [save_model_callback, collect_metrics_callback]
    )

    model.learn(
        total_timesteps=10_000_000,
        progress_bar=True,
        callback=combined_callback
    )
    model.save("./model/ppo_model")


if __name__ == '__main__':
    training()
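The retrained checkpoint added in this commit (model/ppo_model_retrained_10m_2024-09-17.zip) can then be loaded for inference the same way tests/sb3_test.py loads its snapshot; a minimal sketch:

```python
from stable_baselines3 import PPO

from robot_sf.gym_env.robot_env import RobotEnv

env = RobotEnv()
model = PPO.load("./model/ppo_model_retrained_10m_2024-09-17", env=env)

obs, info = env.reset()
action, _ = model.predict(obs, deterministic=True)
assert action.shape == env.action_space.shape
```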
5 changes: 3 additions & 2 deletions tests/env_test.py
@@ -13,7 +13,7 @@ def test_can_return_valid_observation():
    drive_state_spec: spaces.Box = env.observation_space[OBS_DRIVE_STATE]
    lidar_state_spec: spaces.Box = env.observation_space[OBS_RAYS]

    obs = env.reset()
    obs, info = env.reset()

    assert isinstance(obs, dict)
    assert OBS_DRIVE_STATE in obs and OBS_RAYS in obs
@@ -27,6 +27,7 @@ def test_can_simulate_with_pedestrians():
    env.reset()
    for _ in range(total_steps):
        rand_action = env.action_space.sample()
        _, _, done, _ = env.step(rand_action)
        _, _, terminated, truncated, _ = env.step(rand_action)
        done = terminated or truncated
        if done:
            env.reset()
2 changes: 1 addition & 1 deletion tests/sb3_test.py
@@ -23,7 +23,7 @@ def test_can_load_model_snapshot():

    inf_env = RobotEnv()
    model2 = PPO.load(MODEL_PATH, env=inf_env)
    obs = inf_env.reset()
    obs, info = inf_env.reset()
    action, _ = model2.predict(obs, deterministic=True)
    assert action.shape == inf_env.action_space.shape

