Merge pull request #54 from ll7/ll7/issue48-Retrain-the-robot-with-a-reward-that-penalizes-quick-action-changes

Ll7/issue48 retrain the robot with a reward that penalizes quick action changes
ll7 authored Sep 23, 2024
2 parents 85daad9 + f4f0ce0 commit fa11313
Showing 15 changed files with 247 additions and 33 deletions.
1 change: 1 addition & 0 deletions .cursorrules
@@ -0,0 +1 @@
You are an AI assistant specialized in Python development. Your approach emphasizes: Clear project structure with separate directories for source code, tests, docs, and config. Modular design with distinct files for models, services, controllers, and utilities. Configuration management using environment variables. Robust error handling and logging, including context capture. Comprehensive testing with pytest. Detailed documentation using docstrings and README files. Dependency management via requirements.txt and virtual environments. CI/CD implementation with GitHub Actions. AI-friendly coding practices: You provide code snippets and explanations tailored to these principles, optimizing for clarity and AI-assisted development. Follow the following rules: For any python file, be sure to ALWAYS add typing annotations to each function or class. Be sure to include return types when necessary. Add descriptive docstrings to all python functions and classes as well. Please use pep257 convention. Update existing docstrings if need be. Make sure you keep any comments that exist in a file. When writing tests, make sure that you ONLY use pytest or pytest plugins, do NOT use the unittest module. All tests should have typing annotations as well. All tests should be in ./tests. Be sure to create all necessary files and folders. If you are creating files inside of ./tests, be sure to make an __init__.py file if one does not exist. All tests should be fully annotated and should contain docstrings.
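For illustration, a test module following these conventions (typing annotations, PEP 257 docstrings, pytest only, placed under ./tests) might look like the sketch below; the file name and functions are hypothetical and not part of this commit.

```python
"""Hypothetical ./tests/test_example.py illustrating the conventions above."""


def add(a: int, b: int) -> int:
    """Return the sum of two integers."""
    return a + b


def test_add_returns_sum() -> None:
    """add() should return the arithmetic sum of its arguments."""
    assert add(2, 3) == 5
```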
3 changes: 2 additions & 1 deletion .gitignore
@@ -11,7 +11,8 @@ build
file.log
images
logs
model
# model
SLURM/model
profile.json
profiles
pysf_tests
1 change: 1 addition & 0 deletions SLURM/.gitignore
@@ -0,0 +1 @@
*.out
8 changes: 8 additions & 0 deletions SLURM/load_module.sh
@@ -0,0 +1,8 @@
#! /bin/bash

module purge

module load anaconda cuda

conda activate conda_env

5 changes: 5 additions & 0 deletions SLURM/readme.md
@@ -0,0 +1,5 @@
# SLURM

```bash
sbatch slurm_train.sl
```
65 changes: 65 additions & 0 deletions SLURM/slurm_PPO_robot_sf.py
@@ -0,0 +1,65 @@
"""train a robot in robot_sf on a slurm server"""

import sys

from loguru import logger

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList

from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.env_config import EnvSettings
from robot_sf.feature_extractor import DynamicsExtractor
from robot_sf.tb_logging import DrivingMetricsCallback


def training():
    n_envs = 64
    ped_densities = [0.01, 0.02, 0.04, 0.08]
    difficulty = 2


    def make_env():
        config = EnvSettings()
        config.sim_config.ped_density_by_difficulty = ped_densities
        config.sim_config.difficulty = difficulty
        return RobotEnv(config)

    env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)

    policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
    model = PPO(
        "MultiInputPolicy",
        env,
        tensorboard_log="./logs/ppo_logs/",
        policy_kwargs=policy_kwargs
    )
    save_model_callback = CheckpointCallback(
        500_000 // n_envs,
        "./model/backup",
        "ppo_model"
    )
    collect_metrics_callback = DrivingMetricsCallback(n_envs)
    combined_callback = CallbackList(
        [save_model_callback, collect_metrics_callback]
    )

    logger.info("start learning")
    model.learn(
        total_timesteps=10_000_000,
        progress_bar=True,
        callback=combined_callback
    )
    logger.info("save model")
    model.save("./model/ppo_model")


if __name__ == '__main__':
    logger.info(f"python path: {sys.executable}")
    logger.info(f"python version: {sys.version}")

    logger.info("start training")
    training()
    logger.info("end training")
30 changes: 30 additions & 0 deletions SLURM/slurm_train.sl
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

#SBATCH --job-name=robot-sf
#SBATCH --partition=epyc-gpu
#SBATCH --time=10:00:00

# Request memory per CPU
#SBATCH --mem-per-cpu=2G
# Request n CPUs for your task.
#SBATCH --cpus-per-task=64
# Request GPU Resources (model:number)
#SBATCH --gpus=a100:1

# Clear all interactively loaded modules
module purge

# Load a python package manager
module load cuda anaconda # or micromamba or condaforge

# Activate a certain environment
conda activate conda_env

# set number of OpenMP threads (i.e. for numpy, etc...)
# export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
# if you are adding your own level of parallelization, you
# probably want to set OMP_NUM_THREADS=1 instead, in order
# to prevent the creation of too many threads (massive slowdown!)

# No need to pass number of tasks to srun
srun python3 slurm_PPO_robot_sf.py
3 changes: 3 additions & 0 deletions SLURM/user_queue.sh
@@ -0,0 +1,3 @@
#! /bin/bash

squeue -u $USER
4 changes: 4 additions & 0 deletions git_submodule.sh
@@ -0,0 +1,4 @@
#!/bin/bash
# if you forgot to clone recursively, run this script

git submodule update --init --recursive
Binary file added model/ppo_model_retrained_10m_2024-09-17.zip
48 changes: 48 additions & 0 deletions robot_sf/gym_env/reward.py
@@ -1,3 +1,9 @@
"""
This module defines the reward function for the robot environment.
"""

import numpy as np

def simple_reward(
    meta: dict,
    max_episode_step_discount: float=-0.1,
@@ -34,3 +40,45 @@ def simple_reward(
    reward += reach_waypoint_reward

    return reward

def punish_action_reward(
    meta: dict,
    max_episode_step_discount: float=-0.1,
    ped_coll_penalty: float=-5,
    obst_coll_penalty: float=-2,
    reach_waypoint_reward: float=1,
    punish_action: bool=True,
    punish_action_penalty: float=-0.1
) -> float:
    """
    Calculate the reward for the robot's current state, with an additional
    penalty for changing the action between consecutive steps.

    Parameters:
        meta (dict): Metadata containing information about the robot's current state.
        max_episode_step_discount (float): Discount factor for each step in the episode.
        ped_coll_penalty (float): Penalty for colliding with a pedestrian.
        obst_coll_penalty (float): Penalty for colliding with an obstacle.
        reach_waypoint_reward (float): Reward for reaching a waypoint.
        punish_action (bool): Whether to penalize changes relative to the last action.
        punish_action_penalty (float): Penalty scale applied per unit of action change.

    Returns:
        float: The calculated reward.
    """

    # Start from the base reward (step discount, collision penalties, waypoint reward)
    reward = simple_reward(
        meta,
        max_episode_step_discount,
        ped_coll_penalty,
        obst_coll_penalty,
        reach_waypoint_reward
    )

    # Penalize the robot for taking a different action than the last action
    if punish_action and meta["last_action"] is not None:
        action_diff = np.linalg.norm(np.array(meta["action"]) - np.array(meta["last_action"]))
        if action_diff > 0:
            reward += punish_action_penalty * action_diff

    return reward
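The new term scales with the Euclidean distance between the current and the previous action vector, so abrupt changes are penalized proportionally more than smooth ones. A minimal numeric sketch of just that term (the action values are made up):

```python
import numpy as np

punish_action_penalty = -0.1

last_action = np.array([0.5, 0.0])  # e.g. previous (linear, angular) command
action = np.array([0.5, 0.4])       # current command with an abrupt turn

action_diff = np.linalg.norm(action - last_action)  # 0.4
penalty = punish_action_penalty * action_diff       # -0.04 added on top of simple_reward
print(penalty)
```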
49 changes: 20 additions & 29 deletions robot_sf/gym_env/robot_env.py
@@ -141,41 +141,53 @@ def step(self, action):
"""
# Process the action through the simulator
action = self.simulator.robots[0].parse_action(action)
self.last_action = action
# Perform simulation step
self.simulator.step_once([action])
# Get updated observation
obs = self.state.step()
# Fetch metadata about the current state
meta = self.state.meta_dict()
reward_dict = self.state.meta_dict()
# add the action space to dict
reward_dict["action_space"] = self.action_space
# add action to dict
reward_dict["action"] = action
# Add last_action to reward_dict
reward_dict["last_action"] = self.last_action
# Determine if the episode has reached terminal state
term = self.state.is_terminal
# Compute the reward using the provided reward function
reward = self.reward_func(meta)
reward = self.reward_func(reward_dict)
# Update last_action for next step
self.last_action = action

# if recording is enabled, record the state
if self.recording_enabled:
self.record()

return obs, reward, term, {"step": meta["step"], "meta": meta}
# observation, reward, terminal, truncated,info
return obs, reward, term, False, {"step": reward_dict["step"], "meta": reward_dict}

    def reset(self):
    def reset(self, seed=None, options=None):
        """
        Reset the environment state to start a new episode.
        Returns:
        - obs: The initial observation after resetting the environment.
        """
        super().reset(seed=seed, options=options)
        # Reset last_action
        self.last_action = None
        # Reset internal simulator state
        self.simulator.reset_state()
        # Reset the environment's state and return the initial observation
        obs = self.state.reset()

        # if recording is enabled, save the recording and reset the state list
        if self.recording_enabled:
            self.save_recording()

        return obs

        # info is necessary for the gym environment, but useless at the moment
        info = {"info": "test"}
        return obs, info

    def _prepare_visualizable_state(self):
        # Prepare action visualization, if any action was executed
@@ -263,27 +275,6 @@ def save_recording(self, filename: str = None):
logger.info("Reset state list")
self.recorded_states = []

def seed(self, seed=None):
"""
Set the seed for this env's random number generator(s).
Note:
Some environments use multiple pseudorandom number generators.
We want to capture all such seeds used in order to ensure that
there aren't accidental correlations between multiple generators.
Returns:
list<bigint>: Returns the list of seeds used in this env's random
number generators. The first value in the list should be the
"main" seed, or the value which a reproducer should pass to
'seed'. Often, the main seed equals the provided 'seed', but
this won't be true if seed=None, for example.
TODO: validate this method
"""
self.np_random, seed = seeding.np_random(seed)
return [seed]

def exit(self):
"""
Clean up and exit the simulation UI, if it exists.
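With reset() now returning (obs, info) and step() returning the Gymnasium-style 5-tuple, a typical interaction loop against the updated environment looks roughly like the sketch below. The episode length and the use of a default config are illustrative; the tests suggest RobotEnv can be constructed without an explicit config.

```python
from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.reward import punish_action_reward

env = RobotEnv(reward_func=punish_action_reward)  # default EnvSettings assumed

obs, info = env.reset()  # Gymnasium-style reset returns (observation, info)
for _ in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.exit()
```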
56 changes: 56 additions & 0 deletions scripts/PPO_training/train_ppo_punish_action.py
@@ -0,0 +1,56 @@
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList

from robot_sf.gym_env.robot_env import RobotEnv
from robot_sf.gym_env.env_config import EnvSettings
from robot_sf.feature_extractor import DynamicsExtractor
from robot_sf.tb_logging import DrivingMetricsCallback
from robot_sf.gym_env.reward import punish_action_reward


def training():
    n_envs = 32
    ped_densities = [0.01, 0.02, 0.04, 0.08]
    difficulty = 2


    def make_env():
        config = EnvSettings()
        config.sim_config.ped_density_by_difficulty = ped_densities
        config.sim_config.difficulty = difficulty
        return RobotEnv(
            config,
            reward_func=punish_action_reward
        )

    env = make_vec_env(make_env, n_envs=n_envs, vec_env_cls=SubprocVecEnv)

    policy_kwargs = dict(features_extractor_class=DynamicsExtractor)
    model = PPO(
        "MultiInputPolicy",
        env,
        tensorboard_log="./logs/ppo_logs/",
        policy_kwargs=policy_kwargs
    )
    save_model_callback = CheckpointCallback(
        500_000 // n_envs,
        "./model/backup",
        "ppo_model"
    )
    collect_metrics_callback = DrivingMetricsCallback(n_envs)
    combined_callback = CallbackList(
        [save_model_callback, collect_metrics_callback]
    )

    model.learn(
        total_timesteps=10_000_000,
        progress_bar=True,
        callback=combined_callback
    )
    model.save("./model/ppo_model")


if __name__ == '__main__':
    training()
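The retrained checkpoint added in this commit (model/ppo_model_retrained_10m_2024-09-17.zip) can then be loaded for inference the same way tests/sb3_test.py loads its snapshot; a minimal sketch:

```python
from stable_baselines3 import PPO

from robot_sf.gym_env.robot_env import RobotEnv

env = RobotEnv()
model = PPO.load("./model/ppo_model_retrained_10m_2024-09-17", env=env)

obs, info = env.reset()
action, _ = model.predict(obs, deterministic=True)
assert action.shape == env.action_space.shape
```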
5 changes: 3 additions & 2 deletions tests/env_test.py
@@ -13,7 +13,7 @@ def test_can_return_valid_observation():
    drive_state_spec: spaces.Box = env.observation_space[OBS_DRIVE_STATE]
    lidar_state_spec: spaces.Box = env.observation_space[OBS_RAYS]

    obs = env.reset()
    obs, info = env.reset()

    assert isinstance(obs, dict)
    assert OBS_DRIVE_STATE in obs and OBS_RAYS in obs
@@ -27,6 +27,7 @@ def test_can_simulate_with_pedestrians():
    env.reset()
    for _ in range(total_steps):
        rand_action = env.action_space.sample()
        _, _, done, _ = env.step(rand_action)
        _, _, terminated, truncated, _ = env.step(rand_action)
        done = terminated or truncated
        if done:
            env.reset()
2 changes: 1 addition & 1 deletion tests/sb3_test.py
@@ -23,7 +23,7 @@ def test_can_load_model_snapshot():

    inf_env = RobotEnv()
    model2 = PPO.load(MODEL_PATH, env=inf_env)
    obs = inf_env.reset()
    obs, info = inf_env.reset()
    action, _ = model2.predict(obs, deterministic=True)
    assert action.shape == inf_env.action_space.shape

