diff --git a/README.md b/README.md
index 3054bee3..c9e602d6 100644
--- a/README.md
+++ b/README.md
@@ -8,24 +8,56 @@ The CARES reinforcement learning bed used as the foundation for RL related proje
 Consult the repository [wiki](https://github.com/UoA-CARES/cares_reinforcement_learning/wiki) for a guide on how to use the package
 
 ## Installation Instructions
-`git clone` the repository
+If you want to utilise the GPU with PyTorch, install CUDA first - https://developer.nvidia.com/cuda-toolkit
 
-If you would like to leverage your machine's GPU, uncomment the optional dependencies in the `requirements.txt` before moving on.
+Install PyTorch following the instructions here - https://pytorch.org/get-started/locally/
+
+`git clone` the repository into your desired directory on your local machine
 
 Run `pip3 install -r requirements.txt` in the **root directory** of the package
 
 To make the module **globally accessible** in your working environment run `pip3 install --editable .` in the **project root**
 
 ## Running an Example
-This repository includes a script that allows you to run any OpenAI environment – provided you comply with all the dependencies for that environment. These examples make use of the package, and can provide an example on how one might use the package in their own environments.
+This repository includes a script that allows you to run any OpenAI Gymnasium (https://github.com/Farama-Foundation/Gymnasium) or DeepMind Control Suite (https://github.com/google-deepmind/dm_control) environment – provided you comply with the dependencies of that environment. These examples use the package and show how you might use it in your own environments.
+
+`example_training_loops.py` takes in hyperparameters that allow you to customise the training run environment – OpenAI Gymnasium or DMCS – and the RL algorithm. Use `python3 example_training_loops.py -h` for help on what parameters are available for customisation.
+
+An example of running TD3 on the OpenAI and DMCS environments is shown below:
+```
+python3 example_training_loops.py openai --task HalfCheetah-v4 TD3
+
 
-`example_training_loops.py` takes in hyperparameters that allow you to customise the training run – OpenAI Environment, training steps, gamma... Use `python3 example_training_loops.py -h` for help on what hyperparameters are available for customisation.
+python3 example_training_loops.py dmcs --domain ball_in_cup --task catch TD3
+```
+
+### Data Outputs
+All data from a training run is saved into `~/cares_rl_logs`. A folder is created for each training run, named `ALGORITHM-TASK-YY_MM_DD:HH:MM:SS`, e.g. `TD3-HalfCheetah-v4-23_10_11_08:47:22`. This folder contains the following directories and information saved during the training session:
 
-An example is found below:
 ```
-python3 example_training_loops.py --task 'Pendulum-v1' --algorithm PPO --max_steps_training 1000000 --seed 571 --gamma 0.99 --actor_lr 0.0001 --critic_lr 0.001
+ALGORITHM-TASK-YY_MM_DD:HH:MM:SS/
+├─ config.yml
+├─ data
+| ├─ train.csv
+| ├─ eval.csv
+├─ figures
+| ├─ eval.png
+| ├─ train.png
+├─ models
+| ├─ model.pht
+| ├─ CHECKPOINT_N.pht
+| ├─ ...
+├─ videos
+| ├─ STEP.mp4
+| ├─ ...
 ```
+### Plotting
+The plotting utility plots the data saved during training. An example of how to plot the data from one or more training sessions together is shown below. Running `python3 plotter.py -h` will provide details on the plotting parameters.
+ +``` +python3 plotter.py -s ~/cares_rl_logs -d ~/cares_rl_logs/ALGORITHM-TASK-YY_MM_DD:HH:MM:SS -w 20 +``` ## Package Structure diff --git a/cares_reinforcement_learning/util/EnvironmentFactory.py b/cares_reinforcement_learning/util/EnvironmentFactory.py new file mode 100644 index 00000000..d92a1944 --- /dev/null +++ b/cares_reinforcement_learning/util/EnvironmentFactory.py @@ -0,0 +1,180 @@ +import logging + +import cv2 + +import gym +from gym import spaces + +from dm_control import suite + +import numpy as np +from collections import deque + +# from typing import override +from functools import cached_property + +class EnvironmentFactory: + def __init__(self) -> None: + pass + + def create_environment(self, gym_environment, args): + logging.info(f"Training Environment: {gym_environment}") + if gym_environment == 'dmcs': + env = DMCSImage(args=args) if args['image_observation'] else DMCS(args=args) + elif gym_environment == "openai": + env = OpenAIGym(args=args) + else: + raise ValueError(f"Unkown environment: {gym_environment}") + return env + +class OpenAIGym: + def __init__(self, args) -> None: + logging.info(f"Training task {args['task']}") + self.env = gym.make(args["task"], render_mode="rgb_array") + self.set_seed(args['seed']) + + @cached_property + def max_action_value(self): + return self.env.action_space.high[0] + + @cached_property + def min_action_value(self): + return self.env.action_space.low[0] + + @cached_property + def observation_space(self): + return self.env.observation_space.shape[0] + + @cached_property + def action_num(self): + if type(self.env.action_space) == spaces.Box: + action_num = self.env.action_space.shape[0] + elif type(self.env.action_space) == spaces.Discrete: + action_num= self.env.action_space.n + else: + raise ValueError(f"Unhandled action space type: {type(self.env.action_space)}") + return action_num + + def set_seed(self, seed): + self.env.action_space.seed(seed) + + def reset(self): + state, _ = self.env.reset() + return state + + def step(self, action): + state, reward, done, truncated, _ = self.env.step(action) + return state, reward, done, truncated + + def grab_frame(self): + frame = self.env.render() + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert to BGR for use with OpenCV + return frame + +class OpenAIGymImage: + def __init__(self, args, k=3): + self.k = k # number of frames to be stacked + self.frames_stacked = deque([], maxlen=k) + + super().__init__(args=args) + + # @override + @property + def observation_space(self): + raise NotImplementedError("Not Implemented Yet") + + # @override + def reset(self): + _ = self.env.reset() + frame = self.env.physics.render(84, 84, camera_id=0) # --> shape= (84, 84, 3) + frame = np.moveaxis(frame, -1, 0) # --> shape= (3, 84, 84) + for _ in range(self.k): + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) # --> shape = (9, 84, 84) + return stacked_frames + + # @override + def step(self, action): + time_step = self.env.step(action) + reward, done = time_step.reward, time_step.last() + frame = self.env.physics.render(84, 84, camera_id=0) + frame = np.moveaxis(frame, -1, 0) + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) + return stacked_frames, reward, done, False # for consistency with open ai gym just add false for truncated + +class DMCS: + def __init__(self, args) -> None: + logging.info(f"Training on Domain {args['domain']}") + logging.info(f"Training with Task {args['task']}") + 
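+        # suite.load constructs the dm_control environment for the given domain/task;
+        # passing the seed through task_kwargs seeds the task's random state so runs are reproducible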
+ self.env = suite.load(args['domain'], args['task'], task_kwargs={'random': args['seed']}) + + @cached_property + def min_action_value(self): + return self.env.action_spec().minimum[0] + + @cached_property + def max_action_value(self): + return self.env.action_spec().maximum[0] + + @cached_property + def observation_space(self): + time_step = self.env.reset() + observation = np.hstack(list(time_step.observation.values())) # # e.g. position, orientation, joint_angles + return len(observation) + + @cached_property + def action_num(self): + return self.env.action_spec().shape[0] + + def set_seed(self, seed): + self.env = suite.load(self.env.domain, self.env.task, task_kwargs={'random': seed}) + + def reset(self): + time_step = self.env.reset() + observation = np.hstack(list(time_step.observation.values())) # # e.g. position, orientation, joint_angles + return observation + + def step(self, action): + time_step = self.env.step(action) + state, reward, done = np.hstack(list(time_step.observation.values())), time_step.reward, time_step.last() + return state, reward, done, False # for consistency with open ai gym just add false for truncated + + def grab_frame(self): + frame = self.env.physics.render(camera_id=0, height=240, width=300) + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert to BGR for use with OpenCV + return frame + +# TODO paramatise the observation size 3x84x84 +class DMCSImage(DMCS): + def __init__(self, args, k=3): + self.k = k # number of frames to be stacked + self.frames_stacked = deque([], maxlen=k) + + super().__init__(args=args) + + # @override + @property + def observation_space(self): + raise NotImplementedError("Not Implemented Yet") + + # @override + def reset(self): + _ = self.env.reset() + frame = self.env.physics.render(84, 84, camera_id=0) # --> shape= (84, 84, 3) + frame = np.moveaxis(frame, -1, 0) # --> shape= (3, 84, 84) + for _ in range(self.k): + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) # --> shape = (9, 84, 84) + return stacked_frames + + # @override + def step(self, action): + time_step = self.env.step(action) + reward, done = time_step.reward, time_step.last() + frame = self.env.physics.render(84, 84, camera_id=0) + frame = np.moveaxis(frame, -1, 0) + self.frames_stacked.append(frame) + stacked_frames = np.concatenate(list(self.frames_stacked), axis=0) + return stacked_frames, reward, done, False # for consistency with open ai gym just add false for truncated \ No newline at end of file diff --git a/cares_reinforcement_learning/util/Record.py b/cares_reinforcement_learning/util/Record.py index 4b06447b..bd5060c1 100644 --- a/cares_reinforcement_learning/util/Record.py +++ b/cares_reinforcement_learning/util/Record.py @@ -1,5 +1,6 @@ import os import logging +import cv2 import pandas as pd @@ -36,6 +37,18 @@ def __init__(self, glob_log_dir=None, log_dir=None, network=None, config=None) - with open(f'{self.directory}/config.yml', 'w') as outfile: yaml.dump(config, outfile, default_flow_style=False) + def start_video(self, file_name, frame): + fps = 30 + video_name = f"{self.directory}/videos/{file_name}.mp4" + height, width, channels = frame.shape + self.video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)) + + def stop_video(self): + self.video.release() + + def log_video(self, frame): + self.video.write(frame) + def log_info(self, info, display=False): self.info_data = pd.concat([self.info_data, pd.DataFrame([info])], ignore_index=True) 
self.save_data(self.info_data, "info", info, display=display) @@ -70,7 +83,7 @@ def save_data(self, data_frame, filename, logs, display=True): string = '| ' + string + ' |' if display: - print(string) + logging.info(string) def save(self): logging.info(f"Saving final outputs") @@ -97,4 +110,7 @@ def __initialise_directories(self): os.mkdir(f'{self.directory}/models') if not os.path.exists(f'{self.directory}/figures'): - os.mkdir(f'{self.directory}/figures') + os.mkdir(f'{self.directory}/figures') + + if not os.path.exists(f'{self.directory}/videos'): + os.mkdir(f'{self.directory}/videos') diff --git a/cares_reinforcement_learning/util/__init__.py b/cares_reinforcement_learning/util/__init__.py index 7af8c198..391658ee 100644 --- a/cares_reinforcement_learning/util/__init__.py +++ b/cares_reinforcement_learning/util/__init__.py @@ -1,3 +1,4 @@ from .NetworkFactory import NetworkFactory from .Record import Record +from .EnvironmentFactory import EnvironmentFactory diff --git a/cares_reinforcement_learning/util/arguement_parser.py b/cares_reinforcement_learning/util/arguement_parser.py new file mode 100644 index 00000000..69db253a --- /dev/null +++ b/cares_reinforcement_learning/util/arguement_parser.py @@ -0,0 +1,106 @@ +""" +Example of using sub-parser, sub-commands and sub-sub-commands :-) +""" + +import argparse + +def environment_args(parent_parser): + env_parser = argparse.ArgumentParser() + env_parsers = env_parser.add_subparsers(title="Environment", description="OpenAI Gym or Deep Mind Control Suite", help='choose', dest='gym_environment', required=True) + + # create the parser for the DMCS sub-command + parser_dmcs = env_parsers.add_parser('dmcs', help='Deep Mind Control Suite', parents=[parent_parser]) + required = parser_dmcs.add_argument_group('required arguments') + required.add_argument('--domain', type=str, required=True) + required.add_argument('--task', type=str, required=True) + + # create the parser for the OpenAI sub-command + parser_openai = env_parsers.add_parser('openai', help='OpenAI Gymnasium', parents=[parent_parser]) + required = parser_openai.add_argument_group('required arguments') + required.add_argument('--task', type=str, required=True) + return env_parser + +def algorithm_args(parent_parser): + alg_parser = argparse.ArgumentParser(add_help=False) + alg_parsers = alg_parser.add_subparsers(help='Select which RL algorith you want to use', dest='algorithm', required=True) + + # create the parser for TD3 with default parameters + parser_TD3 = alg_parsers.add_parser('TD3', help='TD3', parents=[parent_parser]) + parser_TD3.add_argument('--actor_lr', type=float, default=1e-4) + parser_TD3.add_argument('--critic_lr', type=float, default=1e-3) + parser_TD3.add_argument('--gamma', type=float, default=0.99) + parser_TD3.add_argument('--tau', type=float, default=0.005) + + # create the parser for DDPG with default parameters + parser_DDPG = alg_parsers.add_parser('DDPG', help='DDPG', parents=[parent_parser]) + parser_DDPG.add_argument('--actor_lr', type=float, default=1e-4) + parser_DDPG.add_argument('--critic_lr', type=float, default=1e-3) + parser_DDPG.add_argument('--gamma', type=float, default=0.99) + parser_DDPG.add_argument('--tau', type=float, default=0.005) + + # create the parser for SAC with default parameters + parser_SAC = alg_parsers.add_parser('SAC', help='SAC', parents=[parent_parser]) + parser_SAC.add_argument('--actor_lr', type=float, default=1e-4) + parser_SAC.add_argument('--critic_lr', type=float, default=1e-3) + parser_SAC.add_argument('--gamma', 
type=float, default=0.99)
+    parser_SAC.add_argument('--tau', type=float, default=0.005)
+
+    # create the parser for PPO with default parameters
+    parser_PPO = alg_parsers.add_parser('PPO', help='PPO', parents=[parent_parser])
+    parser_PPO.add_argument('--actor_lr', type=float, default=1e-4)
+    parser_PPO.add_argument('--critic_lr', type=float, default=1e-3)
+    parser_PPO.add_argument('--gamma', type=float, default=0.99)
+    parser_PPO.add_argument('--max_steps_per_batch', type=float, default=5000)
+
+    # create the parser for DQN with default parameters
+    parser_DQN = alg_parsers.add_parser('DQN', help='DQN', parents=[parent_parser])
+    parser_DQN.add_argument('--lr', type=float, default=1e-3)
+    parser_DQN.add_argument('--gamma', type=float, default=0.99)
+    parser_DQN.add_argument('--exploration_min', type=float, default=1e-3)
+    parser_DQN.add_argument('--exploration_decay', type=float, default=0.95)
+
+    # create the parser for DuelingDQN with default parameters
+    parser_DuelingDQN = alg_parsers.add_parser('DuelingDQN', help='DuelingDQN', parents=[parent_parser])
+    parser_DuelingDQN.add_argument('--lr', type=float, default=1e-3)
+    parser_DuelingDQN.add_argument('--gamma', type=float, default=0.99)
+    parser_DuelingDQN.add_argument('--exploration_min', type=float, default=1e-3)
+    parser_DuelingDQN.add_argument('--exploration_decay', type=float, default=0.95)
+
+    # create the parser for DoubleDQN with default parameters
+    parser_DoubleDQN = alg_parsers.add_parser('DoubleDQN', help='DoubleDQN', parents=[parent_parser])
+    parser_DoubleDQN.add_argument('--lr', type=float, default=1e-3)
+    parser_DoubleDQN.add_argument('--gamma', type=float, default=0.99)
+    parser_DoubleDQN.add_argument('--exploration_min', type=float, default=1e-3)
+    parser_DoubleDQN.add_argument('--exploration_decay', type=float, default=0.95)
+
+    return alg_parser
+
+def parse_args():
+    parser = argparse.ArgumentParser(add_help=False)
+
+    parser.add_argument('--number_training_iterations', type=int, default=1, help="Total number of training iterations to complete")
+
+    parser.add_argument('--memory', type=str, default="MemoryBuffer", help="Memory type - options: {MemoryBuffer, PER}")
+    parser.add_argument('--image_observation', action='store_true', help="Use an image as the observation state from the environment")
+
+    parser.add_argument('--G', type=int, default=10, help="Number of learning updates each step of training")
+    parser.add_argument('--batch_size', type=int, default=32, help="Batch size used during training")
+
+    parser.add_argument('--max_steps_exploration', type=int, default=10000, help="Total number of steps for exploration before training")
+    parser.add_argument('--max_steps_training', type=int, default=100000, help="Total number of steps to train the algorithm")
+
+    parser.add_argument('--number_steps_per_evaluation', type=int, default=10000, help="The number of steps between evaluation runs during training")
+    parser.add_argument('--number_eval_episodes', type=int, default=10, help="The number of episodes to evaluate the agent on during training")
+
+    parser.add_argument('--seed', type=int, default=571, help="The random seed to set for training")
+
+    parser.add_argument('--plot_frequency', type=int, default=100, help="How many steps between updating the running plot of the training and evaluation data during training")
+    parser.add_argument('--checkpoint_frequency', type=int, default=100, help="How many steps between saving checkpoint models of the agent during training")
+
+    parser = 
algorithm_args(parent_parser=parser) + parser = environment_args(parent_parser=parser) + + return vars(parser.parse_args()) # converts to a dictionary + +if __name__ == '__main__': + print(parse_args()) diff --git a/example/example_training_loops.py b/example/example_training_loops.py index 506df6f4..99d1c2fe 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -1,10 +1,14 @@ import time import argparse +import logging +logging.basicConfig(level=logging.INFO) from cares_reinforcement_learning.util import NetworkFactory from cares_reinforcement_learning.memory import MemoryBuffer from cares_reinforcement_learning.memory.augments import * from cares_reinforcement_learning.util import Record +from cares_reinforcement_learning.util import EnvironmentFactory +from cares_reinforcement_learning.util import arguement_parser as ap import example.policy_example as pbe import example.value_example as vbe @@ -15,83 +19,36 @@ import torch import random -import logging import numpy as np +from pathlib import Path -logging.basicConfig(level=logging.INFO) - -def set_seed(env, seed): +def set_seed(seed): torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) - env.action_space.seed(seed) - -def parse_args(): - parser = argparse.ArgumentParser() # Add an argument - - parser.add_argument('--task', type=str, required=True) - parser.add_argument('--render', type=str, default="None") - parser.add_argument('--algorithm', type=str, required=True) - parser.add_argument('--memory', type=str, default="MemoryBuffer") - - parser.add_argument('--G', type=int, default=10) - parser.add_argument('--gamma', type=float, default=0.99) - parser.add_argument('--tau', type=float, default=0.005) - parser.add_argument('--batch_size', type=int, default=32) - - parser.add_argument('--max_steps_exploration', type=int, default=10000) - parser.add_argument('--max_steps_training', type=int, default=50000) - - parser.add_argument('--number_steps_per_evaluation', type=int, default=1000) - parser.add_argument('--number_eval_episodes', type=int, default=10) - - parser.add_argument('--seed', type=int, default=571) - parser.add_argument('--evaluation_seed', type=int, default=152) - - parser.add_argument('--actor_lr', type=float, default=1e-4) - parser.add_argument('--critic_lr', type=float, default=1e-3) - - parser.add_argument('--lr', type=float, default=1e-3) - parser.add_argument('--exploration_min', type=float, default=1e-3) - parser.add_argument('--exploration_decay', type=float, default=0.95) - - parser.add_argument('--max_steps_per_batch', type=float, default=5000) - - parser.add_argument('--plot_frequency', type=int, default=100) - parser.add_argument('--checkpoint_frequency', type=int, default=100) - - parser.add_argument('--display', type=str, default=True) - - return vars(parser.parse_args()) # converts into a dictionary def main(): - args = parse_args() + args = ap.parse_args() + args["device"] = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + logging.info(f"Device: {args['device']}") logging.info(f"Training on {args['task']}") - env = gym.make(args["task"], render_mode=(None if args['render'] == "None" else args['render'])) - - logging.info(f"Device: {args['device']}") + env_factory = EnvironmentFactory() + + gym_environment = args['gym_environment'] + env = env_factory.create_environment(gym_environment=gym_environment, args=args) - args["observation_size"] = env.observation_space.shape[0] + args["observation_size"] = env.observation_space logging.info(f"Observation 
Size: {args['observation_size']}") - if type(env.action_space) == spaces.Box: - args["action_num"] = env.action_space.shape[0] - elif type(env.action_space) == spaces.Discrete: - args["action_num"] = env.action_space.n - else: - raise ValueError(f"Unhandled action space type: {type(env.action_space)}") + args['action_num'] = env.action_num logging.info(f"Action Num: {args['action_num']}") - logging.info(f"Seed: {args['seed']}") - set_seed(env, args["seed"]) - # Create the network we are using factory = NetworkFactory() logging.info(f"Algorithm: {args['algorithm']}") agent = factory.create_network(args["algorithm"], args) - logging.info(f"Algorithm: {args['algorithm']}") # TODO move to memory factory as we add new PER if args["memory"] == "MemoryBuffer": @@ -105,23 +62,27 @@ def main(): logging.info(f"Memory: {args['memory']}") - #create the record class - standardised results tracking - record = Record(network=agent, config={'args': args}) - # Train the policy or value based approach - if args["algorithm"] == "PPO": - ppe.ppo_train(env, agent, record, args) - env = gym.make(env.spec.id, render_mode="human") - ppe.evaluate_ppo_network(env, agent, args) - elif agent.type == "policy": - pbe.policy_based_train(env, agent, memory, record, args) - env = gym.make(env.spec.id, render_mode="human") - pbe.evaluate_policy_network(env, agent, args) - elif agent.type == "value": - vbe.value_based_train(env, agent, memory, record, args) - env = gym.make(env.spec.id, render_mode="human") - vbe.evaluate_value_network(env, agent, args) - else: - raise ValueError(f"Agent type is unkown: {agent.type}") + seed = args['seed'] + + training_iterations = args['number_training_iterations'] + for training_iteration in range(0, training_iterations): + logging.info(f"Training iteration {training_iteration+1}/{training_iterations} with Seed: {seed}") + set_seed(seed) + env.set_seed(seed) + + #create the record class - standardised results tracking + record = Record(network=agent, config={'args': args}) + + # Train the policy or value based approach + if args["algorithm"] == "PPO": + ppe.ppo_train(env, agent, record, args) + elif agent.type == "policy": + pbe.policy_based_train(env, agent, memory, record, args) + elif agent.type == "value": + vbe.value_based_train(env, agent, memory, record, args) + else: + raise ValueError(f"Agent type is unkown: {agent.type}") + seed += 10 record.save() diff --git a/example/policy_example.py b/example/policy_example.py index cdeab164..494bb2e1 100644 --- a/example/policy_example.py +++ b/example/policy_example.py @@ -2,18 +2,21 @@ from cares_reinforcement_learning.memory.augments import * from cares_reinforcement_learning.util import helpers as hlp, Record +import cv2 import time import gym import logging +import numpy as np def evaluate_policy_network(env, agent, args, record=None, total_steps=0): + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + number_eval_episodes = int(args["number_eval_episodes"]) - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] - - state, _ = env.reset() + state = env.reset() for eval_episode_counter in range(number_eval_episodes): episode_timesteps = 0 @@ -25,11 +28,15 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): while not done and not truncated: episode_timesteps += 1 action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, max_action_value, min_action_value) + action_env = 
hlp.denormalize(action, env.max_action_value, env.min_action_value) - state, reward, done, truncated, _ = env.step(action_env) + state, reward, done, truncated = env.step(action_env) episode_reward += reward + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + if done or truncated: if record is not None: record.log_eval( @@ -40,10 +47,12 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0): ) # Reset environment - state, _ = env.reset() + state = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + + record.stop_video() def policy_based_train(env, agent, memory, record, args): start_time = time.time() @@ -52,20 +61,19 @@ def policy_based_train(env, agent, memory, record, args): max_steps_exploration = args["max_steps_exploration"] number_steps_per_evaluation = args["number_steps_per_evaluation"] + logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}") + batch_size = args["batch_size"] seed = args["seed"] G = args["G"] - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] - episode_timesteps = 0 episode_reward = 0 episode_num = 0 evaluate = False - state, _ = env.reset(seed=seed) + state = env.reset() episode_start = time.time() for total_step_counter in range(int(max_steps_training)): @@ -73,13 +81,17 @@ def policy_based_train(env, agent, memory, record, args): if total_step_counter < max_steps_exploration: logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}") - action_env = env.action_space.sample() # action range the env uses [e.g. -2 , 2 for pendulum] - action = hlp.normalize(action_env, max_action_value, min_action_value) # algorithm range [-1, 1] + # action range the env uses [e.g. -2 , 2 for pendulum] + action_env = np.random.uniform(env.min_action_value, env.max_action_value, size=env.action_num) + # algorithm range [-1, 1] - note for DMCS this is redudenant but required for openai + action = hlp.normalize(action_env, env.max_action_value, env.min_action_value) else: - action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, max_action_value, min_action_value) # mapping to env range [e.g. -2 , 2 for pendulum] + # algorithm range [-1, 1] + action = agent.select_action_from_policy(state) + # mapping to env range [e.g. 
-2 , 2 for pendulum] - note for DMCS this is redudenant but required for openai + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - next_state, reward, done, truncated, info = env.step(action_env) + next_state, reward, done, truncated = env.step(action_env) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) state = next_state @@ -120,7 +132,7 @@ def policy_based_train(env, agent, memory, record, args): evaluate = False # Reset environment - state, _ = env.reset() + state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 diff --git a/example/ppo_example.py b/example/ppo_example.py index 603a5a97..25671d0d 100644 --- a/example/ppo_example.py +++ b/example/ppo_example.py @@ -10,12 +10,13 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): - number_eval_episodes = int(args["number_eval_episodes"]) + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] + number_eval_episodes = int(args["number_eval_episodes"]) - state, _ = env.reset() + state = env.reset() for eval_episode_counter in range(number_eval_episodes): episode_timesteps = 0 @@ -27,11 +28,15 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): while not done and not truncated: episode_timesteps += 1 action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - state, reward, done, truncated, _ = env.step(action_env) + state, reward, done, truncated = env.step(action_env) episode_reward += reward + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + if done or truncated: if record is not None: record.log_eval( @@ -42,11 +47,13 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): ) # Reset environment - state, _ = env.reset() + state = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + record.stop_video() + def ppo_train(env, agent, record, args): start_time = time.time() @@ -55,9 +62,6 @@ def ppo_train(env, agent, record, args): max_steps_per_batch = args["max_steps_per_batch"] number_steps_per_evaluation = args["number_steps_per_evaluation"] - min_action_value = env.action_space.low[0] - max_action_value = env.action_space.high[0] - episode_timesteps = 0 episode_num = 0 episode_reward = 0 @@ -66,16 +70,16 @@ def ppo_train(env, agent, record, args): evaluate = False - state, _ = env.reset(seed=seed) + state = env.reset() episode_start = time.time() for total_step_counter in range(int(max_steps_training)): episode_timesteps += 1 action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) + action_env = hlp.denormalize(action, env.max_action_value, env.min_action_value) - next_state, reward, done, truncated, _ = env.step(action_env) + next_state, reward, done, truncated = env.step(action_env) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done, log_prob=log_prob) state = next_state @@ -115,7 +119,7 @@ def ppo_train(env, agent, record, args): evaluate = False # Reset environment - state, _ = env.reset() + state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 diff --git a/example/value_example.py 
b/example/value_example.py index 9eeb7a92..ffe1ab64 100644 --- a/example/value_example.py +++ b/example/value_example.py @@ -1,18 +1,25 @@ from cares_reinforcement_learning.memory import * from cares_reinforcement_learning.util import helpers as hlp, Record +import numpy as np import time import gym import logging import random +from random import randrange + from timeit import default_timer as timer def evaluate_value_network(env, agent, args, record=None, total_steps=0): + if record is not None: + frame = env.grab_frame() + record.start_video(total_steps+1, frame) + number_eval_episodes = int(args["number_eval_episodes"]) - state, _ = env.reset() + state = env.reset() exploration_rate = args["exploration_min"] @@ -27,13 +34,17 @@ def evaluate_value_network(env, agent, args, record=None, total_steps=0): episode_timesteps += 1 if random.random() < exploration_rate: - action = env.action_space.sample() + action = randrange(env.action_num) else: action = agent.select_action_from_policy(state) - state, reward, done, truncated, _ = env.step(action) + state, reward, done, truncated = env.step(action) episode_reward += reward + if eval_episode_counter == 0 and record is not None: + frame = env.grab_frame() + record.log_video(frame) + if done or truncated: if record is not None: record.log_eval( @@ -44,11 +55,12 @@ def evaluate_value_network(env, agent, args, record=None, total_steps=0): ) # Reset environment - state, _ = env.reset() + state = env.reset() episode_reward = 0 episode_timesteps = 0 episode_num += 1 + record.stop_video() def value_based_train(env, agent, memory, record, args): start_time = time.time() @@ -68,7 +80,7 @@ def value_based_train(env, agent, memory, record, args): evaluate = False - state, _ = env.reset(seed=seed) + state = env.reset() exploration_rate = 1 @@ -80,11 +92,11 @@ def value_based_train(env, agent, memory, record, args): exploration_rate = max(exploration_min, exploration_rate) if random.random() < exploration_rate: - action = env.action_space.sample() + action = randrange(env.action_num) else: action = agent.select_action_from_policy(state) - next_state, reward, done, truncated, _ = env.step(action) + next_state, reward, done, truncated = env.step(action) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) state = next_state episode_reward += reward @@ -124,7 +136,7 @@ def value_based_train(env, agent, memory, record, args): evaluate = False # Reset environment - state, _ = env.reset() + state = env.reset() episode_timesteps = 0 episode_reward = 0 episode_num += 1 diff --git a/requirements.txt b/requirements.txt index a2a50f8b..82b27633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,8 +27,5 @@ torchvision==0.14.1 typing_extensions==4.4.0 urllib3==1.26.13 PyYAML==6.0 -# Optional Dependencies if you want to leverage the GPU -# nvidia-cublas-cu11==11.10.3.66 -# nvidia-cuda-nvrtc-cu11==11.7.99 -# nvidia-cuda-runtime-cu11==11.7.99 -# nvidia-cudnn-cu11==8.5.0.96 +dm_control==1.0.10 +opencv-python