
Dev/pydantic #93

Merged
13 commits merged on Oct 18, 2023
15 changes: 11 additions & 4 deletions README.md
@@ -23,12 +23,17 @@ This repository includes a script that allows you to run any OpenAI Gymnasium (h

`example_training_loops.py` takes in hyperparameters that allow you to customise the training run environment (OpenAI Gymnasium or DMCS) and the RL algorithm. Use `python3 example_training_loops.py -h` for help on which parameters are available for customisation.

An example is found below for running on the OpenAI and DMCS environments with TD3:
An example of running TD3 on the OpenAI and DMCS environments from the console is shown below:
```
python3 example_training_loops.py openai --task HalfCheetah-v4 TD3
python example_training_loops.py run --gym openai --task HalfCheetah-v4 TD3


python3 example_training_loops.py dmcs --domain ball_in_cup --task catch TD3
python3 example_training_loops.py run dmcs --domain ball_in_cup --task catch TD3
```

An example of running with pre-defined configuration files is shown below:
```
python example_training_loops.py config --env_config ~/cares_rl_configs/env_dmcs_config.json --training_config ~/cares_rl_configs/training_config.json --algorithm_config ~/cares_rl_configs/algorithm_config.json
```
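
The contents of these JSON files mirror the fields of the pydantic configuration models used in this PR (`EnvironmentConfig`, `TrainingConfig`). As an illustrative sketch only (the exact field names and defaults are defined in `cares_reinforcement_learning/util/configurations.py` and may differ), such files could be generated like this:

```python
# Illustrative sketch only: the keys below are inferred from how the configs
# are used elsewhere in this PR; the values are placeholders, not project defaults.
import json
from pathlib import Path

config_dir = Path.home() / "cares_rl_configs"
config_dir.mkdir(exist_ok=True)

env_dmcs_config = {
    "gym": "dmcs",               # "dmcs" or "openai"
    "domain": "ball_in_cup",     # DMCS domain
    "task": "catch",             # DMCS task
    "image_observation": False,  # True selects the frame-stacked image wrappers
}

training_config = {
    "G": 1,
    "batch_size": 256,
    "max_steps_exploration": 1000,
    "max_steps_training": 100000,
    "number_steps_per_evaluation": 10000,
    "number_eval_episodes": 10,
}

(config_dir / "env_dmcs_config.json").write_text(json.dumps(env_dmcs_config, indent=4))
(config_dir / "training_config.json").write_text(json.dumps(training_config, indent=4))
```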

### Data Outputs
@@ -37,7 +42,9 @@ All data from a training run is saved into '~/cares_rl_logs'. A folder will be c
```
ALGORITHM-TASK-YY_MM_DD:HH:MM:SS/
├─ SEED
| ├─ config.py
| ├─ env_config.py
| ├─ alg_config.py
| ├─ train_config.py
| ├─ data
| | ├─ train.csv
| | ├─ eval.csv
1 change: 0 additions & 1 deletion cares_reinforcement_learning/algorithm/policy/DDPG.py
@@ -5,7 +5,6 @@
import torch
import torch.nn.functional as F


class DDPG:

def __init__(self,
1 change: 0 additions & 1 deletion cares_reinforcement_learning/algorithm/policy/PPO.py
@@ -15,7 +15,6 @@
import torch.nn.functional as F
from torch.distributions import MultivariateNormal


class PPO:
def __init__(self,
actor_network,
1 change: 0 additions & 1 deletion cares_reinforcement_learning/algorithm/policy/SAC.py
@@ -13,7 +13,6 @@
import torch
import torch.nn.functional as F


class SAC:
def __init__(self,
actor_network,
3 changes: 1 addition & 2 deletions cares_reinforcement_learning/algorithm/policy/TD3.py
@@ -9,7 +9,6 @@
import torch
import torch.nn.functional as F


class TD3(object):
def __init__(self,
actor_network,
@@ -147,4 +146,4 @@ def load_models(self, filepath, filename):

self.actor_net.load_state_dict(torch.load(f'{path}/{filename}_actor.pht'))
self.critic_net.load_state_dict(torch.load(f'{path}/{filename}_critic.pht'))
logging.info("models have been loaded...")
logging.info("models have been loaded...")
1 change: 0 additions & 1 deletion cares_reinforcement_learning/algorithm/value/DQN.py
@@ -4,7 +4,6 @@
import numpy as np
import torch.nn.functional as F


class DQN:

def __init__(self,
1 change: 0 additions & 1 deletion cares_reinforcement_learning/algorithm/value/DoubleDQN.py
@@ -9,7 +9,6 @@
import numpy as np
import torch.nn.functional as F


class DoubleDQN:

def __init__(self,
25 changes: 11 additions & 14 deletions cares_reinforcement_learning/train_loops/policy_loop.py
@@ -1,20 +1,19 @@
from cares_reinforcement_learning.memory import MemoryBuffer
from cares_reinforcement_learning.memory.augments import *
from cares_reinforcement_learning.util import helpers as hlp, Record
from cares_reinforcement_learning.util.configurations import TrainingConfig
from cares_reinforcement_learning.util import helpers as hlp

import cv2
import time
import gym
import logging
import numpy as np

def evaluate_policy_network(env, agent, args, record=None, total_steps=0):
def evaluate_policy_network(env, agent, config: TrainingConfig, record=None, total_steps=0):

if record is not None:
frame = env.grab_frame()
record.start_video(total_steps+1, frame)

number_eval_episodes = int(args["number_eval_episodes"])
number_eval_episodes = int(config.number_eval_episodes)

state = env.reset()

@@ -54,18 +53,17 @@ def evaluate_policy_network(env, agent, args, record=None, total_steps=0):

record.stop_video()

def policy_based_train(env, agent, memory, record, args):
def policy_based_train(env, agent, memory, record, config: TrainingConfig):
start_time = time.time()

max_steps_training = args["max_steps_training"]
max_steps_exploration = args["max_steps_exploration"]
number_steps_per_evaluation = args["number_steps_per_evaluation"]
max_steps_training = config.max_steps_training
max_steps_exploration = config.max_steps_exploration
number_steps_per_evaluation = config.number_steps_per_evaluation

logging.info(f"Training {max_steps_training} Exploration {max_steps_exploration} Evaluation {number_steps_per_evaluation}")

batch_size = args["batch_size"]
seed = args["seed"]
G = args["G"]
batch_size = config.batch_size
G = config.G

episode_timesteps = 0
episode_reward = 0
@@ -126,8 +124,7 @@ def policy_based_train(env, agent, memory, record, args):

if evaluate:
logging.info("*************--Evaluation Loop--*************")
args["evaluation_seed"] = seed
evaluate_policy_network(env, agent, args, record=record, total_steps=total_step_counter)
evaluate_policy_network(env, agent, config, record=record, total_steps=total_step_counter)
logging.info("--------------------------------------------")
evaluate = False

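The substantive change in this loop is the switch from an untyped `args` dictionary to a typed `TrainingConfig` object, so each hyperparameter becomes a validated attribute rather than a string-keyed lookup. A minimal sketch of what such a pydantic model could look like, inferred purely from the attributes accessed above (the real definition lives in `cares_reinforcement_learning/util/configurations.py` and may differ):

```python
# Minimal sketch, not the repository's actual model: the fields and defaults
# below are assumptions inferred from the attributes used in policy_loop.py.
from pydantic import BaseModel

class TrainingConfig(BaseModel):
    G: int = 1
    batch_size: int = 256
    max_steps_exploration: int = 1_000
    max_steps_training: int = 100_000
    number_steps_per_evaluation: int = 10_000
    number_eval_episodes: int = 10

# Loading from JSON then becomes a one-liner, validated on construction:
# config = TrainingConfig(**json.loads(Path("training_config.json").read_text()))
```
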
20 changes: 9 additions & 11 deletions cares_reinforcement_learning/train_loops/ppo_loop.py
@@ -1,20 +1,20 @@
from cares_reinforcement_learning.memory import *
from cares_reinforcement_learning.util.configurations import TrainingConfig
from cares_reinforcement_learning.util import helpers as hlp
from cares_reinforcement_learning.util import Record
from cares_reinforcement_learning.memory import MemoryBuffer

import time
import gym
import logging

from timeit import default_timer as timer

def evaluate_ppo_network(env, agent, args, record=None, total_steps=0):
def evaluate_ppo_network(env, agent, config: TrainingConfig, record=None, total_steps=0):

if record is not None:
frame = env.grab_frame()
record.start_video(total_steps+1, frame)

number_eval_episodes = int(args["number_eval_episodes"])
number_eval_episodes = int(config.number_eval_episodes)

state = env.reset()

@@ -54,13 +54,12 @@ def evaluate_ppo_network(env, agent, args, record=None, total_steps=0):

record.stop_video()

def ppo_train(env, agent, record, args):
def ppo_train(env, agent, record, config: TrainingConfig):
start_time = time.time()

seed = args["seed"]
max_steps_training = args["max_steps_training"]
max_steps_per_batch = args["max_steps_per_batch"]
number_steps_per_evaluation = args["number_steps_per_evaluation"]
max_steps_training = config.max_steps_training
max_steps_per_batch = config.max_steps_per_batch
number_steps_per_evaluation = config.number_steps_per_evaluation

episode_timesteps = 0
episode_num = 0
@@ -113,8 +112,7 @@ def ppo_train(env, agent, record, args):

if evaluate:
logging.info("*************--Evaluation Loop--*************")
args["evaluation_seed"] = seed
evaluate_ppo_network(env, agent, args, record=record, total_steps=total_step_counter)
evaluate_ppo_network(env, agent, config, record=record, total_steps=total_step_counter)
logging.info("--------------------------------------------")
evaluate = False

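The same dict-to-config migration applies here. One practical benefit, sketched below under the assumption that `max_steps_per_batch` is an integer field on `TrainingConfig` (as its use above suggests): an invalid value now fails loudly at construction time with a `ValidationError`, instead of surfacing as a `KeyError` or type error deep inside the training loop.

```python
# Sketch of the validation behaviour the pydantic config provides; assumes
# max_steps_per_batch is an int field on TrainingConfig, as its use above implies.
from pydantic import ValidationError

from cares_reinforcement_learning.util.configurations import TrainingConfig

try:
    TrainingConfig(max_steps_per_batch="not-a-number")
except ValidationError as error:
    print(error)  # pydantic reports which fields are invalid or missing
```
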
28 changes: 13 additions & 15 deletions cares_reinforcement_learning/train_loops/value_loop.py
@@ -1,5 +1,5 @@
from cares_reinforcement_learning.memory import *
from cares_reinforcement_learning.util import helpers as hlp, Record
from cares_reinforcement_learning.util.configurations import TrainingConfig
from cares_reinforcement_learning.util import helpers as hlp

import numpy as np
import time
@@ -11,17 +11,17 @@

from timeit import default_timer as timer

def evaluate_value_network(env, agent, args, record=None, total_steps=0):
def evaluate_value_network(env, agent, config: TrainingConfig, record=None, total_steps=0):

if record is not None:
frame = env.grab_frame()
record.start_video(total_steps+1, frame)

number_eval_episodes = int(args["number_eval_episodes"])
number_eval_episodes = int(config.number_eval_episodes)

state = env.reset()

exploration_rate = args["exploration_min"]
exploration_rate = config.exploration_min

for eval_episode_counter in range(number_eval_episodes):
episode_timesteps = 0
@@ -62,17 +62,16 @@ def evaluate_value_network(env, agent, args, record=None, total_steps=0):

record.stop_video()

def value_based_train(env, agent, memory, record, args):
def value_based_train(env, agent, memory, record, config: TrainingConfig):
start_time = time.time()

max_steps_training = args["max_steps_training"]
exploration_min = args["exploration_min"]
exploration_decay = args["exploration_decay"]
number_steps_per_evaluation = args["number_steps_per_evaluation"]
max_steps_training = config.max_steps_training
exploration_min = config.exploration_min
exploration_decay = config.exploration_decay
number_steps_per_evaluation = config.number_steps_per_evaluation

batch_size = args["batch_size"]
seed = args["seed"]
G = args["G"]
batch_size = config.batch_size
G = config.G

episode_timesteps = 0
episode_reward = 0
@@ -130,8 +129,7 @@ def value_based_train(env, agent, memory, record, args):

if evaluate:
logging.info("*************--Evaluation Loop--*************")
args["evaluation_seed"] = seed
evaluate_value_network(env, agent, args, record=record, total_steps=total_step_counter)
evaluate_value_network(env, agent, config, record=record, total_steps=total_step_counter)
logging.info("--------------------------------------------")
evaluate = False

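This loop additionally reads `exploration_min` and `exploration_decay` from the config. The schedule itself is applied in the part of `value_based_train` not shown in this diff; a typical multiplicative epsilon-greedy decay, given only as an illustrative sketch, looks like this:

```python
# Illustrative epsilon-greedy decay only; the repository's actual update lives
# in the omitted body of value_based_train and may differ.
exploration_rate = 1.0    # start fully exploratory
exploration_min = 1e-3    # corresponds to config.exploration_min
exploration_decay = 0.95  # corresponds to config.exploration_decay

for episode in range(5):
    exploration_rate = max(exploration_min, exploration_rate * exploration_decay)
    print(f"episode {episode}: epsilon = {exploration_rate:.4f}")
```
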
45 changes: 23 additions & 22 deletions cares_reinforcement_learning/util/EnvironmentFactory.py
@@ -13,25 +13,26 @@
# from typing import override
from functools import cached_property

from cares_reinforcement_learning.util.configurations import EnvironmentConfig

class EnvironmentFactory:
def __init__(self) -> None:
pass

def create_environment(self, gym_environment, args):
logging.info(f"Training Environment: {gym_environment}")
if gym_environment == 'dmcs':
env = DMCSImage(args=args) if args['image_observation'] else DMCS(args=args)
elif gym_environment == "openai":
env = OpenAIGym(args=args)
def create_environment(self, config: EnvironmentConfig):
logging.info(f"Training Environment: {config.gym}")
if config.gym == 'dmcs':
env = DMCSImage(config) if config.image_observation else DMCS(config)
elif config.gym == "openai":
env = OpenAIGymImage(config) if config.image_observation else OpenAIGym(config)
else:
raise ValueError(f"Unknown environment: {gym_environment}")
raise ValueError(f"Unknown environment: {config.gym}")
return env

class OpenAIGym:
def __init__(self, args) -> None:
logging.info(f"Training task {args['task']}")
self.env = gym.make(args["task"], render_mode="rgb_array")
self.set_seed(args['seed'])
def __init__(self, config: EnvironmentConfig) -> None:
logging.info(f"Training task {config.task}")
self.env = gym.make(config.task, render_mode="rgb_array")

@cached_property
def max_action_value(self):
@@ -72,15 +73,15 @@ def grab_frame(self, height=240, width=300):
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) # Convert to BGR for use with OpenCV
return frame

class OpenAIGymImage:
def __init__(self, args, k=3):
class OpenAIGymImage(OpenAIGym):
def __init__(self, config: EnvironmentConfig, k=3):
self.k = k # number of frames to be stacked
self.frames_stacked = deque([], maxlen=k)

self.frame_width = 84
self.frame_height = 84

super().__init__(args=args)
super().__init__(config)

# @override
@property
@@ -108,13 +109,13 @@ def step(self, action):
return stacked_frames, reward, done, False # for consistency with open ai gym just add false for truncated

class DMCS:
def __init__(self, args) -> None:
logging.info(f"Training on Domain {args['domain']}")
logging.info(f"Training with Task {args['task']}")
def __init__(self, config: EnvironmentConfig) -> None:
logging.info(f"Training on Domain {config.domain}")
logging.info(f"Training with Task {config.task}")

self.domain = args['domain']
self.task = args['task']
self.env = suite.load(self.domain, self.task, task_kwargs={'random': args['seed']})
self.domain = config.domain
self.task = config.task
self.env = suite.load(self.domain, self.task)

@cached_property
def min_action_value(self):
@@ -154,14 +155,14 @@ def grab_frame(self, camera_id=0, height=240, width=300):

# TODO parametrise the observation size 3x84x84
class DMCSImage(DMCS):
def __init__(self, args, k=3):
def __init__(self, config: EnvironmentConfig, k=3):
self.k = k # number of frames to be stacked
self.frames_stacked = deque([], maxlen=k)

self.frame_width = 84
self.frame_height = 84

super().__init__(args=args)
super().__init__(config)

# @override
@property
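
Taken together, the factory now builds environments from an `EnvironmentConfig` instead of a raw dictionary, with `image_observation` selecting the frame-stacking wrappers. A minimal usage sketch (the `EnvironmentConfig` field names are assumptions inferred from the attributes accessed in this diff):

```python
# Usage sketch; EnvironmentConfig field names are inferred from this diff and
# may not match the real model exactly.
from cares_reinforcement_learning.util.EnvironmentFactory import EnvironmentFactory
from cares_reinforcement_learning.util.configurations import EnvironmentConfig

env_config = EnvironmentConfig(
    gym="dmcs",
    domain="ball_in_cup",
    task="catch",
    image_observation=False,  # True would select the frame-stacked DMCSImage wrapper
)

env = EnvironmentFactory().create_environment(env_config)
state = env.reset()
```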