diff --git a/cares_reinforcement_learning/util/NetworkFactory.py b/cares_reinforcement_learning/util/NetworkFactory.py index c1309339..91ea3a11 100644 --- a/cares_reinforcement_learning/util/NetworkFactory.py +++ b/cares_reinforcement_learning/util/NetworkFactory.py @@ -1,6 +1,5 @@ import torch - def create_DQN(args): from cares_reinforcement_learning.algorithm.value import DQN from cares_reinforcement_learning.networks.DQN import Network diff --git a/cares_reinforcement_learning/util/Plot.py b/cares_reinforcement_learning/util/Plot.py deleted file mode 100644 index a0f453f3..00000000 --- a/cares_reinforcement_learning/util/Plot.py +++ /dev/null @@ -1,255 +0,0 @@ -import os - -import seaborn as sns -import matplotlib.pyplot as plt -import pandas as pd -import uuid - - - -# TODO make this more easy and simple, plot and store checkpoints - -def plot_average(x, y, x_label='x_value',y_label='y_value', title='Title', window_size=10, file_path='figures/figure.png', display=False): - - figure = plt.figure() - figure.set_figwidth(8) - sns.set_theme(style="darkgrid") - - data_dict = {x_label: x, y_label: y} - df = pd.DataFrame(data=data_dict) - - df["avg"] = df[y_label].rolling(window_size, min_periods=1).mean() - df["std_dev"] = df[y_label].rolling(window_size, min_periods=1).std() - - ax = sns.lineplot(data=df, x=x_label, y="avg", label='Average') - ax.set(xlabel=x_label, ylabel=y_label) - plt.fill_between(df[x_label], df["avg"] - df["std_dev"], df["avg"] + - df["std_dev"], alpha=0.4) - - sns.move_legend(ax, "lower right") - - plt.title(title) - - plt.savefig(file_path) - plt.close(figure) - -class Plot: - def __init__(self, title='Training', x_label='Episode', y_label='Reward', x_data=None, y_data=None, plot_freq=1, checkpoint_freq=1): - if x_data is None: - x_data = [] - if y_data is None: - y_data = [] - - plt.ion() - - self.title = title - - self.x_label = x_label - self.y_label = y_label - - self.figure = plt.figure() - self.figure.set_figwidth(8) - - self.x_data = x_data - self.y_data = y_data - - self.plot_num = 0 - self.plot_freq = plot_freq - self.checkpoint_freq = checkpoint_freq - - sns.set_theme(style="darkgrid") - - def post(self, reward): - self.plot_num += 1 - - self.x_data.append(len(self.x_data)) - self.y_data.append(reward) - - if self.plot_num % self.plot_freq == 0: - self.__create_plot() - plt.pause(10e-10) - - if self.plot_num % self.checkpoint_freq == 0: - pass - #self.save_csv(f'{self.title}.csv') - - def plot(self): - plt.ioff() - self.__create_plot() - plt.show() - - def __create_plot(self): - data_dict = {self.x_label: self.x_data, self.y_label: self.y_data} - df = pd.DataFrame(data=data_dict) - - sns.lineplot(data=df, x=self.x_label, y=self.y_label) - plt.title(self.title) - - def save_plot(self, file_name=str(uuid.uuid4().hex)): - self.__create_plot() - - dir_exists = os.path.exists("figures") - - if not dir_exists: - os.makedirs("figures") - - plt.savefig(f"figures/{file_name}") - - - - def plot_average(self, window_size=10, file_name=str(uuid.uuid4().hex)): - - plt.ioff() - - data_dict = {"Episode": self.x_data, "Reward": self.y_data} - df = pd.DataFrame(data=data_dict) - - df["Average Reward"] = df["Reward"].rolling(window_size).mean() - df["Standard Deviation"] = df["Reward"].rolling(window_size).std() - - ax = sns.lineplot(data=df, x="Episode", y="Average Reward", label="Average Reward") - ax.set(xlabel="Episode", ylabel="Reward") - plt.fill_between(df["Reward"], df["Average Reward"] - df["Standard Deviation"], df["Average Reward"] + - df["Standard Deviation"], 
alpha=0.4) - - sns.move_legend(ax, "lower right") - - plt.title(self.title) - - dir_exists = os.path.exists("figures") - - if not dir_exists: - os.makedirs("figures") - - plt.savefig(f"figures/{file_name}") - - plt.show() - -# -# -# def read_file(file_path: str): -# """ -# Reads a file that contains rewards separated by new line -# -# Parameters: -# file_path: a string path to the data file -# """ -# file = open(file_path, "r") -# strings = file.readlines() -# floats = [float(x) for x in strings] -# return floats -# -# -# def plot_learning(title: str, reward, file_name: str = "figure.png"): -# """ -# Plot the learning of the agent. Saves the figure to figures directory -# -# Parameters: -# title: title of the plot -# reward: the array of rewards to be plot -# file_name: the name of the figure when saved to disc -# """ -# y = reward -# x = range(1, len(reward) + 1) -# -# print(reward) -# print(x) -# -# data_dict = {"Episode": x, "Reward": y} -# df = pd.DataFrame(data=data_dict) -# -# sns.set_theme(style="darkgrid") -# plt.figure().set_figwidth(8) -# -# sns.lineplot(data=df, x="Episode", y="Reward") -# plt.title(title) -# -# dir_exists = os.path.exists("figures") -# -# if not dir_exists: -# os.makedirs("figures") -# -# plt.savefig(f"figures/{file_name}") -# plt.show() -# -# -# def plot_learning_vs_average(title: str, reward, file_name: str = "figure.png", window_size: int = 10): -# """ -# Plot the rolling average and the actual learning. Saves the figure to figures directory -# -# Parameters: -# title: title of the plot -# reward: the array of rewards to be plot -# file_name: the name of the figure when saved to disc -# window_size: the size of the rolling average window -# """ -# y = reward -# x = range(1, len(reward) + 1) -# -# data_dict = {"Episode": x, "Reward": y} -# df = pd.DataFrame(data=data_dict) -# -# df["Average Reward"] = df["Reward"].rolling(window_size).mean() -# -# sns.set_theme(style="darkgrid") -# plt.figure().set_figwidth(8) -# -# sns.lineplot(data=df, x="Episode", y="Reward", alpha=0.4) -# sns.lineplot(data=df, x="Episode", y="Average Reward") -# -# plt.fill_between(df["Episode"], df["Reward"], df["Average Reward"], alpha=0.4) -# plt.title(title) -# -# dir_exists = os.path.exists("figures") -# -# if not dir_exists: -# os.makedirs("figures") -# -# plt.savefig(f"figures/{file_name}") -# -# plt.show() -# -# -# def plot_average_with_std(reward, -# title: str = "Cool Graph", -# file_name: str = "figure.png", -# window_size: int = 10): -# """ -# Plot the rolling average and standard deviation. 
Saves the figure to figures directory -# -# Parameters: -# title: title of the plot -# reward: the array of rewards to be plot -# file_name: the name of the figure when saved to disc -# window_size: the size of the rolling average window -# """ -# y = reward -# x = range(1, len(reward) + 1) -# -# data_dict = {"Episode": x, "Reward": y} -# df = pd.DataFrame(data=data_dict) -# -# df["Average Reward"] = df["Reward"].rolling(window_size).mean() -# df["Standard Deviation"] = df["Reward"].rolling(window_size).std() -# -# sns.set_theme(style="darkgrid") -# plt.figure().set_figwidth(8) -# -# ax = sns.lineplot(data=df, x="Episode", y="Average Reward", label="Average Reward") -# ax.set(xlabel="Episode", ylabel="Reward") -# plt.fill_between(df["Episode"], df["Average Reward"] - df["Standard Deviation"], df["Average Reward"] + -# df["Standard Deviation"], alpha=0.4) -# -# sns.move_legend(ax, "lower right") -# -# plt.title(title) -# -# dir_exists = os.path.exists("figures") -# -# if not dir_exists: -# os.makedirs("figures") -# -# plt.savefig(f"figures/{file_name}") -# -# plt.show() -# diff --git a/cares_reinforcement_learning/util/Record.py b/cares_reinforcement_learning/util/Record.py index 88766d49..4b06447b 100644 --- a/cares_reinforcement_learning/util/Record.py +++ b/cares_reinforcement_learning/util/Record.py @@ -1,103 +1,100 @@ -import pandas as pd -from datetime import datetime import os import logging -import torch + +import pandas as pd + import yaml -from cares_reinforcement_learning.util.Plot import plot_average -import math -# Python has no max int -MAX_INT = 9999999 +from pathlib import Path +from datetime import datetime + +import cares_reinforcement_learning.util.plotter as plt class Record: - def __init__(self, glob_log_dir=None, log_dir=None, networks={}, checkpoint_freq=None, config=None, keep_checkpoints=False) -> None: - - self.glob_log_dir = glob_log_dir or 'rl_logs' - self.log_dir = log_dir or datetime.now().strftime("%y_%m_%d_%H:%M:%S") - self.dir = f'{self.glob_log_dir}/{self.log_dir}' + def __init__(self, glob_log_dir=None, log_dir=None, network=None, config=None) -> None: + self.task = config["args"]["task"] + self.algoritm = config["args"]["algorithm"] + self.plot_frequency = config["args"]["plot_frequency"] + self.checkpoint_frequency = config["args"]["checkpoint_frequency"] - self.data = pd.DataFrame() + self.glob_log_dir = glob_log_dir or f'{Path.home()}/cares_rl_logs' + self.log_dir = log_dir or f"{self.algoritm}-{self.task}-{datetime.now().strftime('%y_%m_%d_%H:%M:%S')}" + self.directory = f'{self.glob_log_dir}/{self.log_dir}' - if checkpoint_freq < 10: - raise Exception('Checkpoint Frequency should be at least 10') + self.train_data = pd.DataFrame() + self.eval_data = pd.DataFrame() + self.info_data = pd.DataFrame() - self.checkpoint_freq = checkpoint_freq - - self.networks = networks + self.network = network self.log_count = 0 - - self.initial_log_keys = set() + self.__initialise_directories() - - self.keep_checkpoints = keep_checkpoints - + if config: - with open(f'{self.dir}/config.yml', 'w') as outfile: + with open(f'{self.directory}/config.yml', 'w') as outfile: yaml.dump(config, outfile, default_flow_style=False) - def log(self, out=False, **logs): + def log_info(self, info, display=False): + self.info_data = pd.concat([self.info_data, pd.DataFrame([info])], ignore_index=True) + self.save_data(self.info_data, "info", info, display=display) + + def log_train(self, display=False, **logs): self.log_count += 1 - - if not self.initial_log_keys: - logging.info('Setting 
Log Values') - self.initial_log_keys = self.initial_log_keys.union(logs.keys()) - - if not logs.keys() <= self.initial_log_keys: - logging.warning('Introducing new columns') - self.initial_log_keys = self.initial_log_keys.union(logs.keys()) - - if self.checkpoint_freq and self.log_count % self.checkpoint_freq == 0: - self.save(f'_checkpoint') - - self.data = pd.concat([self.data, pd.DataFrame([logs])], ignore_index=True) - + + self.train_data = pd.concat([self.train_data, pd.DataFrame([logs])], ignore_index=True) + self.save_data(self.train_data, "train", logs, display=display) + + if self.log_count % self.plot_frequency == 0: + plt.plot_train(self.train_data, f"Training-{self.algoritm}-{self.task}", f"{self.algoritm}", self.directory, "train", 20) + + if self.network is not None and self.log_count % self.checkpoint_frequency == 0: + self.network.save_models(f"{self.algoritm}-checkpoint-{self.log_count}", self.directory) + + def log_eval(self, display=False, **logs): + self.eval_data = pd.concat([self.eval_data, pd.DataFrame([logs])], ignore_index=True) + self.save_data(self.eval_data, "eval", logs, display=display) + + plt.plot_eval(self.eval_data, f"Evaluation-{self.algoritm}-{self.task}", f"{self.algoritm}", self.directory, "eval") + + def save_data(self, data_frame, filename, logs, display=True): + if data_frame.empty: + logging.warning('Trying to save an Empty Dataframe') + + path = f'{self.directory}/data/{filename}.csv' + data_frame.to_csv(path, index=False) + string = [f'{key}: {str(val)[0:10]:6s}' for key, val in logs.items()] string = ' | '.join(string) string = '| ' + string + ' |' - if out: + if display: print(string) - - def save(self, sfx='_final'): - if self.data.empty: - logging.warning('Trying to save an Empty Dataframe') - - path = f'{self.dir}/data/data{sfx}.csv' - self.data.to_csv(path, mode='a', header=not os.path.exists(path), index=False) - self.data.drop(self.data.index, inplace=True) - - # data = pd.read_csv(path) - - # for name, data in self.data.items(): - # plot_average( - # x=range(len(data.dropna())), - # y=data.dropna(), - # x_label='x', - # y_label=name, - # title=f'Average {name}', - # window_size=math.floor(len(data)/10), - # file_path=f'{self.dir}/figures/{name}_avg{sfx}.png' - # ) - - if self.networks: - for name, network in self.networks.items(): - torch.save(network.state_dict(), f'{self.dir}/models/{name}{sfx}{f"-{self.log_count}" if self.keep_checkpoints else ""}.pht') - + + def save(self): + logging.info(f"Saving final outputs") + self.save_data(self.train_data, "train", {}, display=False) + self.save_data(self.eval_data, "eval", {}, display=False) + + plt.plot_eval(self.eval_data, f"Evaluation-{self.algoritm}-{self.task}", f"{self.algoritm}", self.directory, "eval") + plt.plot_train(self.train_data, f"Training-{self.algoritm}-{self.task}", f"{self.algoritm}", self.directory, "train", 20) + + if self.network is not None: + self.network.save_models(self.algoritm, self.directory) + def __initialise_directories(self): if not os.path.exists(self.glob_log_dir): os.mkdir(self.glob_log_dir) - if not os.path.exists(self.dir): - os.mkdir(self.dir) + if not os.path.exists(self.directory): + os.mkdir(self.directory) - if not os.path.exists(f'{self.dir}/data'): - os.mkdir(f'{self.dir}/data') + if not os.path.exists(f'{self.directory}/data'): + os.mkdir(f'{self.directory}/data') - if not os.path.exists(f'{self.dir}/models'): - os.mkdir(f'{self.dir}/models') + if not os.path.exists(f'{self.directory}/models'): + os.mkdir(f'{self.directory}/models') - if not 
os.path.exists(f'{self.dir}/figures'): - os.mkdir(f'{self.dir}/figures') + if not os.path.exists(f'{self.directory}/figures'): + os.mkdir(f'{self.directory}/figures') diff --git a/cares_reinforcement_learning/util/plotter.py b/cares_reinforcement_learning/util/plotter.py new file mode 100644 index 00000000..567717fb --- /dev/null +++ b/cares_reinforcement_learning/util/plotter.py @@ -0,0 +1,129 @@ +import os + +import argparse + +import yaml +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +import pandas as pd +import uuid + +# TODO make the plots look how people want them too. This is just a basic example +def plot_data(plot_frame, title, label, x_label, y_label, directory, filename, display=True, close_figure=True): + window_size = plot_frame["window_size"] + + # TODO make font size a parameter + plt.xlabel(x_label, fontsize=10) + plt.ylabel(y_label, fontsize=10) + plt.title(title, fontsize=10) + + ax = sns.lineplot(data=plot_frame, x=plot_frame["steps"], y="avg", label=label) + + Z = 1.960 # 95% confidence interval + confidence_interval = Z * plot_frame["std_dev"] / np.sqrt(window_size) + + plt.fill_between(plot_frame["steps"], plot_frame["avg"] - confidence_interval, plot_frame["avg"] + confidence_interval, alpha=0.4) + + plt.savefig(f"{directory}/figures/{filename}.png") + + if display: + plt.show() + + if close_figure: + plt.close() + +def plot_comparisons(plot_frames, title, labels, x_label, y_label, directory, filename, display=True): + for plot_frame, label in zip(plot_frames, labels): + plot_data(plot_frame, title, label, x_label, y_label, directory, filename, display=False, close_figure=False) + + if display: + plt.show() + + plt.close() + +def prepare_eval_plot_frame(eval_data): + x_data = "total_steps" + y_data = "episode_reward" + + window_size = eval_data['episode'].max() + + plot_frame = pd.DataFrame() + + frame_average = eval_data.groupby([x_data], as_index=False).mean() + frame_std = eval_data.groupby([x_data], as_index=False).std() + + plot_frame["steps"] = frame_average[x_data] + plot_frame["avg"] = frame_average[y_data] + plot_frame["std_dev"] = frame_std[y_data] + plot_frame["window_size"] = window_size + + return plot_frame + +def plot_eval(eval_data, title, label, directory, filename, display=False): + eval_plot_frame = prepare_eval_plot_frame(eval_data) + plot_data(eval_plot_frame, title, label, "Steps", "Average Reward", directory, filename, display) + +def prepare_train_plot_frame(train_data, window_size): + x_data = "total_steps" + y_data = "episode_reward" + + plot_frame = pd.DataFrame() + plot_frame["steps"] = train_data[x_data] + plot_frame["avg"] = train_data[y_data].rolling(window_size, step=1, min_periods=1).mean() + plot_frame["std_dev"] = train_data[y_data].rolling(window_size, step=1, min_periods=1).std() + plot_frame["window_size"] = window_size + + return plot_frame + +def plot_train(train_data, title, label, directory, filename, window_size, display=False): + train_plot_frame = prepare_train_plot_frame(train_data, window_size) + plot_data(train_plot_frame, title, label, "Steps", "Average Reward", directory, filename, display) + +def parse_args(): + parser = argparse.ArgumentParser() # Add an argument + + parser.add_argument('-s','--save_directory', type=str, required=True) + parser.add_argument('-d','--data_path', type=str, nargs='+', help='List of Directories', required=True) + parser.add_argument('-w','--window_size', type=int, required=True) + + return vars(parser.parse_args()) # converts into a dictionary + +class 
SafeLoaderIgnoreUnknown(yaml.SafeLoader): + def ignore_unknown(self, node): + return None + +def main(): + args = parse_args() + + directory = args["save_directory"] + window_size = args["window_size"] + + train_plot_frames = [] + eval_plot_frames = [] + labels = [] + title = "Undefined Task" + + SafeLoaderIgnoreUnknown.add_constructor(None, SafeLoaderIgnoreUnknown.ignore_unknown) + + for data_directory in args["data_path"]: + with open(f"{data_directory}/config.yml", 'r') as file: + config = yaml.load(file, Loader=SafeLoaderIgnoreUnknown) + + labels.append(config["args"]["algorithm"]) + title = config["args"]["task"] + + train_data = pd.read_csv(f"{data_directory}/data/train.csv") + eval_data = pd.read_csv(f"{data_directory}/data/eval.csv") + + train_plot_frame = prepare_train_plot_frame(train_data, window_size=window_size) + eval_plot_frame = prepare_eval_plot_frame(eval_data) + + train_plot_frames.append(train_plot_frame) + eval_plot_frames.append(eval_plot_frame) + + plot_comparisons(train_plot_frames, f"{title}", labels, "Steps", "Average Reward", directory, f"{title}-compare-train", True) + plot_comparisons(eval_plot_frames, f"{title}", labels, "Steps", "Average Reward", directory, f"{title}-compare-eval", True) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/example/example_training_loops.py b/example/example_training_loops.py index ea06ea72..506df6f4 100644 --- a/example/example_training_loops.py +++ b/example/example_training_loops.py @@ -1,9 +1,3 @@ -""" -Description: - This is a basic example of the training loop for Off Policy Algorithms, - We may move this later for each repo/env or keep this in this repo -""" - import time import argparse @@ -26,14 +20,12 @@ logging.basicConfig(level=logging.INFO) - def set_seed(env, seed): torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) env.action_space.seed(seed) - def parse_args(): parser = argparse.ArgumentParser() # Add an argument @@ -49,7 +41,9 @@ def parse_args(): parser.add_argument('--max_steps_exploration', type=int, default=10000) parser.add_argument('--max_steps_training', type=int, default=50000) - parser.add_argument('--max_steps_evaluation', type=int, default=5000) + + parser.add_argument('--number_steps_per_evaluation', type=int, default=1000) + parser.add_argument('--number_eval_episodes', type=int, default=10) parser.add_argument('--seed', type=int, default=571) parser.add_argument('--evaluation_seed', type=int, default=152) @@ -63,6 +57,9 @@ def parse_args(): parser.add_argument('--max_steps_per_batch', type=float, default=5000) + parser.add_argument('--plot_frequency', type=int, default=100) + parser.add_argument('--checkpoint_frequency', type=int, default=100) + parser.add_argument('--display', type=str, default=True) return vars(parser.parse_args()) # converts into a dictionary @@ -108,23 +105,24 @@ def main(): logging.info(f"Memory: {args['memory']}") + #create the record class - standardised results tracking + record = Record(network=agent, config={'args': args}) # Train the policy or value based approach if args["algorithm"] == "PPO": - #create the record class - record = Record(networks={'actor':agent.actor_net, 'critic': agent.critic_net}, checkpoint_freq = 200,config={'args': args}, keep_checkpoints=True) ppe.ppo_train(env, agent, record, args) - ppe.evaluate_ppo_network(env, agent, record, args) + env = gym.make(env.spec.id, render_mode="human") + ppe.evaluate_ppo_network(env, agent, args) elif agent.type == "policy": - record = 
Record(networks={'actor':agent.actor_net, 'critic': agent.critic_net}, checkpoint_freq = 200, config={'args': args}, keep_checkpoints=True) pbe.policy_based_train(env, agent, memory, record, args) - pbe.evaluate_policy_network(env, agent, record, args) + env = gym.make(env.spec.id, render_mode="human") + pbe.evaluate_policy_network(env, agent, args) elif agent.type == "value": - record = Record(networks={'network':agent.network}, checkpoint_freq = 200, config={'args': args}, keep_checkpoints=True) vbe.value_based_train(env, agent, memory, record, args) - vbe.evaluate_value_network(env, agent, record, args) + env = gym.make(env.spec.id, render_mode="human") + vbe.evaluate_value_network(env, agent, args) else: raise ValueError(f"Agent type is unkown: {agent.type}") - + record.save() if __name__ == '__main__': diff --git a/example/policy_example.py b/example/policy_example.py index 789a8b76..cdeab164 100644 --- a/example/policy_example.py +++ b/example/policy_example.py @@ -6,49 +6,52 @@ import gym import logging -def evaluate_policy_network(env, agent, record, args): - evaluation_seed = args["evaluation_seed"] - max_steps_evaluation = args["max_steps_evaluation"] - if max_steps_evaluation == 0: - return +def evaluate_policy_network(env, agent, args, record=None, total_steps=0): + number_eval_episodes = int(args["number_eval_episodes"]) + min_action_value = env.action_space.low[0] max_action_value = env.action_space.high[0] - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - env = gym.make(env.spec.id, render_mode="human") - state, _ = env.reset(seed=evaluation_seed) - - for total_step_counter in range(int(max_steps_evaluation)): - episode_timesteps += 1 - action = agent.select_action_from_policy(state, evaluation=True) - action_env = hlp.denormalize(action, max_action_value, min_action_value) - - state, reward, done, truncated, _ = env.step(action_env) - episode_reward += reward - - if done or truncated: - record.log( - Eval_episode= episode_num + 1, - Eval_timesteps=episode_timesteps, - Eval_reward= episode_reward, - out=True - ) - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 - + state, _ = env.reset() + + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action = agent.select_action_from_policy(state, evaluation=True) + action_env = hlp.denormalize(action, max_action_value, min_action_value) + + state, reward, done, truncated, _ = env.step(action_env) + episode_reward += reward + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps+1, + episode=eval_episode_counter+1, + episode_reward=episode_reward, + display=True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 def policy_based_train(env, agent, memory, record, args): start_time = time.time() max_steps_training = args["max_steps_training"] max_steps_exploration = args["max_steps_exploration"] + number_steps_per_evaluation = args["number_steps_per_evaluation"] + batch_size = args["batch_size"] seed = args["seed"] G = args["G"] @@ -60,20 +63,21 @@ def policy_based_train(env, agent, memory, record, args): episode_reward = 0 episode_num = 0 + evaluate = False + state, _ = env.reset(seed=seed) - env.render() + episode_start = time.time() for total_step_counter in 
range(int(max_steps_training)): episode_timesteps += 1 if total_step_counter < max_steps_exploration: - logging.info(f"Running Exploration Steps {total_step_counter}/{max_steps_exploration}") + logging.info(f"Running Exploration Steps {total_step_counter+1}/{max_steps_exploration}") action_env = env.action_space.sample() # action range the env uses [e.g. -2 , 2 for pendulum] action = hlp.normalize(action_env, max_action_value, min_action_value) # algorithm range [-1, 1] else: action = agent.select_action_from_policy(state) # algorithm range [-1, 1] - action_env = hlp.denormalize(action, max_action_value, - min_action_value) # mapping to env range [e.g. -2 , 2 for pendulum] + action_env = hlp.denormalize(action, max_action_value, min_action_value) # mapping to env range [e.g. -2 , 2 for pendulum] next_state, reward, done, truncated, info = env.step(action_env) memory.add(state=state, action=action, reward=reward, next_state=next_state, done=done) @@ -82,8 +86,6 @@ def policy_based_train(env, agent, memory, record, args): episode_reward += reward if total_step_counter >= max_steps_exploration: - actor_loss = 0 - critic_loss = 0 for i in range(G): experience = memory.sample(batch_size) info = agent.train_policy(( @@ -94,29 +96,35 @@ def policy_based_train(env, agent, memory, record, args): experience['done'] )) memory.update_priorities(experience['indices'], info) - critic_loss += info['critic_loss_total'].item() - - if 'actor_loss' in info: - actor_loss += info['actor_loss'].item() - - # record average losses - record.log( - Train_steps = total_step_counter + 1, - Train_episode= episode_num + 1, - Train_timesteps=episode_timesteps, - Train_reward= episode_reward, - Actor_loss = actor_loss/(G/agent.policy_update_freq), - Critic_loss = critic_loss/G, - out=done or truncated - ) + # record.log_info(info, display=False) + + if (total_step_counter+1) % number_steps_per_evaluation == 0: + evaluate = True if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps = total_step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + episode_time = episode_time, + display = True + ) + + if evaluate: + logging.info("*************--Evaluation Loop--*************") + args["evaluation_seed"] = seed + evaluate_policy_network(env, agent, args, record=record, total_steps=total_step_counter) + logging.info("--------------------------------------------") + evaluate = False # Reset environment state, _ = env.reset() - episode_reward = 0 episode_timesteps = 0 + episode_reward = 0 episode_num += 1 + episode_start = time.time() end_time = time.time() elapsed_time = end_time - start_time diff --git a/example/ppo_example.py b/example/ppo_example.py index 9e35f1f2..603a5a97 100644 --- a/example/ppo_example.py +++ b/example/ppo_example.py @@ -5,46 +5,47 @@ import time import gym import logging -import random +from timeit import default_timer as timer -def evaluate_ppo_network(env, agent, record, args): - evaluation_seed = args["evaluation_seed"] - max_steps_evaluation = args["max_steps_evaluation"] - if max_steps_evaluation == 0: - return +def evaluate_ppo_network(env, agent, args, record=None, total_steps=0): + + number_eval_episodes = int(args["number_eval_episodes"]) min_action_value = env.action_space.low[0] max_action_value = env.action_space.high[0] - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - env = gym.make(env.spec.id, render_mode="human") - state, _ = env.reset(seed=evaluation_seed) - - 
for total_step_counter in range(int(max_steps_evaluation)): - episode_timesteps += 1 - action, log_prob = agent.select_action_from_policy(state) - action_env = hlp.denormalize(action, max_action_value, min_action_value) - - state, reward, done, truncated, _ = env.step(action_env) - episode_reward += reward - - if done or truncated: - record.log( - Eval_episode= episode_num + 1, - Eval_timesteps=episode_timesteps, - Eval_reward= episode_reward, - out=True - ) - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 + state, _ = env.reset() + + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + action, log_prob = agent.select_action_from_policy(state) + action_env = hlp.denormalize(action, max_action_value, min_action_value) + + state, reward, done, truncated, _ = env.step(action_env) + episode_reward += reward + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps+1, + episode=eval_episode_counter+1, + episode_reward=episode_reward, + display=True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 def ppo_train(env, agent, record, args): start_time = time.time() @@ -52,6 +53,7 @@ def ppo_train(env, agent, record, args): seed = args["seed"] max_steps_training = args["max_steps_training"] max_steps_per_batch = args["max_steps_per_batch"] + number_steps_per_evaluation = args["number_steps_per_evaluation"] min_action_value = env.action_space.low[0] max_action_value = env.action_space.high[0] @@ -59,12 +61,14 @@ def ppo_train(env, agent, record, args): episode_timesteps = 0 episode_num = 0 episode_reward = 0 - time_step = 1 memory = MemoryBuffer() + evaluate = False + state, _ = env.reset(seed=seed) + episode_start = time.time() for total_step_counter in range(int(max_steps_training)): episode_timesteps += 1 @@ -77,7 +81,7 @@ def ppo_train(env, agent, record, args): state = next_state episode_reward += reward - if time_step % max_steps_per_batch == 0: + if (total_step_counter+1) % max_steps_per_batch == 0: experience = memory.flush() info = agent.train_policy(( experience['state'], @@ -87,26 +91,35 @@ def ppo_train(env, agent, record, args): experience['done'], experience['log_prob'] )) + # record.log_info(info, display=False) - record.log( - Train_steps = total_step_counter + 1, - Train_episode= episode_num + 1, - Train_timesteps=episode_timesteps, - Train_reward= episode_reward, - Actor_loss = info['actor_loss'].item(), - Critic_loss = info['critic_loss'].item(), - out=done or truncated - ) - - time_step += 1 + if (total_step_counter+1) % number_steps_per_evaluation == 0: + evaluate = True if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps = total_step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + episode_time = episode_time, + display = True + ) + + if evaluate: + logging.info("*************--Evaluation Loop--*************") + args["evaluation_seed"] = seed + evaluate_ppo_network(env, agent, args, record=record, total_steps=total_step_counter) + logging.info("--------------------------------------------") + evaluate = False # Reset environment state, _ = env.reset() - episode_reward = 0 episode_timesteps = 0 + episode_reward = 0 episode_num += 1 + 
episode_start = time.time() end_time = time.time() elapsed_time = end_time - start_time diff --git a/example/value_example.py b/example/value_example.py index 26e96f92..9eeb7a92 100644 --- a/example/value_example.py +++ b/example/value_example.py @@ -6,45 +6,48 @@ import logging import random +from timeit import default_timer as timer -def evaluate_value_network(env, agent, record, args): - evaluation_seed = args["evaluation_seed"] - max_steps_evaluation = args["max_steps_evaluation"] - if max_steps_evaluation == 0: - return +def evaluate_value_network(env, agent, args, record=None, total_steps=0): - episode_timesteps = 0 - episode_reward = 0 - episode_num = 0 - - env = gym.make(env.spec.id, render_mode="human") - state, _ = env.reset(seed=evaluation_seed) + number_eval_episodes = int(args["number_eval_episodes"]) + + state, _ = env.reset() + exploration_rate = args["exploration_min"] - for total_step_counter in range(int(max_steps_evaluation)): - episode_timesteps += 1 - - if random.random() < exploration_rate: - action = env.action_space.sample() - else: - action = agent.select_action_from_policy(state) - - state, reward, done, truncated, _ = env.step(action) - episode_reward += reward - - if done or truncated: - record.log( - Eval_steps = total_step_counter + 1, - Eval_episode= episode_num + 1, - Eval_timesteps=episode_timesteps, - Eval_reward= episode_reward - ) - - # Reset environment - state, _ = env.reset() - episode_reward = 0 - episode_timesteps = 0 - episode_num += 1 + for eval_episode_counter in range(number_eval_episodes): + episode_timesteps = 0 + episode_reward = 0 + episode_num = 0 + done = False + truncated = False + + while not done and not truncated: + episode_timesteps += 1 + + if random.random() < exploration_rate: + action = env.action_space.sample() + else: + action = agent.select_action_from_policy(state) + + state, reward, done, truncated, _ = env.step(action) + episode_reward += reward + + if done or truncated: + if record is not None: + record.log_eval( + total_steps=total_steps+1, + episode=eval_episode_counter+1, + episode_reward=episode_reward, + display=True + ) + + # Reset environment + state, _ = env.reset() + episode_reward = 0 + episode_timesteps = 0 + episode_num += 1 def value_based_train(env, agent, memory, record, args): @@ -53,6 +56,7 @@ def value_based_train(env, agent, memory, record, args): max_steps_training = args["max_steps_training"] exploration_min = args["exploration_min"] exploration_decay = args["exploration_decay"] + number_steps_per_evaluation = args["number_steps_per_evaluation"] batch_size = args["batch_size"] seed = args["seed"] @@ -61,10 +65,14 @@ def value_based_train(env, agent, memory, record, args): episode_timesteps = 0 episode_reward = 0 episode_num = 0 + + evaluate = False state, _ = env.reset(seed=seed) + exploration_rate = 1 + episode_start = time.time() for total_step_counter in range(int(max_steps_training)): episode_timesteps += 1 @@ -82,7 +90,6 @@ def value_based_train(env, agent, memory, record, args): episode_reward += reward if len(memory) > batch_size: - network_loss = 0 for _ in range(G): experience = memory.sample(batch_size) info = agent.train_policy(( @@ -93,24 +100,35 @@ def value_based_train(env, agent, memory, record, args): experience['done'] )) memory.update_priorities(experience['indices'], info) - network_loss += info['network_loss'].item() - - record.log( - Train_steps = total_step_counter + 1, - Train_episode= episode_num + 1, - Train_timesteps=episode_timesteps, - Train_reward= episode_reward, - 
network_loss = network_loss / G, - out=done or truncated - ) + # record.log_info(info, display=False) + + if (total_step_counter+1) % number_steps_per_evaluation == 0: + evaluate = True if done or truncated: + episode_time = time.time() - episode_start + record.log_train( + total_steps = total_step_counter + 1, + episode = episode_num + 1, + episode_steps=episode_timesteps, + episode_reward = episode_reward, + episode_time = episode_time, + display = True + ) + + if evaluate: + logging.info("*************--Evaluation Loop--*************") + args["evaluation_seed"] = seed + evaluate_value_network(env, agent, args, record=record, total_steps=total_step_counter) + logging.info("--------------------------------------------") + evaluate = False # Reset environment state, _ = env.reset() - episode_reward = 0 episode_timesteps = 0 + episode_reward = 0 episode_num += 1 + episode_start = time.time() end_time = time.time() elapsed_time = end_time - start_time
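
Usage sketch: a minimal, hedged example of the Record/plotter workflow this patch introduces. The import path is assumed from the file layout, and the task/algorithm names, step counts, and reward values are placeholders rather than anything taken from the patch.

# Hedged usage sketch: exercises Record.log_train / log_eval / save and the
# plotter hooks added in this patch. network=None skips model checkpointing;
# pass the agent instead to save models every checkpoint_frequency episodes.
from cares_reinforcement_learning.util.Record import Record  # import path assumed from file layout

args = {
    "task": "Pendulum-v1",       # placeholder task name
    "algorithm": "TD3",          # placeholder algorithm name
    "plot_frequency": 10,        # refresh the training figure every 10 logged episodes
    "checkpoint_frequency": 10,  # only used when a network is supplied
}

record = Record(network=None, config={"args": args})

total_steps = 0
for episode in range(1, 11):
    total_steps += 200
    record.log_train(
        total_steps=total_steps,
        episode=episode,
        episode_steps=200,
        episode_reward=-1500.0 + 100.0 * episode,  # dummy training rewards
        episode_time=1.0,
        display=True,
    )

# Two evaluation episodes logged at the same step so the eval plot has a mean and spread.
for eval_episode in range(1, 3):
    record.log_eval(
        total_steps=total_steps,
        episode=eval_episode,
        episode_reward=-400.0 - 50.0 * eval_episode,  # dummy evaluation rewards
        display=True,
    )

record.save()
# Outputs land under ~/cares_rl_logs/TD3-Pendulum-v1-<timestamp>/:
# config.yml, data/train.csv, data/eval.csv, figures/train.png, figures/eval.png

Separate runs can then be compared with the standalone plotter CLI added in this patch, e.g. (paths are placeholders, and the save directory must already contain a figures/ folder because plot_data writes to <save_directory>/figures/):

python cares_reinforcement_learning/util/plotter.py -s <save_directory> -d <run_dir_1> <run_dir_2> -w 20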