From c11acae80ccd614e9a9db89903683973d3553662 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 5 Aug 2024 22:23:43 +0200 Subject: [PATCH 01/37] Add Training State to APP Reinforcement Learning --- .../src/TrainingState.cpp | 91 ++++++++++++ .../src/TrainingState.h | 137 ++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 lib/APPReinforcementLearning/src/TrainingState.cpp create mode 100644 lib/APPReinforcementLearning/src/TrainingState.h diff --git a/lib/APPReinforcementLearning/src/TrainingState.cpp b/lib/APPReinforcementLearning/src/TrainingState.cpp new file mode 100644 index 00000000..91e91597 --- /dev/null +++ b/lib/APPReinforcementLearning/src/TrainingState.cpp @@ -0,0 +1,91 @@ +/* MIT License + * + * Copyright (c) 2023 - 2024 Andreas Merkle + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/******************************************************************************* + DESCRIPTION +*******************************************************************************/ +/** + * @brief Training state + * @author Akram Bziouech + */ + +/****************************************************************************** + * Includes + *****************************************************************************/ +#include "TrainingState.h" +#include +#include + +/****************************************************************************** + * Compiler Switches + *****************************************************************************/ + +/****************************************************************************** + * Macros + *****************************************************************************/ + +/****************************************************************************** + * Types and classes + *****************************************************************************/ + +/****************************************************************************** + * Prototypes + *****************************************************************************/ + +/****************************************************************************** + * Local Variables + *****************************************************************************/ + +/****************************************************************************** + * Public Methods + *****************************************************************************/ + +void TrainingState::entry() +{ + DifferentialDrive& diffDrive = DifferentialDrive::getInstance(); + diffDrive.setLinearSpeed(0, 0); + diffDrive.disable(); +} + +void TrainingState::process(StateMachine& sm) +{ + /* Nothing to do. */ + (void)sm; +} + +void TrainingState::exit() +{ + DifferentialDrive::getInstance().enable(); +} + +/****************************************************************************** + * Protected Methods + *****************************************************************************/ + +/****************************************************************************** + * Private Methods + *****************************************************************************/ + +/****************************************************************************** + * External Functions + *****************************************************************************/ diff --git a/lib/APPReinforcementLearning/src/TrainingState.h b/lib/APPReinforcementLearning/src/TrainingState.h new file mode 100644 index 00000000..737d7a89 --- /dev/null +++ b/lib/APPReinforcementLearning/src/TrainingState.h @@ -0,0 +1,137 @@ +/* MIT License + * + * Copyright (c) 2023 - 2024 Andreas Merkle + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/******************************************************************************* + DESCRIPTION +*******************************************************************************/ +/** + * @brief Training state + * @author Akram Bziouech + * + * @addtogroup Application + * + * @{ + */ + +#ifndef TRAINING_STATE_H +#define TRAINING_STATE_H + +/****************************************************************************** + * Compile Switches + *****************************************************************************/ + +/****************************************************************************** + * Includes + *****************************************************************************/ +#include +#include +#include + +/****************************************************************************** + * Macros + *****************************************************************************/ + +/****************************************************************************** + * Types and Classes + *****************************************************************************/ + +/** The system Training state. */ +class TrainingState : public IState +{ +public: + /** + * Get state instance. + * + * @return State instance. + */ + static TrainingState& getInstance() + { + static TrainingState instance; + + /* Singleton idiom to force initialization during first usage. */ + + return instance; + } + + /** + * If the state is entered, this method will called once. + */ + void entry() final; + + /** + * Processing the state. + * + * @param[in] sm State machine, which is calling this state. + */ + void process(StateMachine& sm) final; + + /** + * If the state is left, this method will be called once. + */ + void exit() final; + +protected: +private: + /** + * Default constructor. + */ + TrainingState() + { + } + + /** + * Default destructor. + */ + ~TrainingState() + { + } + + /** + * Copy construction of an instance. + * Not allowed. + * + * @param[in] state Source instance. + */ + TrainingState(const TrainingState& state); + + /** + * Assignment of an instance. + * Not allowed. + * + * @param[in] state Source instance. + * + * @returns Reference to Training instance. 
+ */ + TrainingState& operator=(const TrainingState& state); +}; + +/****************************************************************************** + * Functions + *****************************************************************************/ + +#endif /* TRAINING_STATE_H */ +/** @} */ + +/****************************************************************************** + * Local Functions + *****************************************************************************/ From 8a75912b3e5201e260fdc95e8685dd483200a109 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 5 Aug 2024 22:28:13 +0200 Subject: [PATCH 02/37] Handle failure to send data over the serial Mux protocol --- lib/APPReinforcementLearning/src/App.cpp | 61 +++++++++++++----------- lib/APPReinforcementLearning/src/App.h | 9 ++-- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index 2f469d32..623ff636 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -42,6 +42,7 @@ #include #include "ReadyState.h" #include +#include "TrainingState.h" /****************************************************************************** * Compiler Switches @@ -123,8 +124,7 @@ void App::loop() payload = {SMPChannelPayload::Status::DONE}; } - /* Ignoring return value, as error handling is not available. */ - (void)m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); + m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); m_statusTimer.restart(); } @@ -133,7 +133,23 @@ void App::loop() if (true == m_sendLineSensorsDataInterval.isTimeout() && (&DrivingState::getInstance() == m_systemStateMachine.getState())) { - sendLineSensorsData(); + ILineSensors& lineSensors = Board::getInstance().getLineSensors(); + uint8_t maxLineSensors = lineSensors.getNumLineSensors(); + const uint16_t* lineSensorValues = lineSensors.getSensorValues(); + uint8_t lineSensorIdx = 0U; + LineSensorData payload; + + if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) + { + while (maxLineSensors > lineSensorIdx) + { + payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx]; + + ++lineSensorIdx; + } + } + + m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); m_sendLineSensorsDataInterval.restart(); } @@ -148,13 +164,19 @@ void App::loop() SMPChannelPayload::Mode payload = (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; - /* Ignoring return value, as error handling is not available. */ - (void)m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); + m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); m_modeSelectionSent = true; } } + if (false == m_data_sent) + { + /* Failed to send data to the supervisor. Go to error state. */ + ErrorState::getInstance().setErrorMsg("DSF"); + m_systemStateMachine.setState(&ErrorState::getInstance()); + } + m_smpServer.process(millis()); m_systemStateMachine.process(); @@ -174,6 +196,13 @@ void App::handleRemoteCommand(const Command& cmd) m_modeSelectionSent = false; break; + case SMPChannelPayload::CmdId::CMD_ID_SET_TRAINING_STATE: + Odometry::getInstance().clearPosition(); + Odometry::getInstance().clearMileage(); + + m_systemStateMachine.setState(&TrainingState::getInstance()); + break; + default: /* Nothing to do. 
*/ break; @@ -188,28 +217,6 @@ void App::handleRemoteCommand(const Command& cmd) * Private Methods *****************************************************************************/ -void App::sendLineSensorsData() const -{ - ILineSensors& lineSensors = Board::getInstance().getLineSensors(); - uint8_t maxLineSensors = lineSensors.getNumLineSensors(); - const uint16_t* lineSensorValues = lineSensors.getSensorValues(); - uint8_t lineSensorIdx = 0U; - LineSensorData payload; - - if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) - { - while (maxLineSensors > lineSensorIdx) - { - payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx]; - - ++lineSensorIdx; - } - } - - /* Ignoring return value, as error handling is not available. */ - (void)m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); -} - bool App::setupSerialMuxProt() { bool isSuccessful = false; diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index 8615bfa0..4d73fc4a 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -73,6 +73,7 @@ class App m_statusTimer(), m_sendLineSensorsDataInterval(), m_modeSelectionSent(false), + m_data_sent(true), m_smpServer(Serial, this) { } @@ -141,6 +142,9 @@ class App /** Ensue that the mode is only sent once */ bool m_modeSelectionSent; + /** check if the data has been sent */ + bool m_data_sent; + /** * Setup the SerialMuxProt channels. * @@ -148,11 +152,6 @@ class App */ bool setupSerialMuxProt(); - /** - * Send line sensors data via SerialMuxProt. - */ - void sendLineSensorsData() const; - /** * Copy construction of an instance. * Not allowed. From e79d46fffc255202acf833dbd9fb153f7969ede0 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 5 Aug 2024 22:36:29 +0200 Subject: [PATCH 03/37] Delete unnecessary functions and add a timer to handle state changes --- .../src/ReadyState.cpp | 57 ++----------------- lib/APPReinforcementLearning/src/ReadyState.h | 21 +------ 2 files changed, 7 insertions(+), 71 deletions(-) diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp index 5e17cd6e..ec720077 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.cpp +++ b/lib/APPReinforcementLearning/src/ReadyState.cpp @@ -59,13 +59,6 @@ * Local Variables *****************************************************************************/ -const uint16_t ReadyState::SENSOR_VALUE_MAX = Board::getInstance().getLineSensors().getSensorValueMax(); - -/* Initialize the required sensor IDs to be generic. */ -const uint8_t ReadyState::SENSOR_ID_MOST_LEFT = 0U; -const uint8_t ReadyState::SENSOR_ID_MIDDLE = Board::getInstance().getLineSensors().getNumLineSensors() / 2U; -const uint8_t ReadyState::SENSOR_ID_MOST_RIGHT = Board::getInstance().getLineSensors().getNumLineSensors() - 1U; - /****************************************************************************** * Public Methods *****************************************************************************/ @@ -87,6 +80,7 @@ void ReadyState::entry() display.print(m_lapTime); display.print("ms"); } + m_stateTransitionTimer.start(m_state_Transition_period); m_modeTimeoutTimer.start(mode_selected_period); m_mode = IDLE; m_isLastStartStopLineDetected = false; @@ -99,11 +93,6 @@ void ReadyState::process(StateMachine& sm) IButton& buttonA = Board::getInstance().getButtonA(); IButton& buttonB = Board::getInstance().getButtonB(); - /* Get each sensor value. 
*/ - ILineSensors& lineSensors = Board::getInstance().getLineSensors(); - uint8_t maxLineSensors = lineSensors.getNumLineSensors(); - const uint16_t* lineSensorValues = lineSensors.getSensorValues(); - /* Shall the driving mode be released? */ if (true == Util::isButtonTriggered(buttonA, m_isButtonAPressed)) { @@ -132,17 +121,11 @@ void ReadyState::process(StateMachine& sm) ; } - /**Drive forward until START LINE is crossed */ - driveUntilStartLineisCrossed(); - - if (false == (isStartStopLineDetected(lineSensorValues, maxLineSensors)) && (true == m_isLastStartStopLineDetected)) + /* Set DrivingState */ + if (true == m_stateTransitionTimer.isTimeout()) { sm.setState(&DrivingState::getInstance()); } - else - { - m_isLastStartStopLineDetected = isStartStopLineDetected(lineSensorValues, maxLineSensors); - } } void ReadyState::exit() @@ -174,45 +157,13 @@ ReadyState::ReadyState() : m_isButtonAPressed(false), m_isButtonBPressed(false), m_modeTimeoutTimer(), + m_stateTransitionTimer(), m_lapTime(0), m_isLastStartStopLineDetected(false), m_mode(IDLE) { } -/**Drive forward until START LINE is crossed */ -void ReadyState::driveUntilStartLineisCrossed() -{ - DifferentialDrive& diffDrive = DifferentialDrive::getInstance(); - int16_t top_speed = 2000; /* Set a top speed of 2000 */ - int16_t leftMotor = top_speed / 2U; /* Drive at half speed */ - int16_t rightMotor = top_speed / 2U; /* Drive at half speed */ - diffDrive.setLinearSpeed(leftMotor, rightMotor); -} - -bool ReadyState::isStartStopLineDetected(const uint16_t* lineSensorValues, uint8_t length) const -{ - bool isDetected = false; - const uint32_t LINE_MAX_30 = (SENSOR_VALUE_MAX * 3U) / 10U; /* 30 % of max. value */ - const uint32_t LINE_MAX_70 = (SENSOR_VALUE_MAX * 7U) / 10U; /* 70 % of max. value */ - - /* - * === = === - * + + + + + - * L M R - */ - if ((LINE_MAX_30 <= lineSensorValues[SENSOR_ID_MOST_LEFT]) && - (LINE_MAX_70 > lineSensorValues[SENSOR_ID_MIDDLE - 1U]) && - (LINE_MAX_70 <= lineSensorValues[SENSOR_ID_MIDDLE]) && - (LINE_MAX_70 > lineSensorValues[SENSOR_ID_MIDDLE + 1U]) && - (LINE_MAX_30 <= lineSensorValues[SENSOR_ID_MOST_RIGHT])) - { - isDetected = true; - } - - return isDetected; -} - /****************************************************************************** * External Functions *****************************************************************************/ diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h index 772003cd..76c59ebf 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.h +++ b/lib/APPReinforcementLearning/src/ReadyState.h @@ -117,8 +117,9 @@ class ReadyState : public IState }; /** Duration of the selected mode in ms. This is the maximum time to select a mode. */ - static const uint32_t mode_selected_period = 200U; + static const uint32_t mode_selected_period = 100; + static const uint32_t m_state_Transition_period = 150; /** * The line sensor threshold (normalized) used to detect the track. * The track is detected in case the received value is greater or equal than @@ -153,7 +154,7 @@ class ReadyState : public IState uint32_t m_lapTime; /**< Lap time in ms of the last successful driven round. */ bool m_isButtonAPressed; /**< Is the button A pressed (last time)? */ bool m_isButtonBPressed; /**< Is the button B pressed (last time)? */ - + SimpleTimer m_stateTransitionTimer; /**< Timer to handle State changes */ /** * Default constructor. */ @@ -183,22 +184,6 @@ class ReadyState : public IState * @returns Reference to ErrorState instance. 
*/ ReadyState& operator=(const ReadyState& state); - - /* - * The robot moves from its current position - * until it crosses and leaves the start line. - */ - void driveUntilStartLineisCrossed(); - - /** - * Is the start/stop line detected? - * - * @param[in] lineSensorValues The line sensor values as array. - * @param[in] length The number of line sensor values. - * - * @return If start/stop line detected, it will return true otherwise false. - */ - bool isStartStopLineDetected(const uint16_t* lineSensorValues, uint8_t length) const; }; /****************************************************************************** From cd3542ba1c6eddaa848124e4f564de5d725ac5f5 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 5 Aug 2024 22:40:52 +0200 Subject: [PATCH 04/37] Add constraint to SetSpeedTarget function to limit motor speeds between 0 and maxSpeed. --- lib/APPReinforcementLearning/src/DrivingState.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/APPReinforcementLearning/src/DrivingState.cpp b/lib/APPReinforcementLearning/src/DrivingState.cpp index 5ef59c6c..c95bb6aa 100644 --- a/lib/APPReinforcementLearning/src/DrivingState.cpp +++ b/lib/APPReinforcementLearning/src/DrivingState.cpp @@ -79,9 +79,15 @@ void DrivingState::exit() m_observationTimer.stop(); } -void DrivingState::setTargetSpeeds(int16_t leftMotor, int16_t rightMotor) +void DrivingState::setTargetSpeeds(int16_t leftSpeed, int16_t rightSpeed) { - DifferentialDrive::getInstance().setLinearSpeed(leftMotor, rightMotor); + DifferentialDrive& diffDrive = DifferentialDrive::getInstance(); + const int16_t MAX_MOTOR_SPEED = diffDrive.getMaxMotorSpeed(); + const int16_t MIN_MOTOR_SPEED = 0; + leftSpeed = constrain(leftSpeed, MIN_MOTOR_SPEED, MAX_MOTOR_SPEED); + rightSpeed = constrain(rightSpeed, MIN_MOTOR_SPEED, MAX_MOTOR_SPEED); + + DifferentialDrive::getInstance().setLinearSpeed(leftSpeed, rightSpeed); } bool DrivingState::isAbortRequired() From e79697f4062ef63a8767e91bc72f310153097e93 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 5 Aug 2024 22:45:02 +0200 Subject: [PATCH 05/37] Add Agent, TrajectoryBuffer, and Networks classes, and integrate them into the rl_supervisor. 
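
The Agent added here implements a PPO-style update: trajectories collected by
the supervisor are stored in the trajectory buffer, advantages are computed
with Generalized Advantage Estimation (GAE), and the actor/critic networks are
then optimized with a clipped surrogate loss. As a rough, illustrative sketch
only (not the committed code), the advantage computation boils down to the
following; gamma=0.99 and gae_lambda=0.95 match the Agent defaults introduced
below:

    import numpy as np

    def gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
        """Sketch of GAE; mirrors Agent.calculate_advantages() below."""
        advantages = np.zeros(len(rewards), dtype=np.float32)
        for t in range(len(rewards)):
            discount = 1.0
            advantage = 0.0
            for k in range(t, len(rewards) - 1):
                # One-step temporal-difference error of the critic estimate.
                delta = (rewards[k]
                         + gamma * values[k + 1] * (1 - int(dones[k]))
                         - values[k])
                advantage += discount * delta
                discount *= gamma * gae_lambda
                if dones[k]:
                    break
            advantages[t] = advantage
        return advantages

    # Example with a short, made-up trajectory.
    print(gae_advantages([1.0, 2.0, 0.5], [0.8, 1.5, 0.7], [False, False, True]))
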
--- webots/controllers/RL_Supervisor/agent.py | 501 ++++++++++++++++++ webots/controllers/RL_Supervisor/networks.py | 106 ++++ .../RL_Supervisor/rl_supervisor.py | 162 +++--- .../RL_Supervisor/trajectory_buffer.py | 143 +++++ 4 files changed, 849 insertions(+), 63 deletions(-) create mode 100644 webots/controllers/RL_Supervisor/agent.py create mode 100644 webots/controllers/RL_Supervisor/networks.py create mode 100644 webots/controllers/RL_Supervisor/trajectory_buffer.py diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py new file mode 100644 index 00000000..a6422069 --- /dev/null +++ b/webots/controllers/RL_Supervisor/agent.py @@ -0,0 +1,501 @@ +""" Implementation ofintelligent agent.""" + +# MIT License +# +# Copyright (c) 2023 - 2024 Andreas Merkle +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + + +################################################################################ +# Imports +################################################################################ + +import struct +import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp +from trajectory_buffer import Memory +from networks import Networks + +################################################################################ +# Variables +################################################################################ + +# Constants +COMMAND_CHANNEL_NAME = "CMD" +CMD_DLC = 1 + +SPEED_SET_CHANNEL_NAME = "SPEED_SET" +SPEED_SET_DLC = 4 + +LINE_SENSOR_CHANNEL_NAME = "LINE_SENS" +LINE_SENSOR_ON_TRACK_MIN_VALUE = 200 + +STATUS_CHANNEL_NAME = "STATUS" +STATUS_CHANNEL_ERROR_VAL = 1 + +MODE_CHANNEL_NAME = "MODE" +CMD_ID_SET_READY_STATE = 1 +CMD_ID_SET_TRAINING_STATE = 2 +POSITION_DATA = [-0.24713614078815466, 0.01, 0.013994298332013683] +ORIENTATION_DATA = [ + -1.0564747468923541e-06, + 8.746699709178704e-07, + 0.9999999999990595, + 1.5880805820884731, +] +MAX_SENSOR_VALUE = 1000 +MIN_STD_DEV = 0.1 +STD_DEV_FACTOR = 0.3995 + +################################################################################ +# Classes +################################################################################ + + +class Agent: # pylint: disable=too-many-instance-attributes + """The Agent class represents an intelligent agent that makes decisions to + control motors based on the position of the robot.""" + + # pylint: disable=too-many-arguments + def __init__( + self, + smp_server, + gamma=0.99, + alpha=0.0003, + gae_lambda=0.95, + policy_clip=0.2, + batch_size=64, + chkpt_dir="models/", + top_speed=2000, + max_buffer_length=600, + ): + self.__serialmux = smp_server + self.__gamma = gamma + self.__alpha = alpha + self.__gae_lambda = gae_lambda + self.__policy_clip = policy_clip + self.__chkpt_dir = chkpt_dir + self.train_mode = False + self.__top_speed = top_speed + self.__memory = Memory(batch_size, max_buffer_length) + self.__neural_network = Networks(self.__alpha) + self.__training_index = 0 # Track batch index during training + self.__current_batch = None # Saving of the current batch which is in process + self.__std_dev = 1 + self.done = False + self.action = None + self.value = None + self.adjusted_log_prob = None + self.num_episodes = 0 + self.state = "IDLE" + self.data_sent = True + self.unsent_data = [] + + def set_train_mode(self): + """Set the Agent mode to train mode.""" + self.train_mode = True + self.state = "READY" + + def set_drive_mode(self): + """Set the Agent mode to drive mode.""" + self.train_mode = False + self.state = "READY" + self.num_episodes = 0 + + def store_transition( + self, state, action, probs, vals, reward, done + ): # pylint: disable=too-many-arguments + """Store transitions in the replay buffer.""" + + self.__memory.store_memory(state, action, probs, vals, reward, done) + + def save_models(self): + """Saves the models in the specified file.""" + + self.__neural_network.actor_network.save(self.__chkpt_dir + "actor.keras") + self.__neural_network.critic_network.save(self.__chkpt_dir + "critic.keras") + + def load_models(self): + """Loads the models in the specified file.""" + + self.__neural_network.actor_network = tf.keras.models.load_model( + self.__chkpt_dir + "actor.keras", compile=False + ) + self.__neural_network.critic_network = tf.keras.models.load_model( + self.__chkpt_dir + "critic.keras", compile=False + ) + + def predict_action(self, state): + """Predicts an 
action based on the current state.""" + + # Conversion of the state into a tensor + m_state = self.normalize_sensor_data(state) + state = tf.convert_to_tensor([m_state], dtype=tf.float32) + + # Calculation of probabilities by the Actor neural network + probs = self.__neural_network.actor_network(state) + + # Create a normal distribution with the calculated probabilities and the standard deviation + dist = tfp.distributions.Normal(probs, self.__std_dev) + + # Sampling an action from the normal distribution + sampled_action = dist.sample() + + # Apply the Tanh transformation to the sampled action + transformed_action = tf.tanh(sampled_action) + + # Calculation of the logarithm of the probability density of the sampled action + log_prob = dist.log_prob(sampled_action) + + # Calculation of the Jacobian determinant for the Tanh transformation + jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6) + + # Calculation of Adjusted probabilities by the neural network + adjusted_log_prob = log_prob - jacobian_log_det + + # calculate the estimated value of a state, which is determined by the Critic network + value = self.__neural_network.critic_network(state) + + self.action = transformed_action.numpy()[0] + self.value = value.numpy()[0] + self.adjusted_log_prob = adjusted_log_prob.numpy()[0] + + return self.action + + def send_motor_speeds(self, state): + """Sends the motor speeds to the robot.""" + + pre_action = self.predict_action(state) + + # Get motor speed difference + speeddifference = self.__top_speed * pre_action + + # Get individual motor speeds. The sign of speedDifference + # determines if the robot turns left or right. + left_motor_speed = int(self.__top_speed - speeddifference) + right_motor_speed = int(self.__top_speed + speeddifference) + + control_data = struct.pack("2H", left_motor_speed, right_motor_speed) + self.data_sent = self.__serialmux.send_data("SPEED_SET", control_data) + + # Failed to send data. Appends the data to unsent_data List. + if self.data_sent is False: + self.unsent_data.append(("SPEED_SET", control_data)) + + def update(self, robot_node): + """Checks if the sequence has ended, performs updates, and saves the models.""" + + # Checks whether the sequence has ended if it is set to Training mode. + if (self.train_mode is True) and ( + (self.done is True) or (self.__memory.is_memory_full() is True) + ): + cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE) + self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) + + # Failed to send data. Appends the data to unsent_data List + if self.data_sent is False: + self.unsent_data.append(("CMD", cmd_payload)) + + self.reinitialize(robot_node) + self.state = "TRAINING" + + # save models + if self.__memory.get_sum_rewards() >= 1000.0: + self.save_models() + + # Checks whether the sequence has ended if it is set to driving mode. + if (self.train_mode is False) and (self.done is True): + self.done = False + motorcontrol = struct.pack("2H", 0, 0) + cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) + + self.data_sent = self.__serialmux.send_data( + "SPEED_SET", motorcontrol + ) # stop the motors immediately + # Failed to send data. Appends the data to unsent_data List + if self.data_sent is False: + self.unsent_data.append(("SPEED_SET", motorcontrol)) + + self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) + + # Failed to send data. 
Appends the data to unsent_data List + if self.data_sent is False: + self.unsent_data.append(("CMD", cmd_payload)) + + def normalize_sensor_data(self, sensor_data): + """The normalize_sensor_data function scales the sensor data to a range between 0 and 1.""" + + # Normalized sensor data + sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE + + return sensor_data + + def calculate_reward(self, sensor_data): + """ + The calculate_reward function evaluates the consequences of a certain + action performed in a certain state by calculating the resulting reward + """ + estimated_pos = self.calculate_position(sensor_data) + + # Return reward between 0 and 10 + if 500 <= estimated_pos <= 2000: + reward = ((1 / 150) * estimated_pos) - (10 / 3) + return reward + + if 2000 < estimated_pos <= 3500: + reward = ((-1 / 150) * estimated_pos) + (70 / 3) + return reward + + return 0 + + def calculate_position(self, sensor_data): + """ + Determines the deviation and returns an estimated position of the robot + with respect to a line. The estimate is made using a weighted average of + the sensor indices multiplied by 1000, so that a return value of 0 + indicates that the line is directly below sensor 0, a return value of + 1000 indicates that the line is directly below sensor 1, 2000 + indicates that it's below sensor 2000, etc. Intermediate values + indicate that the line is between two sensors. The formula is: + + 0*value0 + 1000*value1 + 2000*value2 + ... + -------------------------------------------- + value0 + value1 + value2 + ... + + This function assumes a dark line (high values) surrounded by white + (low values). + + Parameters + ---------- + sensor_data : List + + Returns + ---------- + Estimated position with respect to track. + """ + + estimated_pos = 0 + numerator = 0 + denominator = 0 + weight = 1000 + is_on_line = False + + max_sensors = len(sensor_data) + sensor_max_value = 1000 + last_pos_value = 0 + + for idx, sensor_value in enumerate(sensor_data): + numerator += idx * weight * sensor_value + denominator += sensor_value + + # Keep track of whether we see the line at all. + if LINE_SENSOR_ON_TRACK_MIN_VALUE < sensor_value: + is_on_line = True + + if False is is_on_line: + + # If it last read to the left of center, return 0. + if last_pos_value < ((max_sensors - 1) * sensor_max_value) / 2: + estimated_pos = 0 + + # If it last read to the right of center, return the max. value. + else: + estimated_pos = (max_sensors - 1) * sensor_max_value + + else: + # Check to avoid division by zero.. 
+ if denominator != 0: + estimated_pos = numerator / denominator + + last_pos_value = estimated_pos + + return estimated_pos + + def calculate_advantages(self, rewards, values, dones): + """Calculate advantages for each state in a mini-batch.""" + + mini_batch_size = len(rewards) + + # Create empty advantages array + advantages = np.zeros(mini_batch_size, dtype=np.float32) + + for start_index in range(mini_batch_size): + discount = 1 + advantage = 0 + + for future_index in range(start_index, mini_batch_size - 1): + + # Calculate the temporal difference (TD) + delta = ( + rewards[future_index] + + ( + self.__gamma + * values[future_index + 1] + * (1 - int(dones[future_index])) + ) + - values[future_index] + ) + + # Accumulate the advantage using the discount factor + advantage += discount * delta + + # Update the discount factor for the next step + discount *= self.__gamma * self.__gae_lambda + + # Stop if a terminal state is reached + if dones[future_index]: + break + + # Save the calculated advantage for the current state + advantages[start_index] = advantage + + return advantages + + def learn( + self, states, actions, old_probs, values, rewards, dones + ): # pylint: disable=too-many-arguments + # pylint: disable=too-many-locals + """Perform training to optimize model weights""" + + advantages = self.calculate_advantages(rewards, values, dones) + + # optimize Actor Network weights + with tf.GradientTape() as tape: + states = tf.convert_to_tensor(states) + actions = tf.convert_to_tensor(actions) + old_probs = tf.convert_to_tensor(old_probs) + + # Predicts an action based on the Saved states + predict = self.__neural_network.actor_network(states) + new_dist = tfp.distributions.Normal(predict, 0.1) + new_log_prob = new_dist.log_prob(actions) + transformed_action = tf.tanh(actions) + jacobian_log_det = tf.math.log(1 - tf.square(transformed_action)) + adjusted_new_log_prob = new_log_prob - jacobian_log_det + + # The ratio between the new model and the old model’s action log probabilities + prob_ratio = tf.exp(adjusted_new_log_prob - old_probs) + + # If the ratio is too large or too small, it will be + # clipped according to the surrogate function. + weighted_probs = prob_ratio * advantages + clipped_probs = tf.clip_by_value( + prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip + ) + weighted_clipped_probs = clipped_probs * advantages + + # Policy Gradient Loss + actor_loss = -tf.reduce_mean( + tf.minimum(weighted_probs, weighted_clipped_probs) + ) + + # calculate gradient + actor_params = self.__neural_network.actor_network.trainable_variables + actor_grads = tape.gradient(actor_loss, actor_params) + self.__neural_network.actor_optimizer.apply_gradients( + zip(actor_grads, actor_params) + ) + + # optimize Critic Network weights + with tf.GradientTape() as tape: + + critic_value = self.__neural_network.critic_network(states) + critic_value = tf.squeeze(critic_value, 1) + returns = advantages + values + + # Generate loss + critic_loss = tf.keras.losses.MSE(critic_value, returns) + + # calculate gradient + critic_params = self.__neural_network.critic_network.trainable_variables + critic_grads = tape.gradient(critic_loss, critic_params) + self.__neural_network.critic_optimizer.apply_gradients( + zip(critic_grads, critic_params) + ) + + def perform_training(self): + """Runs the training process.""" + + if self.__current_batch is None: + + # Grab sample from memory + self.__current_batch = self.__memory.generate_batches() + + # Perform training with mini batchtes. 
+ if self.__training_index < len(self.__current_batch[-1]): + ( + state_arr, + action_arr, + old_prob_arr, + vals_arr, + reward_arr, + dones_arr, + batches, + ) = self.__current_batch + batch = batches[self.__training_index] + + self.learn( + state_arr[batch], + action_arr[batch], + old_prob_arr[batch], + vals_arr[batch], + reward_arr[batch], + dones_arr[batch], + ) + self.__training_index += 1 + + # Training completed. + else: + self.__training_index = 0 + self.__current_batch = None + self.done = False + self.__memory.clear_memory() + self.state = "IDLE" + self.num_episodes += 1 + cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) + self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) + + # Failed to send data. Appends the data to unsent_data List + if self.data_sent is False: + self.unsent_data.append(("CMD", cmd_payload)) + + # Minimize standard deviation until the minimum standard deviation is reached + self.__std_dev = self.__std_dev * STD_DEV_FACTOR + self.__std_dev = max(self.__std_dev, MIN_STD_DEV) + + def reinitialize(self, robot_node): + """Re-initialization of position and orientation of The ROBOT.""" + trans_field = robot_node.getField("translation") + rot_field = robot_node.getField("rotation") + initial_position = POSITION_DATA + initial_orientation = ORIENTATION_DATA + trans_field.setSFVec3f(initial_position) + rot_field.setSFRotation(initial_orientation) + + +################################################################################ +# Functions +################################################################################ + +################################################################################ +# Main +################################################################################ diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py new file mode 100644 index 00000000..0328d60b --- /dev/null +++ b/webots/controllers/RL_Supervisor/networks.py @@ -0,0 +1,106 @@ +"""Implementing Actor and Critic Networks""" + +# MIT License +# +# Copyright (c) 2023 - 2024 Andreas Merkle +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# + + +################################################################################ +# Imports +################################################################################ + +import tensorflow as tf +from tensorflow.keras import layers # pylint: disable=import-error + +################################################################################ +# Variables +################################################################################ + + +################################################################################ +# Classes +################################################################################ + + +class Networks: + """Class for building networks of actors and critics""" + + def __init__(self, alpha): + self.__learning_rate = alpha + self.actor_network = self.build_actor_network() + self.critic_network = self.build_critic_network() + self.actor_optimizer = tf.keras.optimizers.Adam(self.__learning_rate) + self.critic_optimizer = tf.keras.optimizers.Adam(self.__learning_rate) + + def build_actor_network(self): + """Build Actor Network.""" + + state_input = layers.Input(shape=(5,)) # Assuming 5 sensor inputs + fc1 = layers.Dense( + 64, + activation="relu", + kernel_initializer="he_normal", + bias_initializer="zeros", + )(state_input) + fc2 = layers.Dense( + 64, + activation="relu", + kernel_initializer="he_normal", + bias_initializer="zeros", + )(fc1) + mu = layers.Dense( + 1, + activation="tanh", + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + )(fc2) + + return tf.keras.models.Model(inputs=state_input, outputs=mu) + + def build_critic_network(self): + """Build Critic Network""" + + state_input = layers.Input(shape=(5,)) # Assuming 5 sensor inputs + fc1 = layers.Dense( + 64, + activation="relu", + kernel_initializer="he_normal", + bias_initializer="zeros", + )(state_input) + fc2 = layers.Dense( + 64, + activation="relu", + kernel_initializer="he_normal", + bias_initializer="zeros", + )(fc1) + value = layers.Dense(1)(fc2) # Value output + + return tf.keras.models.Model(inputs=state_input, outputs=value) + + +################################################################################ +# Functions +################################################################################ + +################################################################################ +# Main +################################################################################ diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index 309e9bd0..90880fe9 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -34,9 +34,11 @@ import sys import struct +import os from controller import Supervisor # pylint: disable=import-error from serial_webots import SerialWebots from SerialMuxProt import Server +from agent import Agent ################################################################################ # Variables @@ -62,14 +64,12 @@ MODE_CHANNEL_NAME = "MODE" CMD_ID_SET_READY_STATE = 1 -POSITION_DATA = [-0.24713614078815466, -0.04863962992854465, 0.013994298332013683] -ORIENTATION_DATA = [ - -1.0564747468923541e-06, - 8.746699709178704e-07, - 0.9999999999990595, - 1.5880805820884731, -] +MIN_NUMBER_OF_STEPS = 200 +SENSOR_ID_MOST_LEFT = 0 +SENSOR_ID_MOST_RIGHT = 4 +# Path of saved models +PATH = "models/" ################################################################################ # Classes @@ -79,51 +79,28 @@ class RobotController: 
"""Class for data flow control logic""" - def __init__(self, robot_node, smp_server, tick_size): - self.__robot_node = robot_node + def __init__(self, smp_server, tick_size, agent): self.__smp_server = smp_server + self.__agent = agent self.__tick_size = tick_size self.__no_line_detection_count = 0 self.__timestamp = 0 # Elapsed time since reset [ms] - self.is_sequence_complete = False - - def reinitialize(self): - """Re-initialization of position and orientation of The ROBOT.""" - trans_field = self.__robot_node.getField("translation") - rot_field = self.__robot_node.getField("rotation") - initial_position = POSITION_DATA - initial_orientation = ORIENTATION_DATA - trans_field.setSFVec3f(initial_position) - rot_field.setSFRotation(initial_orientation) - - def callback_status(self, payload: bytearray) -> int: + self.last_sensor_data = None + self.steps = 0 + + def callback_status(self, payload: bytearray) -> None: """Callback Status Channel.""" # perform action on robot status feedback if payload[0] == STATUS_CHANNEL_ERROR_VAL: print("robot has reached error-state (max. lap time passed in robot)") - # stop robot motors - self.__smp_server.send_data(SPEED_SET_CHANNEL_NAME, struct.pack("2H", 0, 0)) - - # switch robot to ready state - self.__smp_server.send_data( - COMMAND_CHANNEL_NAME, struct.pack("B", CMD_ID_SET_READY_STATE) - ) - - # perform robot state reset - self.is_sequence_complete = True + self.__agent.done = 1 def callback_line_sensors(self, payload: bytearray) -> None: """Callback LINE_SENS Channel.""" sensor_data = struct.unpack("5H", payload) - - for idx in range(5): - if idx == 4: - print(f"Sensor[{idx}] = {sensor_data[idx]}") - else: - print(f"Sensor[{idx}] = {sensor_data[idx]}, ", end="") - + self.steps += 1 # determine lost line condition if all(value == 0 for value in sensor_data): self.__no_line_detection_count += 1 @@ -134,34 +111,67 @@ def callback_line_sensors(self, payload: bytearray) -> None: is_start_stop = all( value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data ) + # Detect Start/Stop Line before Finish Trajectories + if (is_start_stop is True) and (self.steps < MIN_NUMBER_OF_STEPS): + sensor_data = list(sensor_data) + sensor_data[SENSOR_ID_MOST_LEFT] = 0 + sensor_data[SENSOR_ID_MOST_RIGHT] = 0 + + # sequence stop criterion debounce no line detection and start/stop line detected + if self.__no_line_detection_count >= 30 or ( + is_start_stop and (self.steps >= MIN_NUMBER_OF_STEPS) + ): + self.__agent.done = True + self.__no_line_detection_count = 0 - # sequence stop criterion debounce no line detection and - if (self.__no_line_detection_count >= 20) or is_start_stop: - # Stop motors, maximum NO Line Detection Counter reached - self.__smp_server.send_data(SPEED_SET_CHANNEL_NAME, struct.pack("2H", 0, 0)) - - # SENDING A COMMAND ID SET READY STATUS - self.__smp_server.send_data( - COMMAND_CHANNEL_NAME, struct.pack("B", CMD_ID_SET_READY_STATE) - ) - self.is_sequence_complete = True - - else: - # [PLACEHOLDER] replace with line sensor data receiver - - # Set left and right motors speed to 1000 - self.__smp_server.send_data( - SPEED_SET_CHANNEL_NAME, struct.pack("2H", 1000, 1000) - ) + # The sequence of states and actions is stored in memory for the training phase. 
+ if self.__agent.train_mode: + reward = self.__agent.calculate_reward(sensor_data) + + # Start storage The data after the second received sensor data + if self.last_sensor_data is not None: + normalized_sensor_data = self.__agent.normalize_sensor_data(sensor_data) + self.__agent.store_transition( + normalized_sensor_data, + self.__agent.action, + self.__agent.adjusted_log_prob, + self.__agent.value, + reward, + self.__agent.done, + ) + self.last_sensor_data = sensor_data + + # Sends the motor speeds to the robot. + if self.__agent.done is False and self.__agent.state == "READY": + self.__agent.send_motor_speeds(sensor_data) def callback_mode(self, payload: bytearray) -> None: """Callback MODE Channel.""" driving_mode = payload[0] if driving_mode: - print("Driving Mode Selected.") + self.__agent.set_drive_mode() + else: - print("Train Mode Selected.") + self.__agent.set_train_mode() + + def load_models(self, path) -> None: + """Load Model if exist""" + if os.path.exists(path): + self.__agent.load_models() + + def retry_unsent_data(self, unsent_data: list) -> bool: + """Resent any unsent Data""" + retry_succesful = True + + # Resent the unsent Data. + for data in unsent_data[:]: + if self.__smp_server.send_data(data[0], data[1]) is True: + unsent_data.remove(data) + else: + retry_succesful = False + + return retry_succesful def process(self): """function performing controller step""" @@ -177,6 +187,7 @@ def process(self): # pylint: disable=duplicate-code +# pylint: disable=too-many-branches def main_loop(): """Main loop: - Perform simulation steps until Webots is stopping the controller. @@ -229,9 +240,11 @@ def main_loop(): if sermux_channel_cmd_id == 0: print("ERROR: channel CMD not created.") status = -1 + # create instance of intelligence Agent + agent = Agent(smp_server) # create instance of robot logic class - controller = RobotController(robot_node, smp_server, timestep) + controller = RobotController(smp_server, timestep, agent) smp_server.subscribe_to_channel(STATUS_CHANNEL_NAME, controller.callback_status) @@ -243,14 +256,37 @@ def main_loop(): # setup successful if status != -1: + + controller.load_models(PATH) # simulation loop while supervisor.step(timestep) != -1: controller.process() - # stopping condition for sequence was reached - if controller.is_sequence_complete is True: - controller.reinitialize() - controller.is_sequence_complete = False + # Checks if the sequence has ended + if agent.state == "READY": + agent.update(robot_node) + + # Start the training + elif agent.state == "TRAINING": + controller.steps = 0 + agent.perform_training() + + if 1000 <= agent.num_episodes: + print(f"Episodes: {agent.num_episodes}") + + # Resent any unsent Data + if agent.unsent_data: + + # Stop The Simulation. 
Handel unsent Data + supervisor.simulationSetMode(Supervisor.SIMULATION_MODE_PAUSE) + + # Set simulation mode to real time when unsent data is resent + if controller.retry_unsent_data(agent.unsent_data) is True: + supervisor.simulationSetMode(Supervisor.SIMULATION_MODE_REAL_TIME) + + # Reset The Simulation + else: + supervisor.simulationReset() return status diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py new file mode 100644 index 00000000..84de8a14 --- /dev/null +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -0,0 +1,143 @@ +""" Implementation of a Trajectory Buffer """ + +# MIT License +# +# Copyright (c) 2023 - 2024 Andreas Merkle +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# + + +################################################################################ +# Imports +################################################################################ + +import numpy as np + +################################################################################ +# Variables +################################################################################ + + +################################################################################ +# Classes +################################################################################ + + +class Memory: # pylint: disable=too-many-instance-attributes + """Class for store and manage experience tuples during reinforcement learning""" + + def __init__(self, batch_size, max_length): + self.__states = [] + self.__probs = [] + self.__vals = [] + self.__actions = [] + self.__rewards = [] + self.__dones = [] + self.__max_length = max_length + self.__batch_size = batch_size + + def generate_batches(self): + """ + Generates batches of data for training. 
+ + Returns + ---------- + Numpy-Array: States + Numpy-Array: Actions + Numpy-Array: Probs + Numpy-Array: Vals + Numpy-Array: Rewards + Numpy-Array: dones + List: Batches + """ + + # Determine the number of states + n_states = len(self.__states) + + # Calculate start indices for each batch + batch_start = np.arange(0, n_states, self.__batch_size) + + # Create indices for the states and mix them randomly + indices = np.arange(n_states, dtype=np.int64) + np.random.shuffle(indices) + + # Create batches by dividing the indices into groups of the batch_size + batches = [indices[indx : indx + self.__batch_size] for indx in batch_start] + print("Number of Steps:", n_states) + + return ( + np.array(self.__states), + np.array(self.__actions), + np.array(self.__probs), + np.array(self.__vals), + np.array(self.__rewards), + np.array(self.__dones), + batches, + ) + + def get_sum_rewards(self) -> float: + """Calculate sum of reward""" + sum_rewards = sum(self.__rewards) + + return sum_rewards + + def store_memory( + self, state, action, probs, vals, reward, done + ): # pylint: disable=too-many-arguments + """ + Store transitions in the replay buffer. + + Parameters + ---------- + state: The state observed. + action: The action taken. + prob: The probability of taking the action. + val: The estimated value of the state. + reward: The reward received. + done: Whether the episode is done. + """ + self.__states.append(state) + self.__actions.append(action) + self.__probs.append(probs) + self.__vals.append(vals) + self.__rewards.append(reward) + self.__dones.append(done) + + def clear_memory(self): + """Remove transitions from the replay buffer.""" + + self.__states = [] + self.__probs = [] + self.__vals = [] + self.__actions = [] + self.__rewards = [] + self.__dones = [] + + def is_memory_full(self): + """ + Checks whether Memory has reached its maximum capacity. + + Returns + ---------- + - Bool: Memory is full or not + """ + + is_full = len(self.__states) >= self.__max_length + return is_full From 4976b064fe36a2c2c5cd05c4faa4b48ebf56e4f6 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 5 Aug 2024 22:46:17 +0200 Subject: [PATCH 06/37] Add new Command ID --- lib/APPReinforcementLearning/src/SerialMuxchannels.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/APPReinforcementLearning/src/SerialMuxchannels.h b/lib/APPReinforcementLearning/src/SerialMuxchannels.h index 75daf582..a3f8f052 100644 --- a/lib/APPReinforcementLearning/src/SerialMuxchannels.h +++ b/lib/APPReinforcementLearning/src/SerialMuxchannels.h @@ -89,8 +89,9 @@ namespace SMPChannelPayload /** Remote control commands. */ typedef enum : uint8_t { - CMD_ID_IDLE = 0, /**< Nothing to do. */ - CMD_ID_SET_READY_STATE /**< Set Ready State. */ + CMD_ID_IDLE = 0, /**< Nothing to do. */ + CMD_ID_SET_READY_STATE, /**< Set Ready State. */ + CMD_ID_SET_TRAINING_STATE /**< Set Training State */ } CmdId; /**< Command ID */ From 5c5b28d4f560bb469b4525465f4de48050856e77 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 6 Aug 2024 10:06:02 +0200 Subject: [PATCH 07/37] Add missing comment --- lib/APPReinforcementLearning/src/ReadyState.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h index 76c59ebf..6a778fc4 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.h +++ b/lib/APPReinforcementLearning/src/ReadyState.h @@ -119,7 +119,9 @@ class ReadyState : public IState /** Duration of the selected mode in ms. 
This is the maximum time to select a mode. */ static const uint32_t mode_selected_period = 100; + /** Duration to handle State changes in ms. */ static const uint32_t m_state_Transition_period = 150; + /** * The line sensor threshold (normalized) used to detect the track. * The track is detected in case the received value is greater or equal than From e66ae2f47acc71d07958d2ee03c6efd7ccb8f167 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 6 Aug 2024 10:30:27 +0200 Subject: [PATCH 08/37] resolve pylint import error and correct comments --- webots/controllers/RL_Supervisor/agent.py | 20 +++++++++---------- webots/controllers/RL_Supervisor/networks.py | 2 +- .../RL_Supervisor/rl_supervisor.py | 7 ++++--- .../RL_Supervisor/trajectory_buffer.py | 4 ++-- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index a6422069..98383bcd 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -1,4 +1,4 @@ -""" Implementation ofintelligent agent.""" +"""Implementation of intelligent agent.""" # MIT License # @@ -29,15 +29,16 @@ ################################################################################ import struct -import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp +import numpy as np # pylint: disable=import-error +import tensorflow as tf # pylint: disable=import-error +import tensorflow_probability as tfp # pylint: disable=import-error from trajectory_buffer import Memory from networks import Networks ################################################################################ # Variables ################################################################################ +# pylint: disable=duplicate-code # Constants COMMAND_CHANNEL_NAME = "CMD" @@ -63,8 +64,8 @@ 1.5880805820884731, ] MAX_SENSOR_VALUE = 1000 -MIN_STD_DEV = 0.1 -STD_DEV_FACTOR = 0.3995 +MIN_STD_DEV = 0.1 # Minimum standard deviation +STD_DEV_FACTOR = 0.3995 # Discounter standard deviation factor ################################################################################ # Classes @@ -370,10 +371,9 @@ def calculate_advantages(self, rewards, values, dones): return advantages - def learn( - self, states, actions, old_probs, values, rewards, dones - ): # pylint: disable=too-many-arguments - # pylint: disable=too-many-locals + # pylint: disable=too-many-arguments + # pylint: disable=too-many-locals + def learn(self, states, actions, old_probs, values, rewards, dones): """Perform training to optimize model weights""" advantages = self.calculate_advantages(rewards, values, dones) diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index 0328d60b..84f9e98f 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -28,7 +28,7 @@ # Imports ################################################################################ -import tensorflow as tf +import tensorflow as tf # pylint: disable=import-error from tensorflow.keras import layers # pylint: disable=import-error ################################################################################ diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index 90880fe9..fa7388a8 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -43,7 +43,7 @@ 
################################################################################ # Variables ################################################################################ - +# pylint: disable=duplicate-code # Constants ROBOT_NAME = "ROBOT" @@ -63,7 +63,7 @@ STATUS_CHANNEL_ERROR_VAL = 1 MODE_CHANNEL_NAME = "MODE" -CMD_ID_SET_READY_STATE = 1 + MIN_NUMBER_OF_STEPS = 200 SENSOR_ID_MOST_LEFT = 0 SENSOR_ID_MOST_RIGHT = 4 @@ -188,6 +188,7 @@ def process(self): # pylint: disable=duplicate-code # pylint: disable=too-many-branches +# pylint: disable=too-many-statements def main_loop(): """Main loop: - Perform simulation steps until Webots is stopping the controller. @@ -277,7 +278,7 @@ def main_loop(): # Resent any unsent Data if agent.unsent_data: - # Stop The Simulation. Handel unsent Data + # Stop The Simulation. Handle unsent Data supervisor.simulationSetMode(Supervisor.SIMULATION_MODE_PAUSE) # Set simulation mode to real time when unsent data is resent diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index 84de8a14..01767d06 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -28,7 +28,7 @@ # Imports ################################################################################ -import numpy as np +import numpy as np # pylint: disable=import-error ################################################################################ # Variables @@ -93,7 +93,7 @@ def generate_batches(self): ) def get_sum_rewards(self) -> float: - """Calculate sum of reward""" + """Calculate total rewards""" sum_rewards = sum(self.__rewards) return sum_rewards From 93d2b3975d0099345f0b9a4e79fd315f5994a616 Mon Sep 17 00:00:00 2001 From: Akram Date: Wed, 7 Aug 2024 23:18:21 +0200 Subject: [PATCH 09/37] Add plotting script to analyze training logs --- webots/controllers/RL_Supervisor/agent.py | 75 ++++++++++++++------ webots/controllers/RL_Supervisor/plotting.py | 47 ++++++++++++ 2 files changed, 100 insertions(+), 22 deletions(-) create mode 100644 webots/controllers/RL_Supervisor/plotting.py diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 98383bcd..b3cdc924 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -27,7 +27,8 @@ ################################################################################ # Imports ################################################################################ - +import csv +import os import struct import numpy as np # pylint: disable=import-error import tensorflow as tf # pylint: disable=import-error @@ -65,7 +66,7 @@ ] MAX_SENSOR_VALUE = 1000 MIN_STD_DEV = 0.1 # Minimum standard deviation -STD_DEV_FACTOR = 0.3995 # Discounter standard deviation factor +STD_DEV_FACTOR = 0.99995 # Discounter standard deviation factor ################################################################################ # Classes @@ -110,6 +111,9 @@ def __init__( self.state = "IDLE" self.data_sent = True self.unsent_data = [] + self.critic_loss_history = [] + self.actor_loss_history = [] + self.reward_history = [] def set_train_mode(self): """Set the Agent mode to train mode.""" @@ -155,30 +159,33 @@ def predict_action(self, state): # Calculation of probabilities by the Actor neural network probs = self.__neural_network.actor_network(state) - # Create a normal distribution with the calculated probabilities and the standard deviation - dist = 
tfp.distributions.Normal(probs, self.__std_dev) + if self.train_mode is True: + # Create a normal distribution with the calculated probabilities and the standard deviation + dist = tfp.distributions.Normal(probs, self.__std_dev) - # Sampling an action from the normal distribution - sampled_action = dist.sample() + # Sampling an action from the normal distribution + sampled_action = dist.sample() - # Apply the Tanh transformation to the sampled action - transformed_action = tf.tanh(sampled_action) + # Apply the Tanh transformation to the sampled action + transformed_action = tf.tanh(sampled_action) - # Calculation of the logarithm of the probability density of the sampled action - log_prob = dist.log_prob(sampled_action) + # Calculation of the logarithm of the probability density of the sampled action + log_prob = dist.log_prob(sampled_action) - # Calculation of the Jacobian determinant for the Tanh transformation - jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6) + # Calculation of the Jacobian determinant for the Tanh transformation + jacobian_log_det = tf.math.log(1 - tf.square(transformed_action) + 1e-6) - # Calculation of Adjusted probabilities by the neural network - adjusted_log_prob = log_prob - jacobian_log_det + # Calculation of Adjusted probabilities by the neural network + adjusted_log_prob = log_prob - jacobian_log_det - # calculate the estimated value of a state, which is determined by the Critic network - value = self.__neural_network.critic_network(state) + # calculate the estimated value of a state, which is determined by the Critic network + value = self.__neural_network.critic_network(state) - self.action = transformed_action.numpy()[0] - self.value = value.numpy()[0] - self.adjusted_log_prob = adjusted_log_prob.numpy()[0] + self.action = transformed_action.numpy()[0] + self.value = value.numpy()[0] + self.adjusted_log_prob = adjusted_log_prob.numpy()[0] + else: + self.action = probs.numpy()[0] return self.action @@ -419,11 +426,9 @@ def learn(self, states, actions, old_probs, values, rewards, dones): with tf.GradientTape() as tape: critic_value = self.__neural_network.critic_network(states) - critic_value = tf.squeeze(critic_value, 1) returns = advantages + values - # Generate loss - critic_loss = tf.keras.losses.MSE(critic_value, returns) + critic_loss = tf.math.reduce_mean(tf.math.pow(returns - critic_value, 2)) # calculate gradient critic_params = self.__neural_network.critic_network.trainable_variables @@ -431,6 +436,32 @@ def learn(self, states, actions, old_probs, values, rewards, dones): self.__neural_network.critic_optimizer.apply_gradients( zip(critic_grads, critic_params) ) + self.actor_loss_history.append(actor_loss.numpy()) + self.critic_loss_history.append(critic_loss.numpy()) + self.reward_history.append(sum(rewards)) + + # saving logs in a CSV file + self.save_logs_to_csv() + + def save_logs_to_csv(self): + """Function for saving logs in a CSV file""" + + # Ensure the directory exists + log_dir = "logs" + os.makedirs(log_dir, exist_ok=True) + log_file = os.path.join(log_dir, "training_logs.csv") + + with open(log_file, mode="w", encoding="utf-8", newline="") as file: + writer = csv.writer(file) + writer.writerow(["Actor Loss", "Critic Loss", "Reward"]) + for indx, reward in enumerate(self.reward_history): + writer.writerow( + [ + self.actor_loss_history[indx], + self.critic_loss_history[indx], + reward, + ] + ) def perform_training(self): """Runs the training process.""" diff --git a/webots/controllers/RL_Supervisor/plotting.py 
b/webots/controllers/RL_Supervisor/plotting.py new file mode 100644 index 00000000..a6afd7fc --- /dev/null +++ b/webots/controllers/RL_Supervisor/plotting.py @@ -0,0 +1,47 @@ +""" Plotting script with Matplotlib """ + +# Imports +import matplotlib.pyplot as plt +import pandas as pd + +# Define the path to the CSV file +LOG_FILE = "logs/training_logs.csv" + +# Read the CSV file +data = pd.read_csv(LOG_FILE) + +# Define length +data["Mini Batch"] = range(1, len(data) + 1) + +# Plotting Actor Loss +plt.figure(figsize=(10, 5)) +plt.plot(data["Mini Batch"], data["Actor Loss"], label="Actor Loss") +plt.xlabel("Mini Batch") +plt.ylabel("Loss") +plt.title("Actor Loss Over Mini Batches") +plt.legend() +plt.grid(True) +plt.savefig("logs/actor_loss_plot.png") +plt.show() + +# Plotting Critic Loss +plt.figure(figsize=(10, 5)) +plt.plot(data["Mini Batch"], data["Critic Loss"], label="Critic Loss") +plt.xlabel("Mini Batch") +plt.ylabel("Loss") +plt.title("Critic Loss Over Mini Batches") +plt.legend() +plt.grid(True) +plt.savefig("logs/critic_loss_plot.png") +plt.show() + +# Plotting Total Rewards +plt.figure(figsize=(10, 5)) +plt.plot(data["Mini Batch"], data["Reward"], label="Reward") +plt.xlabel("Mini Batch") +plt.ylabel("Total Reward") +plt.title("Total Rewards Over Mini Batches") +plt.legend() +plt.grid(True) +plt.savefig("logs/total_rewards_plot.png") +plt.show() From 849771dbc18996f038cb4841cac7a02acfe392e3 Mon Sep 17 00:00:00 2001 From: Akram Date: Thu, 8 Aug 2024 11:54:22 +0200 Subject: [PATCH 10/37] Add comments and fix import errors --- webots/controllers/RL_Supervisor/agent.py | 14 +++++++++++--- webots/controllers/RL_Supervisor/plotting.py | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index b3cdc924..2d071d3b 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -160,7 +160,8 @@ def predict_action(self, state): probs = self.__neural_network.actor_network(state) if self.train_mode is True: - # Create a normal distribution with the calculated probabilities and the standard deviation + # Create a normal distribution with the calculated probabilities + # and the standard deviation dist = tfp.distributions.Normal(probs, self.__std_dev) # Sampling an action from the normal distribution @@ -425,10 +426,17 @@ def learn(self, states, actions, old_probs, values, rewards, dones): # optimize Critic Network weights with tf.GradientTape() as tape: + # The critical value represents the expected return from state 𝑠𝑡. + # It provides an estimate of how good it is to be in a given state. 
critic_value = self.__neural_network.critic_network(states) - returns = advantages + values + + # the total discounted reward accumulated from time step 𝑡 + estimate_returns = advantages + values + # Generate loss - critic_loss = tf.math.reduce_mean(tf.math.pow(returns - critic_value, 2)) + critic_loss = tf.math.reduce_mean( + tf.math.pow(estimate_returns - critic_value, 2) + ) # calculate gradient critic_params = self.__neural_network.critic_network.trainable_variables diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py index a6afd7fc..4b20bf63 100644 --- a/webots/controllers/RL_Supervisor/plotting.py +++ b/webots/controllers/RL_Supervisor/plotting.py @@ -1,8 +1,8 @@ """ Plotting script with Matplotlib """ # Imports -import matplotlib.pyplot as plt -import pandas as pd +import matplotlib.pyplot as plt # pylint: disable=import-error +import pandas as pd # pylint: disable=import-error # Define the path to the CSV file LOG_FILE = "logs/training_logs.csv" From 0a5001cc8286c0654e84ca5bdf21ff1e5f2f9261 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 19 Aug 2024 08:17:02 +0200 Subject: [PATCH 11/37] added comments, renamed variables, and updated values --- webots/controllers/RL_Supervisor/agent.py | 64 ++++++++++--------- webots/controllers/RL_Supervisor/networks.py | 27 ++++++-- .../RL_Supervisor/rl_supervisor.py | 17 +++-- 3 files changed, 70 insertions(+), 38 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 2d071d3b..19fa550b 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -102,7 +102,7 @@ def __init__( self.__neural_network = Networks(self.__alpha) self.__training_index = 0 # Track batch index during training self.__current_batch = None # Saving of the current batch which is in process - self.__std_dev = 1 + self.__std_dev = 0.9 self.done = False self.action = None self.value = None @@ -156,13 +156,14 @@ def predict_action(self, state): m_state = self.normalize_sensor_data(state) state = tf.convert_to_tensor([m_state], dtype=tf.float32) - # Calculation of probabilities by the Actor neural network - probs = self.__neural_network.actor_network(state) + # output from the Actor Network. + action_mean = self.__neural_network.actor_network(state) + # Training mode is set. if self.train_mode is True: - # Create a normal distribution with the calculated probabilities - # and the standard deviation - dist = tfp.distributions.Normal(probs, self.__std_dev) + + # Create a normal distribution + dist = tfp.distributions.Normal(action_mean, self.__std_dev) # Sampling an action from the normal distribution sampled_action = dist.sample() @@ -185,8 +186,10 @@ def predict_action(self, state): self.action = transformed_action.numpy()[0] self.value = value.numpy()[0] self.adjusted_log_prob = adjusted_log_prob.numpy()[0] + + # Driving mode is set else: - self.action = probs.numpy()[0] + self.action = action_mean.numpy()[0] return self.action @@ -240,6 +243,7 @@ def update(self, robot_node): self.data_sent = self.__serialmux.send_data( "SPEED_SET", motorcontrol ) # stop the motors immediately + # Failed to send data. 
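As a side note on the SPEED_SET frames used in update() above and in send_motor_speeds() later in this series: the payload is simply two unsigned 16-bit motor speeds. A minimal sketch of the mapping from a normalized action to that payload, assuming the top_speed default of 2000 handed to the Agent; TOP_SPEED and speed_payload are illustrative names, not part of the patch.

import struct

TOP_SPEED = 2000  # default top_speed handed to the Agent

def speed_payload(action):
    """Map a normalized action in [-1, 1] to a SPEED_SET payload of two uint16 speeds."""
    speed_difference = TOP_SPEED * action
    left_motor_speed = int(TOP_SPEED - speed_difference)
    right_motor_speed = int(TOP_SPEED + speed_difference)
    return struct.pack("2H", left_motor_speed, right_motor_speed)

print(struct.unpack("2H", speed_payload(0.25)))       # (1500, 2500)
print(struct.unpack("2H", struct.pack("2H", 0, 0)))   # the stop frame sent when a sequence ends
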
Appends the data to unsent_data List if self.data_sent is False: self.unsent_data.append(("SPEED_SET", motorcontrol)) @@ -253,29 +257,9 @@ def update(self, robot_node): def normalize_sensor_data(self, sensor_data): """The normalize_sensor_data function scales the sensor data to a range between 0 and 1.""" - # Normalized sensor data sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE - return sensor_data - def calculate_reward(self, sensor_data): - """ - The calculate_reward function evaluates the consequences of a certain - action performed in a certain state by calculating the resulting reward - """ - estimated_pos = self.calculate_position(sensor_data) - - # Return reward between 0 and 10 - if 500 <= estimated_pos <= 2000: - reward = ((1 / 150) * estimated_pos) - (10 / 3) - return reward - - if 2000 < estimated_pos <= 3500: - reward = ((-1 / 150) * estimated_pos) + (70 / 3) - return reward - - return 0 - def calculate_position(self, sensor_data): """ Determines the deviation and returns an estimated position of the robot @@ -339,6 +323,26 @@ def calculate_position(self, sensor_data): return estimated_pos + def calculate_reward(self, sensor_data): + """ + The calculate_reward function evaluates the consequences of a certain + action performed in a certain state by calculating the resulting reward. + A reward of 1 means that the robot is in the center of the Line. + """ + estimated_pos = self.calculate_position(sensor_data) + + # Reward scaled between 0 and 1 If robot is in line. + if 500 <= estimated_pos <= 2000: + reward = (((1 / 150) * estimated_pos) - (10 / 3)) / 10 + return reward + + # Reward scaled between 1 and 0 If robot is in line. + if 2000 < estimated_pos <= 3500: + reward = (((-1 / 150) * estimated_pos) + (70 / 3))/10 + return reward + + return 0 + def calculate_advantages(self, rewards, values, dones): """Calculate advantages for each state in a mini-batch.""" @@ -386,7 +390,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones): advantages = self.calculate_advantages(rewards, values, dones) - # optimize Actor Network weights + # optimize Actor Network weights with tf.GradientTape() as tape: states = tf.convert_to_tensor(states) actions = tf.convert_to_tensor(actions) @@ -426,7 +430,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones): # optimize Critic Network weights with tf.GradientTape() as tape: - # The critical value represents the expected return from state 𝑠𝑡. + # The critical value represents the expected return from state 𝑠𝑡. # It provides an estimate of how good it is to be in a given state. critic_value = self.__neural_network.critic_network(states) @@ -479,7 +483,7 @@ def perform_training(self): # Grab sample from memory self.__current_batch = self.__memory.generate_batches() - # Perform training with mini batchtes. + # Perform training with mini batches. 
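To make the advantage recursion in calculate_advantages() above concrete, here is a small self-contained NumPy sketch of the same GAE-style computation; the helper name and the sample numbers are made up for illustration.

import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """A_t accumulates (gamma * lambda)^k * delta_{t+k}, mirroring the loop above."""
    n = len(rewards)
    advantages = np.zeros(n, dtype=np.float32)
    for start in range(n):
        discount = 1.0
        advantage = 0.0
        for k in range(start, n - 1):
            delta = rewards[k] + gamma * values[k + 1] * (1 - int(dones[k])) - values[k]
            advantage += discount * delta
            discount *= gamma * gae_lambda
            if dones[k]:
                break
        advantages[start] = advantage
    return advantages

# Hypothetical numbers, for illustration only.
rewards = np.array([0.8, 0.9, 1.0, -1.0], dtype=np.float32)
values  = np.array([0.5, 0.6, 0.7, 0.1], dtype=np.float32)
dones   = np.array([False, False, False, True])
print(gae_advantages(rewards, values, dones))
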
if self.__training_index < len(self.__current_batch[-1]): ( state_arr, diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index 84f9e98f..f3624593 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -30,6 +30,7 @@ import tensorflow as tf # pylint: disable=import-error from tensorflow.keras import layers # pylint: disable=import-error +from tensorflow.keras.regularizers import l2 # pylint: disable=import-error ################################################################################ # Variables @@ -59,22 +60,31 @@ def build_actor_network(self): 64, activation="relu", kernel_initializer="he_normal", + kernel_regularizer=l2(0.01), bias_initializer="zeros", )(state_input) fc2 = layers.Dense( 64, activation="relu", kernel_initializer="he_normal", + kernel_regularizer=l2(0.01), bias_initializer="zeros", )(fc1) - mu = layers.Dense( + fc3 = layers.Dense( + 32, + activation="relu", + kernel_initializer="he_normal", + kernel_regularizer=l2(0.01), + bias_initializer="zeros", + )(fc2) + mean = layers.Dense( 1, activation="tanh", kernel_initializer="glorot_uniform", bias_initializer="zeros", - )(fc2) + )(fc3) - return tf.keras.models.Model(inputs=state_input, outputs=mu) + return tf.keras.models.Model(inputs=state_input, outputs=mean) def build_critic_network(self): """Build Critic Network""" @@ -84,15 +94,24 @@ def build_critic_network(self): 64, activation="relu", kernel_initializer="he_normal", + kernel_regularizer=l2(0.01), bias_initializer="zeros", )(state_input) fc2 = layers.Dense( 64, activation="relu", kernel_initializer="he_normal", + kernel_regularizer=l2(0.01), bias_initializer="zeros", )(fc1) - value = layers.Dense(1)(fc2) # Value output + fc3 = layers.Dense( + 32, + activation="relu", + kernel_initializer="he_normal", + kernel_regularizer=l2(0.01), + bias_initializer="zeros", + )(fc2) + value = layers.Dense(1)(fc3) # Value output return tf.keras.models.Model(inputs=state_input, outputs=value) diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index fa7388a8..1e13460d 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -101,13 +101,14 @@ def callback_line_sensors(self, payload: bytearray) -> None: """Callback LINE_SENS Channel.""" sensor_data = struct.unpack("5H", payload) self.steps += 1 - # determine lost line condition + + # Determine lost line condition if all(value == 0 for value in sensor_data): self.__no_line_detection_count += 1 else: self.__no_line_detection_count = 0 - # detect start/stop line + # Detect start/stop line is_start_stop = all( value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data ) @@ -126,7 +127,12 @@ def callback_line_sensors(self, payload: bytearray) -> None: # The sequence of states and actions is stored in memory for the training phase. 
if self.__agent.train_mode: - reward = self.__agent.calculate_reward(sensor_data) + + # receive a -1 punishment if the robot leaves the line + if self.__no_line_detection_count > 0: + reward = -1 + else: + reward = self.__agent.calculate_reward(sensor_data) # Start storage The data after the second received sensor data if self.last_sensor_data is not None: @@ -159,6 +165,8 @@ def load_models(self, path) -> None: """Load Model if exist""" if os.path.exists(path): self.__agent.load_models() + else: + print("No model available") def retry_unsent_data(self, unsent_data: list) -> bool: """Resent any unsent Data""" @@ -259,6 +267,7 @@ def main_loop(): if status != -1: controller.load_models(PATH) + # simulation loop while supervisor.step(timestep) != -1: controller.process() @@ -273,7 +282,7 @@ def main_loop(): agent.perform_training() if 1000 <= agent.num_episodes: - print(f"Episodes: {agent.num_episodes}") + print(f"The number of episodes:{agent.num_episodes}") # Resent any unsent Data if agent.unsent_data: From 0de39f70800d515771c1aa43f3ffcc82a912f8aa Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 19 Aug 2024 08:29:15 +0200 Subject: [PATCH 12/37] remove trailing whitespace from code file --- webots/controllers/RL_Supervisor/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 19fa550b..eafb037e 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -390,7 +390,7 @@ def learn(self, states, actions, old_probs, values, rewards, dones): advantages = self.calculate_advantages(rewards, values, dones) - # optimize Actor Network weights + # optimize Actor Network weights with tf.GradientTape() as tape: states = tf.convert_to_tensor(states) actions = tf.convert_to_tensor(actions) From cecb42785792d8087cf4acb0841b0d71aa7e15f3 Mon Sep 17 00:00:00 2001 From: Akram Date: Thu, 22 Aug 2024 11:44:16 +0200 Subject: [PATCH 13/37] Reset sendLineSensorsData function to its original state to fix issues with sensor data transmission. 
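Putting the two reward paths from the supervisor patch above together (the -1 punishment when the line is lost, and the position-based reward that peaks at the track center), a compact sketch of the resulting shaping; the function names are illustrative, and the position formula is the weighted-average scheme used by calculate_position()/calculate_reward() in this series.

import numpy as np

def line_position(sensor_data):
    """Weighted average: 0 under sensor 0, 1000 under sensor 1, ..., 4000 under sensor 4."""
    sensor_data = np.asarray(sensor_data, dtype=np.float32)
    weights = np.arange(len(sensor_data)) * 1000.0
    total = sensor_data.sum()
    return float((weights * sensor_data).sum() / total) if total > 0 else 0.0

def shaped_reward(sensor_data, line_lost):
    """-1 if the line was lost, otherwise 0..1 with the maximum at position 2000."""
    if line_lost:
        return -1.0
    pos = line_position(sensor_data)
    if 500 <= pos <= 3500:
        return 1.0 - abs(pos - 2000.0) / 1500.0
    return 0.0

print(shaped_reward([0, 100, 800, 100, 0], line_lost=False))  # centered on the line -> 1.0
print(shaped_reward([0, 0, 0, 0, 0], line_lost=True))         # line lost -> -1.0
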
--- lib/APPReinforcementLearning/src/App.cpp | 40 ++++++++++++++---------- lib/APPReinforcementLearning/src/App.h | 7 ++++- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index 623ff636..d3e212a6 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -133,23 +133,7 @@ void App::loop() if (true == m_sendLineSensorsDataInterval.isTimeout() && (&DrivingState::getInstance() == m_systemStateMachine.getState())) { - ILineSensors& lineSensors = Board::getInstance().getLineSensors(); - uint8_t maxLineSensors = lineSensors.getNumLineSensors(); - const uint16_t* lineSensorValues = lineSensors.getSensorValues(); - uint8_t lineSensorIdx = 0U; - LineSensorData payload; - - if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) - { - while (maxLineSensors > lineSensorIdx) - { - payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx]; - - ++lineSensorIdx; - } - } - - m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); + sendLineSensorsData(); m_sendLineSensorsDataInterval.restart(); } @@ -217,6 +201,28 @@ void App::handleRemoteCommand(const Command& cmd) * Private Methods *****************************************************************************/ +void App::sendLineSensorsData() +{ + ILineSensors& lineSensors = Board::getInstance().getLineSensors(); + uint8_t maxLineSensors = lineSensors.getNumLineSensors(); + const uint16_t* lineSensorValues = lineSensors.getSensorValues(); + uint8_t lineSensorIdx = 0U; + LineSensorData payload; + + if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) + { + while (maxLineSensors > lineSensorIdx) + { + payload.lineSensorData[lineSensorIdx] = lineSensorValues[lineSensorIdx]; + + ++lineSensorIdx; + } + } + + /* Ignoring return value, as error handling is not available. */ + m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); +} + bool App::setupSerialMuxProt() { bool isSuccessful = false; diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index 4d73fc4a..2683c301 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -150,7 +150,12 @@ class App * * @return If successful returns true, otherwise false. */ - bool setupSerialMuxProt(); + bool setupSerialMuxProt() + + /** + * Send line sensors data via SerialMuxProt. + */ + void sendLineSensorsData() ; /** * Copy construction of an instance. From 4e2aec2d8165a64a6e3674a9e501c4b6eddbd624 Mon Sep 17 00:00:00 2001 From: Akram Date: Thu, 22 Aug 2024 11:54:37 +0200 Subject: [PATCH 14/37] Fix missing semicolon --- lib/APPReinforcementLearning/src/App.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index 2683c301..846292dd 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -150,12 +150,12 @@ class App * * @return If successful returns true, otherwise false. */ - bool setupSerialMuxProt() + bool setupSerialMuxProt(); /** * Send line sensors data via SerialMuxProt. */ - void sendLineSensorsData() ; + void sendLineSensorsData(); /** * Copy construction of an instance. 
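For reference, the LINE_SENS payload assembled by sendLineSensorsData() above is one uint16 per line sensor (LINE_SENSOR_CHANNEL_DLC equals maxLineSensors * sizeof(uint16_t)), which is exactly what the supervisor's callback_line_sensors() unpacks with "5H" for the five-sensor robot. A small sketch of that framing; the sample readings are made up.

import struct

# Five line sensor readings, each a uint16; made-up values for illustration.
readings = (120, 340, 980, 310, 90)

payload = struct.pack("5H", *readings)   # what the robot side serializes (10 bytes)
print(len(payload))                      # 10
print(struct.unpack("5H", payload))      # what callback_line_sensors() recovers
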
From 2935cb34497b403df462924ad882e98e5bc62821 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 26 Aug 2024 10:29:24 +0200 Subject: [PATCH 15/37] Apply coding style corrections --- lib/APPReinforcementLearning/src/App.cpp | 16 +++++++++------- lib/APPReinforcementLearning/src/App.h | 6 +++--- lib/APPReinforcementLearning/src/ReadyState.cpp | 4 ++-- lib/APPReinforcementLearning/src/ReadyState.h | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index d3e212a6..9780d434 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -124,7 +124,7 @@ void App::loop() payload = {SMPChannelPayload::Status::DONE}; } - m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); + m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); m_statusTimer.restart(); } @@ -133,7 +133,7 @@ void App::loop() if (true == m_sendLineSensorsDataInterval.isTimeout() && (&DrivingState::getInstance() == m_systemStateMachine.getState())) { - sendLineSensorsData(); + m_dataSent = sendLineSensorsData(); m_sendLineSensorsDataInterval.restart(); } @@ -148,13 +148,13 @@ void App::loop() SMPChannelPayload::Mode payload = (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; - m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); + m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); m_modeSelectionSent = true; } } - if (false == m_data_sent) + if (false == m_dataSent) { /* Failed to send data to the supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("DSF"); @@ -201,12 +201,13 @@ void App::handleRemoteCommand(const Command& cmd) * Private Methods *****************************************************************************/ -void App::sendLineSensorsData() +bool App::sendLineSensorsData() const { ILineSensors& lineSensors = Board::getInstance().getLineSensors(); uint8_t maxLineSensors = lineSensors.getNumLineSensors(); const uint16_t* lineSensorValues = lineSensors.getSensorValues(); uint8_t lineSensorIdx = 0U; + bool isDataSent = true; LineSensorData payload; if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) @@ -219,8 +220,9 @@ void App::sendLineSensorsData() } } - /* Ignoring return value, as error handling is not available. */ - m_data_sent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); + isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); + + return isDataSent; } bool App::setupSerialMuxProt() diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index 846292dd..c8dca206 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -73,7 +73,7 @@ class App m_statusTimer(), m_sendLineSensorsDataInterval(), m_modeSelectionSent(false), - m_data_sent(true), + m_dataSent(true), m_smpServer(Serial, this) { } @@ -143,7 +143,7 @@ class App bool m_modeSelectionSent; /** check if the data has been sent */ - bool m_data_sent; + bool m_dataSent; /** * Setup the SerialMuxProt channels. @@ -155,7 +155,7 @@ class App /** * Send line sensors data via SerialMuxProt. */ - void sendLineSensorsData(); + bool sendLineSensorsData() const; /** * Copy construction of an instance. 
diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp index ec720077..1af24879 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.cpp +++ b/lib/APPReinforcementLearning/src/ReadyState.cpp @@ -80,8 +80,8 @@ void ReadyState::entry() display.print(m_lapTime); display.print("ms"); } - m_stateTransitionTimer.start(m_state_Transition_period); - m_modeTimeoutTimer.start(mode_selected_period); + m_stateTransitionTimer.start(STATE_TRANSITION_PERIOD); + m_modeTimeoutTimer.start(MODE_SELECTED_PERIOD); m_mode = IDLE; m_isLastStartStopLineDetected = false; m_isButtonAPressed = false; diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h index 6a778fc4..5cef49ec 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.h +++ b/lib/APPReinforcementLearning/src/ReadyState.h @@ -117,10 +117,10 @@ class ReadyState : public IState }; /** Duration of the selected mode in ms. This is the maximum time to select a mode. */ - static const uint32_t mode_selected_period = 100; + static const uint32_t MODE_SELECTED_PERIOD = 100U; /** Duration to handle State changes in ms. */ - static const uint32_t m_state_Transition_period = 150; + static const uint32_t STATE_TRANSITION_PERIOD = 150U; /** * The line sensor threshold (normalized) used to detect the track. From 5002ecd1cd7fee4a9c23105a320b0dca9979910a Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 26 Aug 2024 10:52:19 +0200 Subject: [PATCH 16/37] Add comments and refactor functions from agent to trajectory_buffer class --- lib/APPReinforcementLearning/src/App.h | 2 + webots/controllers/RL_Supervisor/agent.py | 341 ++++++++---------- webots/controllers/RL_Supervisor/plotting.py | 5 + .../RL_Supervisor/rl_supervisor.py | 26 +- .../RL_Supervisor/trajectory_buffer.py | 147 +++++++- 5 files changed, 311 insertions(+), 210 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index c8dca206..bd3c6a1e 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -154,6 +154,8 @@ class App /** * Send line sensors data via SerialMuxProt. + * + * * @return true if data has been sent, false otherwise. 
*/ bool sendLineSensorsData() const; diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index eafb037e..69494144 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -33,7 +33,7 @@ import numpy as np # pylint: disable=import-error import tensorflow as tf # pylint: disable=import-error import tensorflow_probability as tfp # pylint: disable=import-error -from trajectory_buffer import Memory +from trajectory_buffer import Buffer from networks import Networks ################################################################################ @@ -66,7 +66,7 @@ ] MAX_SENSOR_VALUE = 1000 MIN_STD_DEV = 0.1 # Minimum standard deviation -STD_DEV_FACTOR = 0.99995 # Discounter standard deviation factor +STD_DEV_FACTOR = 0.9995 # Discounter standard deviation factor ################################################################################ # Classes @@ -88,21 +88,20 @@ def __init__( batch_size=64, chkpt_dir="models/", top_speed=2000, - max_buffer_length=600, + max_buffer_length=65536, ): self.__serialmux = smp_server - self.__gamma = gamma self.__alpha = alpha - self.__gae_lambda = gae_lambda self.__policy_clip = policy_clip self.__chkpt_dir = chkpt_dir self.train_mode = False self.__top_speed = top_speed - self.__memory = Memory(batch_size, max_buffer_length) + self.__buffer = Buffer(batch_size, max_buffer_length, gamma, gae_lambda) self.__neural_network = Networks(self.__alpha) self.__training_index = 0 # Track batch index during training self.__current_batch = None # Saving of the current batch which is in process self.__std_dev = 0.9 + self.n_epochs = 10 self.done = False self.action = None self.value = None @@ -127,11 +126,20 @@ def set_drive_mode(self): self.num_episodes = 0 def store_transition( - self, state, action, probs, vals, reward, done + self, state, action, probs, value, reward, done ): # pylint: disable=too-many-arguments - """Store transitions in the replay buffer.""" + """Store transitions in the replay buffer. - self.__memory.store_memory(state, action, probs, vals, reward, done) + Parameters + ---------- + state: The state observed. + action: The action taken. + prob: The probability of taking the action. + value: The estimated value of the state. + reward: The reward received. + done: Indicating whether the target sequence has been reached. + """ + self.__buffer.store_memory(state, action, probs, value, reward, done) def save_models(self): """Saves the models in the specified file.""" @@ -150,13 +158,24 @@ def load_models(self): ) def predict_action(self, state): - """Predicts an action based on the current state.""" + """ + Predicts an action based on the current state. - # Conversion of the state into a tensor + Parameters + ---------- + state: The state observed. + + Returns + ---------- + float32: The action taken. + """ + # scales the sensor data to a range between 0 and 1 m_state = self.normalize_sensor_data(state) + + # Conversion of the state into a tensor state = tf.convert_to_tensor([m_state], dtype=tf.float32) - # output from the Actor Network. + # Forward pass through the actor network to get the action mean action_mean = self.__neural_network.actor_network(state) # Training mode is set. @@ -194,17 +213,24 @@ def predict_action(self, state): return self.action def send_motor_speeds(self, state): - """Sends the motor speeds to the robot.""" + """ + Sends the motor speeds to the robot. + Parameters + ---------- + state: The state observed. 
+ """ + # pre_action contains the predicted action for the given state, calculated based + # on the Actor model output. pre_action = self.predict_action(state) # Get motor speed difference - speeddifference = self.__top_speed * pre_action + speed_difference = self.__top_speed * pre_action # Get individual motor speeds. The sign of speedDifference # determines if the robot turns left or right. - left_motor_speed = int(self.__top_speed - speeddifference) - right_motor_speed = int(self.__top_speed + speeddifference) + left_motor_speed = int(self.__top_speed - speed_difference) + right_motor_speed = int(self.__top_speed + speed_difference) control_data = struct.pack("2H", left_motor_speed, right_motor_speed) self.data_sent = self.__serialmux.send_data("SPEED_SET", control_data) @@ -214,26 +240,29 @@ def send_motor_speeds(self, state): self.unsent_data.append(("SPEED_SET", control_data)) def update(self, robot_node): - """Checks if the sequence has ended, performs updates, and saves the models.""" + """ + Checks if the sequence has ended and performs updates. + + Parameters + ---------- + robot_node: The Robot interface + """ # Checks whether the sequence has ended if it is set to Training mode. if (self.train_mode is True) and ( - (self.done is True) or (self.__memory.is_memory_full() is True) + (self.done is True) or (self.__buffer.is_memory_full() is True) ): cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE) self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) - # Failed to send data. Appends the data to unsent_data List + # Failed to send data. Appends the data to unsent_data List. if self.data_sent is False: self.unsent_data.append(("CMD", cmd_payload)) + # Stopping condition for sequence was reached. self.reinitialize(robot_node) self.state = "TRAINING" - # save models - if self.__memory.get_sum_rewards() >= 1000.0: - self.save_models() - # Checks whether the sequence has ended if it is set to driving mode. if (self.train_mode is False) and (self.done is True): self.done = False @@ -255,205 +284,131 @@ def update(self, robot_node): self.unsent_data.append(("CMD", cmd_payload)) def normalize_sensor_data(self, sensor_data): - """The normalize_sensor_data function scales the sensor data to a range between 0 and 1.""" - - sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE - return sensor_data - - def calculate_position(self, sensor_data): """ - Determines the deviation and returns an estimated position of the robot - with respect to a line. The estimate is made using a weighted average of - the sensor indices multiplied by 1000, so that a return value of 0 - indicates that the line is directly below sensor 0, a return value of - 1000 indicates that the line is directly below sensor 1, 2000 - indicates that it's below sensor 2000, etc. Intermediate values - indicate that the line is between two sensors. The formula is: - - 0*value0 + 1000*value1 + 2000*value2 + ... - -------------------------------------------- - value0 + value1 + value2 + ... - - This function assumes a dark line (high values) surrounded by white - (low values). - + The normalize_sensor_data function scales the sensor data to a range between 0 and 1. + Parameters ---------- - sensor_data : List + sensor_data: The state observed. Returns ---------- - Estimated position with respect to track. 
+ NumPy array of float32: Normalized Sensor Data """ - estimated_pos = 0 - numerator = 0 - denominator = 0 - weight = 1000 - is_on_line = False - - max_sensors = len(sensor_data) - sensor_max_value = 1000 - last_pos_value = 0 - - for idx, sensor_value in enumerate(sensor_data): - numerator += idx * weight * sensor_value - denominator += sensor_value - - # Keep track of whether we see the line at all. - if LINE_SENSOR_ON_TRACK_MIN_VALUE < sensor_value: - is_on_line = True - - if False is is_on_line: - - # If it last read to the left of center, return 0. - if last_pos_value < ((max_sensors - 1) * sensor_max_value) / 2: - estimated_pos = 0 - - # If it last read to the right of center, return the max. value. - else: - estimated_pos = (max_sensors - 1) * sensor_max_value - - else: - # Check to avoid division by zero.. - if denominator != 0: - estimated_pos = numerator / denominator - - last_pos_value = estimated_pos - - return estimated_pos + sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE + return sensor_data - def calculate_reward(self, sensor_data): + def determine_reward(self, sensor_data): """ - The calculate_reward function evaluates the consequences of a certain + The function evaluates the consequences of a certain action performed in a certain state by calculating the resulting reward. A reward of 1 means that the robot is in the center of the Line. - """ - estimated_pos = self.calculate_position(sensor_data) - # Reward scaled between 0 and 1 If robot is in line. - if 500 <= estimated_pos <= 2000: - reward = (((1 / 150) * estimated_pos) - (10 / 3)) / 10 - return reward + Parameters + ---------- + sensor_data : The state observed. - # Reward scaled between 1 and 0 If robot is in line. - if 2000 < estimated_pos <= 3500: - reward = (((-1 / 150) * estimated_pos) + (70 / 3))/10 - return reward + Returns + ---------- + float: the Resulting Reward - return 0 + """ + reward = self.__buffer.calculate_reward(sensor_data) + return reward - def calculate_advantages(self, rewards, values, dones): - """Calculate advantages for each state in a mini-batch.""" + # pylint: disable=too-many-arguments + # pylint: disable=too-many-locals + def learn(self, states, actions, old_probs, values, rewards, dones): + """ + Perform training to optimize model weights. - mini_batch_size = len(rewards) + Parameters + ---------- + states: The saved states observed during interactions with the environment. + actions: The saved actions taken in response to the observed states. + old_probs: The saved probabilities of the actions taken, based on the previous policy. + values: The saved estimated values of the observed states. + rewards: The saved rewards received for taking the actions. + dones: The saved flags indicating whether the target sequence or episode has + been completed. + """ + for _ in range(self.n_epochs): - # Create empty advantages array - advantages = np.zeros(mini_batch_size, dtype=np.float32) + #the computed advantage values for each state in a given batch of experiences. 
+ advantages = self.__buffer.calculate_advantages(rewards, values, dones) - for start_index in range(mini_batch_size): - discount = 1 - advantage = 0 + # optimize Actor Network weights + with tf.GradientTape() as tape: + states = tf.convert_to_tensor(states) + actions = tf.convert_to_tensor(actions) + old_probs = tf.convert_to_tensor(old_probs) - for future_index in range(start_index, mini_batch_size - 1): + # Forward pass through the actor network to get the action mean + predict_mean = self.__neural_network.actor_network(states) - # Calculate the temporal difference (TD) - delta = ( - rewards[future_index] - + ( - self.__gamma - * values[future_index + 1] - * (1 - int(dones[future_index])) - ) - - values[future_index] - ) + # Create the normal distribution with the predicted mean + new_dist = tfp.distributions.Normal(predict_mean, self.__std_dev) - # Accumulate the advantage using the discount factor - advantage += discount * delta + # Invert the tanh transformation to recover the original actions before tanh + untransformed_actions = tf.atanh(actions) - # Update the discount factor for the next step - discount *= self.__gamma * self.__gae_lambda + new_log_prob = new_dist.log_prob(untransformed_actions) - # Stop if a terminal state is reached - if dones[future_index]: - break + # Compute the log of the Jacobian for the tanh transformation + jacobian_log_det = tf.math.log(1 - tf.square(actions)) + adjusted_new_log_prob = new_log_prob - jacobian_log_det - # Save the calculated advantage for the current state - advantages[start_index] = advantage + # The ratio between the new model and the old model’s action log probabilities + prob_ratio = tf.exp(adjusted_new_log_prob - old_probs) - return advantages + # If the ratio is too large or too small, it will be + # clipped according to the surrogate function. + weighted_probs = prob_ratio * advantages + clipped_probs = tf.clip_by_value( + prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip + ) + weighted_clipped_probs = clipped_probs * advantages - # pylint: disable=too-many-arguments - # pylint: disable=too-many-locals - def learn(self, states, actions, old_probs, values, rewards, dones): - """Perform training to optimize model weights""" - - advantages = self.calculate_advantages(rewards, values, dones) - - # optimize Actor Network weights - with tf.GradientTape() as tape: - states = tf.convert_to_tensor(states) - actions = tf.convert_to_tensor(actions) - old_probs = tf.convert_to_tensor(old_probs) - - # Predicts an action based on the Saved states - predict = self.__neural_network.actor_network(states) - new_dist = tfp.distributions.Normal(predict, 0.1) - new_log_prob = new_dist.log_prob(actions) - transformed_action = tf.tanh(actions) - jacobian_log_det = tf.math.log(1 - tf.square(transformed_action)) - adjusted_new_log_prob = new_log_prob - jacobian_log_det - - # The ratio between the new model and the old model’s action log probabilities - prob_ratio = tf.exp(adjusted_new_log_prob - old_probs) - - # If the ratio is too large or too small, it will be - # clipped according to the surrogate function. 
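The atanh/Jacobian lines above implement the usual change of variables for a tanh-squashed Gaussian policy: log p(a) = log N(u; mean, std) - log(1 - tanh(u)^2), with a = tanh(u). A small NumPy sketch of that identity with made-up numbers (the clipped-surrogate code of the hunk continues right below); the helper name is illustrative.

import numpy as np

def tanh_gaussian_log_prob(u, mean, std, eps=1e-6):
    """log-prob of a = tanh(u) for u ~ Normal(mean, std), via the change of variables."""
    gauss_log_prob = -0.5 * ((u - mean) / std) ** 2 - np.log(std) - 0.5 * np.log(2.0 * np.pi)
    jacobian_log_det = np.log(1.0 - np.tanh(u) ** 2 + eps)
    return gauss_log_prob - jacobian_log_det

mean, std = 0.2, 0.05          # hypothetical actor output and exploration std-dev
u = 0.25                       # sample from the underlying Gaussian
a = np.tanh(u)                 # squashed action actually sent to the robot
u_back = np.arctanh(a)         # what tf.atanh(actions) recovers during learning
print(a, u_back, tanh_gaussian_log_prob(u, mean, std))
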
- weighted_probs = prob_ratio * advantages - clipped_probs = tf.clip_by_value( - prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip - ) - weighted_clipped_probs = clipped_probs * advantages + # Policy Gradient Loss + actor_loss = -tf.reduce_mean( + tf.minimum(weighted_probs, weighted_clipped_probs) + ) - # Policy Gradient Loss - actor_loss = -tf.reduce_mean( - tf.minimum(weighted_probs, weighted_clipped_probs) + # calculate gradient + actor_params = self.__neural_network.actor_network.trainable_variables + actor_grads = tape.gradient(actor_loss, actor_params) + self.__neural_network.actor_optimizer.apply_gradients( + zip(actor_grads, actor_params) ) - # calculate gradient - actor_params = self.__neural_network.actor_network.trainable_variables - actor_grads = tape.gradient(actor_loss, actor_params) - self.__neural_network.actor_optimizer.apply_gradients( - zip(actor_grads, actor_params) - ) - - # optimize Critic Network weights - with tf.GradientTape() as tape: + # optimize Critic Network weights + with tf.GradientTape() as tape: - # The critical value represents the expected return from state 𝑠𝑡. - # It provides an estimate of how good it is to be in a given state. - critic_value = self.__neural_network.critic_network(states) + # The critical value represents the expected return from state 𝑠𝑡. + # It provides an estimate of how good it is to be in a given state. + critic_value = self.__neural_network.critic_network(states) - # the total discounted reward accumulated from time step 𝑡 - estimate_returns = advantages + values + # the total discounted reward accumulated from time step 𝑡 + estimate_returns = advantages + values - # Generate loss - critic_loss = tf.math.reduce_mean( - tf.math.pow(estimate_returns - critic_value, 2) - ) + # Generate loss + critic_loss = tf.math.reduce_mean( + tf.math.pow(estimate_returns - critic_value, 2) + ) - # calculate gradient - critic_params = self.__neural_network.critic_network.trainable_variables - critic_grads = tape.gradient(critic_loss, critic_params) - self.__neural_network.critic_optimizer.apply_gradients( - zip(critic_grads, critic_params) - ) - self.actor_loss_history.append(actor_loss.numpy()) - self.critic_loss_history.append(critic_loss.numpy()) - self.reward_history.append(sum(rewards)) + # calculate gradient + critic_params = self.__neural_network.critic_network.trainable_variables + critic_grads = tape.gradient(critic_loss, critic_params) + self.__neural_network.critic_optimizer.apply_gradients( + zip(critic_grads, critic_params) + ) + self.actor_loss_history.append(actor_loss.numpy()) + self.critic_loss_history.append(critic_loss.numpy()) + self.reward_history.append(sum(rewards)) - # saving logs in a CSV file - self.save_logs_to_csv() + # saving logs in a CSV file + self.save_logs_to_csv() def save_logs_to_csv(self): """Function for saving logs in a CSV file""" @@ -481,7 +436,7 @@ def perform_training(self): if self.__current_batch is None: # Grab sample from memory - self.__current_batch = self.__memory.generate_batches() + self.__current_batch = self.__buffer.generate_batches() # Perform training with mini batches. 
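A compact NumPy sketch of the two losses assembled above (the clipped policy-gradient surrogate for the actor, and the squared error against advantages + values for the critic); all numbers and the helper name are made up for illustration.

import numpy as np

def ppo_losses(prob_ratio, advantages, values, critic_pred, policy_clip=0.2):
    """Return (actor_loss, critic_loss) in the same form as the TensorFlow code above."""
    weighted = prob_ratio * advantages
    clipped = np.clip(prob_ratio, 1.0 - policy_clip, 1.0 + policy_clip) * advantages
    actor_loss = -np.mean(np.minimum(weighted, clipped))

    estimate_returns = advantages + values            # regression target for the critic
    critic_loss = np.mean((estimate_returns - critic_pred) ** 2)
    return actor_loss, critic_loss

prob_ratio  = np.array([0.8, 1.0, 1.5], dtype=np.float32)   # new/old policy ratio
advantages  = np.array([0.3, -0.1, 0.4], dtype=np.float32)
values      = np.array([0.5, 0.6, 0.7], dtype=np.float32)   # stored critic estimates
critic_pred = np.array([0.55, 0.58, 0.90], dtype=np.float32)
print(ppo_losses(prob_ratio, advantages, values, critic_pred))
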
if self.__training_index < len(self.__current_batch[-1]): @@ -511,7 +466,7 @@ def perform_training(self): self.__training_index = 0 self.__current_batch = None self.done = False - self.__memory.clear_memory() + self.__buffer.clear_memory() self.state = "IDLE" self.num_episodes += 1 cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) @@ -526,7 +481,13 @@ def perform_training(self): self.__std_dev = max(self.__std_dev, MIN_STD_DEV) def reinitialize(self, robot_node): - """Re-initialization of position and orientation of The ROBOT.""" + """ + Re-initialization of position and orientation of The ROBOT. + + Parameters + ---------- + robot_node: The Robot interface + """ trans_field = robot_node.getField("translation") rot_field = robot_node.getField("rotation") initial_position = POSITION_DATA diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py index 4b20bf63..8efb92b3 100644 --- a/webots/controllers/RL_Supervisor/plotting.py +++ b/webots/controllers/RL_Supervisor/plotting.py @@ -1,5 +1,10 @@ """ Plotting script with Matplotlib """ +# Description: This script generates three separate plots using Matplotlib to visualize +# the Actor Loss, Critic Loss, and Reward over time based on data collected during the +# training process. Each plot represents the respective metric as a function of +# mini-batch steps. + # Imports import matplotlib.pyplot as plt # pylint: disable=import-error import pandas as pd # pylint: disable=import-error diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index 1e13460d..8ae0838a 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -132,7 +132,7 @@ def callback_line_sensors(self, payload: bytearray) -> None: if self.__no_line_detection_count > 0: reward = -1 else: - reward = self.__agent.calculate_reward(sensor_data) + reward = self.__agent.determine_reward(sensor_data) # Start storage The data after the second received sensor data if self.last_sensor_data is not None: @@ -272,17 +272,23 @@ def main_loop(): while supervisor.step(timestep) != -1: controller.process() - # Checks if the sequence has ended - if agent.state == "READY": - agent.update(robot_node) + match (agent.state): + case "READY": + agent.update(robot_node) - # Start the training - elif agent.state == "TRAINING": - controller.steps = 0 - agent.perform_training() + case "TRAINING": + controller.steps = 0 + agent.perform_training() + print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f}," + f"critic loss: {agent.critic_loss_history[-1]:.4f}") - if 1000 <= agent.num_episodes: - print(f"The number of episodes:{agent.num_episodes}") + # save model + if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0): + print(f"The number of episodes:{agent.num_episodes}") + agent.save_models() + + case "IDLE": + pass # Resent any unsent Data if agent.unsent_data: diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index 01767d06..44f17fce 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -40,18 +40,23 @@ ################################################################################ -class Memory: # pylint: disable=too-many-instance-attributes +class Buffer: # pylint: disable=too-many-instance-attributes """Class for store and manage experience tuples during 
reinforcement learning""" - def __init__(self, batch_size, max_length): + # pylint: disable=too-many-arguments + def __init__(self, batch_size, max_length, gamma, gae_lambda): self.__states = [] self.__probs = [] self.__vals = [] self.__actions = [] self.__rewards = [] self.__dones = [] + self.__batch_size = batch_size self.__max_length = max_length self.__batch_size = batch_size + self.__gamma = gamma + self.__gae_lambda = gae_lambda + self.__current_index = 0 def generate_batches(self): """ @@ -76,11 +81,9 @@ def generate_batches(self): # Create indices for the states and mix them randomly indices = np.arange(n_states, dtype=np.int64) - np.random.shuffle(indices) # Create batches by dividing the indices into groups of the batch_size batches = [indices[indx : indx + self.__batch_size] for indx in batch_start] - print("Number of Steps:", n_states) return ( np.array(self.__states), @@ -93,7 +96,13 @@ def generate_batches(self): ) def get_sum_rewards(self) -> float: - """Calculate total rewards""" + """ + Calculate total rewards. + + Returns + ---------- + float: Total rewards received. + """ sum_rewards = sum(self.__rewards) return sum_rewards @@ -108,8 +117,8 @@ def store_memory( ---------- state: The state observed. action: The action taken. - prob: The probability of taking the action. - val: The estimated value of the state. + probs: The probability of taking the action. + vals: The estimated value of the state. reward: The reward received. done: Whether the episode is done. """ @@ -119,9 +128,10 @@ def store_memory( self.__vals.append(vals) self.__rewards.append(reward) self.__dones.append(done) + self.__current_index += 1 def clear_memory(self): - """Remove transitions from the replay buffer.""" + """Remove transitions from the trajektories buffer.""" self.__states = [] self.__probs = [] @@ -132,12 +142,129 @@ def clear_memory(self): def is_memory_full(self): """ - Checks whether Memory has reached its maximum capacity. + Checks whether Memory has reached its maximum capacity. Returns ---------- - Bool: Memory is full or not """ - is_full = len(self.__states) >= self.__max_length + is_full = self.__current_index >= self.__max_length return is_full + + def calculate_advantages(self, rewards, values, dones): + """ + The function measures how much better or worse an action + performed in a given state compared to the average action + the policy would take in that state. + + Parameters + ---------- + rewards: The rewards received. + values: The estimated values of the states. + dones: Indicating whether the target sequence has been reached. + + Returns + ---------- + NumPy array of float32: the computed advantage values for each + state in a given batch of experiences. 
+ """ + + mini_batch_size = rewards.shape[0] + + # Create empty advantages array + advantages = np.zeros(mini_batch_size, dtype=np.float32) + + for start_index in range(mini_batch_size): + discount = 1 + advantage = 0 + + for future_index in range(start_index, mini_batch_size - 1): + + # Calculate the temporal difference (TD) + delta = ( + rewards[future_index] + + ( + self.__gamma + * values[future_index + 1] + * (1 - int(dones[future_index])) + ) + - values[future_index] + ) + + # Accumulate the advantage using the discount factor + advantage += discount * delta + + # Update the discount factor for the next step + discount *= self.__gamma * self.__gae_lambda + + # Stop if a terminal state is reached + if dones[future_index]: + break + + # Save the calculated advantage for the current state + advantages[start_index] = advantage + + return advantages + + def calculate_position(self, sensor_data): + """ + Determines the deviation and returns an estimated position of the robot + with respect to a line. The estimate is made using a weighted average of + the sensor indices multiplied by 1000, so that a return value of 0 + indicates that the line is directly below sensor 0, a return value of + 1000 indicates that the line is directly below sensor 1, 2000 + indicates that it's below sensor 2000, etc. Intermediate values + indicate that the line is between two sensors. The formula is: + + 0*value0 + 1000*value1 + 2000*value2 + ... + -------------------------------------------- + value0 + value1 + value2 + ... + + This function assumes a dark line (high values) surrounded by white + (low values). + + Parameters + ---------- + sensor_data : The state observed. + + Returns + ---------- + float: Estimated position with respect to track. + """ + estimated_pos = 0.0 + numerator = 0.0 + denominator = 0.0 + weight = 1000.0 + + for idx, sensor_value in enumerate(sensor_data): + numerator += idx * weight * sensor_value + denominator += sensor_value + + if denominator > 0: + estimated_pos = numerator / denominator + + return estimated_pos + + def calculate_reward(self, sensor_data): + """ + The calculate_reward function evaluates the consequences of a certain + action performed in a certain state by calculating the resulting reward. + A reward of 1 means that the robot is in the center of the Line. + + Parameters + ---------- + sensor_data : The state observed. 
+ + Returns + ---------- + float: the Resulting Reward + """ + reward = 0.0 + estimated_pos = self.calculate_position(sensor_data) + + # Reward scaled between 0 and 1 based on the estimated position + if 500 <= estimated_pos <= 3500: + reward = 1.0 - abs(estimated_pos - 2000.0) / 1500.0 + + return reward From d4f6d5a1bfb901b04b255d2bc46dee9470bbbf2d Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 26 Aug 2024 11:11:18 +0200 Subject: [PATCH 17/37] Replace match-case with if-else in RobotController class for Python 3.9 compatibility --- .../RL_Supervisor/rl_supervisor.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index 8ae0838a..f5f7ac8d 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -272,23 +272,21 @@ def main_loop(): while supervisor.step(timestep) != -1: controller.process() - match (agent.state): - case "READY": - agent.update(robot_node) - - case "TRAINING": - controller.steps = 0 - agent.perform_training() - print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f}," - f"critic loss: {agent.critic_loss_history[-1]:.4f}") + if agent.state == "READY": + agent.update(robot_node) + + # Start the training + elif agent.state == "TRAINING": + supervisor.last_sensor_data = None + controller.steps = 0 + agent.perform_training() - # save model - if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0): - print(f"The number of episodes:{agent.num_episodes}") - agent.save_models() + print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f}," + f"critic loss: {agent.critic_loss_history[-1]:.4f}") - case "IDLE": - pass + # save model + if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0): + agent.save_models() # Resent any unsent Data if agent.unsent_data: From da2ac90fb08c2aa49bedd5d870980f816174e180 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 3 Sep 2024 11:47:51 +0200 Subject: [PATCH 18/37] Replace member variable with local variable in loop() to prevent side effects --- lib/APPReinforcementLearning/src/App.cpp | 15 ++++++++------- lib/APPReinforcementLearning/src/App.h | 4 ---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index 9780d434..a4cf8cb9 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -96,6 +96,7 @@ void App::loop() { Board::getInstance().process(); Speedometer::getInstance().process(); + bool isDataSent = true; if (true == m_controlInterval.isTimeout()) { @@ -124,7 +125,7 @@ void App::loop() payload = {SMPChannelPayload::Status::DONE}; } - m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); + isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); m_statusTimer.restart(); } @@ -133,7 +134,7 @@ void App::loop() if (true == m_sendLineSensorsDataInterval.isTimeout() && (&DrivingState::getInstance() == m_systemStateMachine.getState())) { - m_dataSent = sendLineSensorsData(); + isDataSent = sendLineSensorsData(); m_sendLineSensorsDataInterval.restart(); } @@ -148,13 +149,13 @@ void App::loop() SMPChannelPayload::Mode payload = (1 == mode_options) ? 
SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; - m_dataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); + isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); m_modeSelectionSent = true; } } - if (false == m_dataSent) + if (false == isDataSent) { /* Failed to send data to the supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("DSF"); @@ -207,7 +208,7 @@ bool App::sendLineSensorsData() const uint8_t maxLineSensors = lineSensors.getNumLineSensors(); const uint16_t* lineSensorValues = lineSensors.getSensorValues(); uint8_t lineSensorIdx = 0U; - bool isDataSent = true; + bool isPayloadSent = true; LineSensorData payload; if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) @@ -220,9 +221,9 @@ bool App::sendLineSensorsData() const } } - isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); + isPayloadSent = m_smpServer.sendData(m_serialMuxProtChannelIdLineSensors, &payload, sizeof(payload)); - return isDataSent; + return isPayloadSent; } bool App::setupSerialMuxProt() diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index bd3c6a1e..6fec26c9 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -73,7 +73,6 @@ class App m_statusTimer(), m_sendLineSensorsDataInterval(), m_modeSelectionSent(false), - m_dataSent(true), m_smpServer(Serial, this) { } @@ -142,9 +141,6 @@ class App /** Ensue that the mode is only sent once */ bool m_modeSelectionSent; - /** check if the data has been sent */ - bool m_dataSent; - /** * Setup the SerialMuxProt channels. * From d39d111585829c2fcf8955ae0d3c603669fa5f2f Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 3 Sep 2024 11:52:23 +0200 Subject: [PATCH 19/37] Increase the duration of the selected mode and the duration to handle state changes. --- lib/APPReinforcementLearning/src/ReadyState.cpp | 1 + lib/APPReinforcementLearning/src/ReadyState.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp index 1af24879..fc8babaa 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.cpp +++ b/lib/APPReinforcementLearning/src/ReadyState.cpp @@ -125,6 +125,7 @@ void ReadyState::process(StateMachine& sm) if (true == m_stateTransitionTimer.isTimeout()) { sm.setState(&DrivingState::getInstance()); + m_stateTransitionTimer.restart(); } } diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h index 5cef49ec..8626a46a 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.h +++ b/lib/APPReinforcementLearning/src/ReadyState.h @@ -117,10 +117,10 @@ class ReadyState : public IState }; /** Duration of the selected mode in ms. This is the maximum time to select a mode. */ - static const uint32_t MODE_SELECTED_PERIOD = 100U; + static const uint32_t MODE_SELECTED_PERIOD = 2000U; /** Duration to handle State changes in ms. */ - static const uint32_t STATE_TRANSITION_PERIOD = 150U; + static const uint32_t STATE_TRANSITION_PERIOD = 2100U; /** * The line sensor threshold (normalized) used to detect the track. 
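Background for the next patch ("fix bugs in calculate_advantages, and update hyperparameter values"): the sketch below is a minimal, standalone version of the Generalized Advantage Estimation (GAE) step that the trajectory buffer performs. It is an illustration only, not the repository's exact implementation; the function name, the default gamma and gae_lambda values, and the assumption that the buffer holds a single recorded trajectory with per-step done flags are assumptions made for this example.

import numpy as np

def calculate_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """Minimal GAE sketch: rewards, values and dones are per-step sequences
    of equal length, assumed to come from a single recorded trajectory."""
    data_size = len(rewards)
    advantages = np.zeros(data_size, dtype=np.float32)

    for start_index in range(data_size - 1):
        discount = 1.0
        advantage = 0.0

        for future_index in range(start_index, data_size - 1):
            # Temporal-difference error:
            # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
            delta = (
                rewards[future_index]
                + gamma * values[future_index + 1] * (1.0 - float(dones[future_index]))
                - values[future_index]
            )

            # Discounted sum of TD errors, weighted by (gamma * gae_lambda)^k.
            advantage += discount * delta
            discount *= gamma * gae_lambda

        advantages[start_index] = advantage

    return advantages

# Example with hypothetical per-step data (three steps, terminal at the end):
print(calculate_advantages([1.0, 0.8, 0.5], [0.9, 0.7, 0.4], [False, False, True]))

A common alternative is a single backward pass using A_t = delta_t + gamma * gae_lambda * (1 - done_t) * A_{t+1}, which avoids the quadratic inner loop shown above.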
From 812fbb50fce48ac517980c91be5a440cdc590654 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 3 Sep 2024 12:04:37 +0200 Subject: [PATCH 20/37] Rename classes, fix bugs in calculate_advantages, and update hyperparameter values --- webots/controllers/RL_Supervisor/agent.py | 70 ++++++++++--------- webots/controllers/RL_Supervisor/networks.py | 23 +++--- .../RL_Supervisor/rl_supervisor.py | 26 +++---- .../RL_Supervisor/trajectory_buffer.py | 32 +++++---- 4 files changed, 81 insertions(+), 70 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 69494144..f45ef80a 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -33,8 +33,8 @@ import numpy as np # pylint: disable=import-error import tensorflow as tf # pylint: disable=import-error import tensorflow_probability as tfp # pylint: disable=import-error -from trajectory_buffer import Buffer -from networks import Networks +from trajectory_buffer import Memory +from networks import Models ################################################################################ # Variables @@ -57,16 +57,16 @@ MODE_CHANNEL_NAME = "MODE" CMD_ID_SET_READY_STATE = 1 CMD_ID_SET_TRAINING_STATE = 2 -POSITION_DATA = [-0.24713614078815466, 0.01, 0.013994298332013683] +POSITION_DATA = [-0.24713614078815466, -0.04863962992854465, 0.013994298332013683] ORIENTATION_DATA = [ -1.0564747468923541e-06, 8.746699709178704e-07, 0.9999999999990595, - 1.5880805820884731, + 1.5880805820884731 ] MAX_SENSOR_VALUE = 1000 -MIN_STD_DEV = 0.1 # Minimum standard deviation -STD_DEV_FACTOR = 0.9995 # Discounter standard deviation factor +MIN_STD_DEV = 0.01 # Minimum standard deviation +STD_DEV_FACTOR = 0.995 # Discounter standard deviation factor ################################################################################ # Classes @@ -74,15 +74,18 @@ class Agent: # pylint: disable=too-many-instance-attributes - """The Agent class represents an intelligent agent that makes decisions to - control motors based on the position of the robot.""" + """ + The Agent class represents an intelligent agent that makes decisions to + control motors based on the position of the robot. + """ # pylint: disable=too-many-arguments def __init__( self, smp_server, gamma=0.99, - alpha=0.0003, + actor_alpha=0.0001, + critic_alpha=0.0003, gae_lambda=0.95, policy_clip=0.2, batch_size=64, @@ -91,17 +94,16 @@ def __init__( max_buffer_length=65536, ): self.__serialmux = smp_server - self.__alpha = alpha self.__policy_clip = policy_clip self.__chkpt_dir = chkpt_dir self.train_mode = False self.__top_speed = top_speed - self.__buffer = Buffer(batch_size, max_buffer_length, gamma, gae_lambda) - self.__neural_network = Networks(self.__alpha) + self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda) + self.__neural_network = Models(actor_alpha, critic_alpha) self.__training_index = 0 # Track batch index during training self.__current_batch = None # Saving of the current batch which is in process - self.__std_dev = 0.9 - self.n_epochs = 10 + self.__std_dev = 0.05 + self.n_epochs = 3 self.done = False self.action = None self.value = None @@ -139,7 +141,7 @@ def store_transition( reward: The reward received. done: Indicating whether the target sequence has been reached. 
""" - self.__buffer.store_memory(state, action, probs, value, reward, done) + self.__memory.store_memory(state, action, probs, value, reward, done) def save_models(self): """Saves the models in the specified file.""" @@ -250,7 +252,7 @@ def update(self, robot_node): # Checks whether the sequence has ended if it is set to Training mode. if (self.train_mode is True) and ( - (self.done is True) or (self.__buffer.is_memory_full() is True) + (self.done is True) or (self.__memory.is_memory_full() is True) ): cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE) self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) @@ -277,11 +279,13 @@ def update(self, robot_node): if self.data_sent is False: self.unsent_data.append(("SPEED_SET", motorcontrol)) + self.reinitialize(robot_node) self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) # Failed to send data. Appends the data to unsent_data List if self.data_sent is False: self.unsent_data.append(("CMD", cmd_payload)) + self.state = "IDLE" def normalize_sensor_data(self, sensor_data): """ @@ -314,29 +318,26 @@ def determine_reward(self, sensor_data): float: the Resulting Reward """ - reward = self.__buffer.calculate_reward(sensor_data) + reward = self.__memory.calculate_reward(sensor_data) return reward # pylint: disable=too-many-arguments # pylint: disable=too-many-locals - def learn(self, states, actions, old_probs, values, rewards, dones): + def learn(self, states, actions, old_probs, values, rewards, advantages): """ Perform training to optimize model weights. Parameters ---------- - states: The saved states observed during interactions with the environment. - actions: The saved actions taken in response to the observed states. - old_probs: The saved probabilities of the actions taken, based on the previous policy. - values: The saved estimated values of the observed states. - rewards: The saved rewards received for taking the actions. - dones: The saved flags indicating whether the target sequence or episode has - been completed. + states: The saved states observed during interactions with the environment. + actions: The saved actions taken in response to the observed states. + old_probs: The saved probabilities of the actions taken, based on the previous policy. + values: The saved estimated values of the observed states. + rewards: The saved rewards received for taking the actions. + advantages: the computed advantage values for each state in a given Data size. """ - for _ in range(self.n_epochs): - #the computed advantage values for each state in a given batch of experiences. - advantages = self.__buffer.calculate_advantages(rewards, values, dones) + for _ in range(self.n_epochs): # optimize Actor Network weights with tf.GradientTape() as tape: @@ -407,8 +408,8 @@ def learn(self, states, actions, old_probs, values, rewards, dones): self.critic_loss_history.append(critic_loss.numpy()) self.reward_history.append(sum(rewards)) - # saving logs in a CSV file - self.save_logs_to_csv() + # saving logs in a CSV file + self.save_logs_to_csv() def save_logs_to_csv(self): """Function for saving logs in a CSV file""" @@ -436,7 +437,7 @@ def perform_training(self): if self.__current_batch is None: # Grab sample from memory - self.__current_batch = self.__buffer.generate_batches() + self.__current_batch = self.__memory.generate_batches() # Perform training with mini batches. 
if self.__training_index < len(self.__current_batch[-1]): @@ -446,18 +447,19 @@ def perform_training(self): old_prob_arr, vals_arr, reward_arr, - dones_arr, + advatage_arr, batches, ) = self.__current_batch batch = batches[self.__training_index] + # pylint: disable=too-many-arguments self.learn( state_arr[batch], action_arr[batch], old_prob_arr[batch], vals_arr[batch], reward_arr[batch], - dones_arr[batch], + advatage_arr[batch] ) self.__training_index += 1 @@ -466,7 +468,7 @@ def perform_training(self): self.__training_index = 0 self.__current_batch = None self.done = False - self.__buffer.clear_memory() + self.__memory.clear_memory() self.state = "IDLE" self.num_episodes += 1 cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index f3624593..1535330c 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -28,9 +28,9 @@ # Imports ################################################################################ -import tensorflow as tf # pylint: disable=import-error -from tensorflow.keras import layers # pylint: disable=import-error -from tensorflow.keras.regularizers import l2 # pylint: disable=import-error +from tensorflow import keras # pylint: disable=import-error +from keras import layers # pylint: disable=import-error +from keras.regularizers import l2 # pylint: disable=import-error ################################################################################ # Variables @@ -42,15 +42,16 @@ ################################################################################ -class Networks: - """Class for building networks of actors and critics""" +class Models: + """Class for building networks of actors and critics.""" - def __init__(self, alpha): - self.__learning_rate = alpha + def __init__(self, actor_alpha, critic_alpha): + self.__actor_learning_rate = actor_alpha + self.__critic_learning_rate = critic_alpha self.actor_network = self.build_actor_network() self.critic_network = self.build_critic_network() - self.actor_optimizer = tf.keras.optimizers.Adam(self.__learning_rate) - self.critic_optimizer = tf.keras.optimizers.Adam(self.__learning_rate) + self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate) + self.critic_optimizer = keras.optimizers.Adam(self.__critic_learning_rate) def build_actor_network(self): """Build Actor Network.""" @@ -84,7 +85,7 @@ def build_actor_network(self): bias_initializer="zeros", )(fc3) - return tf.keras.models.Model(inputs=state_input, outputs=mean) + return keras.models.Model(inputs=state_input, outputs=mean) def build_critic_network(self): """Build Critic Network""" @@ -113,7 +114,7 @@ def build_critic_network(self): )(fc2) value = layers.Dense(1)(fc3) # Value output - return tf.keras.models.Model(inputs=state_input, outputs=value) + return keras.models.Model(inputs=state_input, outputs=value) ################################################################################ diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index f5f7ac8d..b620956a 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -64,7 +64,7 @@ MODE_CHANNEL_NAME = "MODE" -MIN_NUMBER_OF_STEPS = 200 +MIN_NUMBER_OF_STEPS = 400 SENSOR_ID_MOST_LEFT = 0 SENSOR_ID_MOST_RIGHT = 4 @@ -76,8 +76,8 @@ ################################################################################ 
-class RobotController: - """Class for data flow control logic""" +class RobotController: # pylint: disable=too-many-instance-attributes + """Class for data flow control logic.""" def __init__(self, smp_server, tick_size, agent): self.__smp_server = smp_server @@ -86,6 +86,7 @@ def __init__(self, smp_server, tick_size, agent): self.__no_line_detection_count = 0 self.__timestamp = 0 # Elapsed time since reset [ms] self.last_sensor_data = None + self.start_stop_line_detected = False self.steps = 0 def callback_status(self, payload: bytearray) -> None: @@ -94,8 +95,7 @@ def callback_status(self, payload: bytearray) -> None: # perform action on robot status feedback if payload[0] == STATUS_CHANNEL_ERROR_VAL: print("robot has reached error-state (max. lap time passed in robot)") - - self.__agent.done = 1 + self.__agent.done = True def callback_line_sensors(self, payload: bytearray) -> None: """Callback LINE_SENS Channel.""" @@ -109,21 +109,24 @@ def callback_line_sensors(self, payload: bytearray) -> None: self.__no_line_detection_count = 0 # Detect start/stop line - is_start_stop = all( - value >= LINE_SENSOR_ON_TRACK_MIN_VALUE for value in sensor_data - ) + if ((sensor_data[SENSOR_ID_MOST_LEFT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE) and + (sensor_data[SENSOR_ID_MOST_RIGHT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)): + self.start_stop_line_detected = True + # Detect Start/Stop Line before Finish Trajectories - if (is_start_stop is True) and (self.steps < MIN_NUMBER_OF_STEPS): + if (self.start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS): sensor_data = list(sensor_data) sensor_data[SENSOR_ID_MOST_LEFT] = 0 sensor_data[SENSOR_ID_MOST_RIGHT] = 0 + self.start_stop_line_detected = False # sequence stop criterion debounce no line detection and start/stop line detected if self.__no_line_detection_count >= 30 or ( - is_start_stop and (self.steps >= MIN_NUMBER_OF_STEPS) + (self.start_stop_line_detected is True) and (self.steps >= MIN_NUMBER_OF_STEPS) ): self.__agent.done = True self.__no_line_detection_count = 0 + self.steps = 0 # The sequence of states and actions is stored in memory for the training phase. 
if self.__agent.train_mode: @@ -136,7 +139,7 @@ def callback_line_sensors(self, payload: bytearray) -> None: # Start storage The data after the second received sensor data if self.last_sensor_data is not None: - normalized_sensor_data = self.__agent.normalize_sensor_data(sensor_data) + normalized_sensor_data = self.__agent.normalize_sensor_data(self.last_sensor_data) self.__agent.store_transition( normalized_sensor_data, self.__agent.action, @@ -278,7 +281,6 @@ def main_loop(): # Start the training elif agent.state == "TRAINING": supervisor.last_sensor_data = None - controller.steps = 0 agent.perform_training() print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f}," diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index 44f17fce..151dcf52 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -40,8 +40,8 @@ ################################################################################ -class Buffer: # pylint: disable=too-many-instance-attributes - """Class for store and manage experience tuples during reinforcement learning""" +class Memory: # pylint: disable=too-many-instance-attributes + """Class for store and manage experience tuples during Reinforcement learning.""" # pylint: disable=too-many-arguments def __init__(self, batch_size, max_length, gamma, gae_lambda): @@ -51,6 +51,7 @@ def __init__(self, batch_size, max_length, gamma, gae_lambda): self.__actions = [] self.__rewards = [] self.__dones = [] + self.__advatages = [] self.__batch_size = batch_size self.__max_length = max_length self.__batch_size = batch_size @@ -85,14 +86,18 @@ def generate_batches(self): # Create batches by dividing the indices into groups of the batch_size batches = [indices[indx : indx + self.__batch_size] for indx in batch_start] + # the computed advantage values for each state in a given Data size. + self.__advatages = self.calculate_advantages(self.__rewards, + self.__vals, self.__dones) return ( np.array(self.__states), np.array(self.__actions), np.array(self.__probs), np.array(self.__vals), np.array(self.__rewards), - np.array(self.__dones), + np.array(self.__advatages), batches, + ) def get_sum_rewards(self) -> float: @@ -139,6 +144,8 @@ def clear_memory(self): self.__actions = [] self.__rewards = [] self.__dones = [] + self.__advatages = [] + self.__current_index = 0 def is_memory_full(self): """ @@ -148,8 +155,11 @@ def is_memory_full(self): ---------- - Bool: Memory is full or not """ + is_full = False + + if self.__current_index >= self.__max_length: + is_full = True - is_full = self.__current_index >= self.__max_length return is_full def calculate_advantages(self, rewards, values, dones): @@ -167,19 +177,19 @@ def calculate_advantages(self, rewards, values, dones): Returns ---------- NumPy array of float32: the computed advantage values for each - state in a given batch of experiences. + state in a given Data size. 
""" - mini_batch_size = rewards.shape[0] + data_size = len(rewards) # Create empty advantages array - advantages = np.zeros(mini_batch_size, dtype=np.float32) + advantages = np.zeros(data_size, dtype=np.float32) - for start_index in range(mini_batch_size): + for start_index in range(data_size-1): discount = 1 advantage = 0 - for future_index in range(start_index, mini_batch_size - 1): + for future_index in range(start_index, data_size - 1): # Calculate the temporal difference (TD) delta = ( @@ -198,10 +208,6 @@ def calculate_advantages(self, rewards, values, dones): # Update the discount factor for the next step discount *= self.__gamma * self.__gae_lambda - # Stop if a terminal state is reached - if dones[future_index]: - break - # Save the calculated advantage for the current state advantages[start_index] = advantage From f8f5bc22376d12bcfd5e8d4f900382eba925b0bd Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 3 Sep 2024 12:55:47 +0200 Subject: [PATCH 21/37] Correct return type of getSelectedMode function in ReadyState class --- .../src/ReadyState.cpp | 2 +- lib/APPReinforcementLearning/src/ReadyState.h | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/APPReinforcementLearning/src/ReadyState.cpp b/lib/APPReinforcementLearning/src/ReadyState.cpp index fc8babaa..415d6f7f 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.cpp +++ b/lib/APPReinforcementLearning/src/ReadyState.cpp @@ -140,7 +140,7 @@ void ReadyState::setLapTime(uint32_t lapTime) m_lapTime = lapTime; } -uint8_t ReadyState::getSelectedMode() +ReadyState::Mode ReadyState::getSelectedMode() { return m_mode; } diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h index 8626a46a..93949f32 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.h +++ b/lib/APPReinforcementLearning/src/ReadyState.h @@ -59,6 +59,16 @@ class ReadyState : public IState { public: + /** + * The mode that can be selected. + */ + enum Mode: uint8_t + { + IDLE = 0, /**< No mode has been selected*/ + DRIVING_MODE, /**< Driving mode Selected. */ + TRAINING_MODE /**< Training mode Selected. */ + }; + /** * Get state instance. * @@ -100,21 +110,12 @@ class ReadyState : public IState /** * Set the selected mode. * - * @return It returns 1 if DrivingMode is selected or 2 if TrainingMode is selected. + * @return It returns 1 if DrivingMode is selected or 2 if TrainingMode is selected, otherwise 0. */ - uint8_t getSelectedMode(); + Mode getSelectedMode(); protected: private: - /** - * The mode that can be selected. - */ - enum Mode - { - IDLE = 0, /**< No mode has been selected*/ - DRIVING_MODE, /**< Driving mode Selected. */ - TRAINING_MODE /**< Training mode Selected. */ - }; /** Duration of the selected mode in ms. This is the maximum time to select a mode. 
*/ static const uint32_t MODE_SELECTED_PERIOD = 2000U; From b3996880c7acf6e0490ea4a34ed14df393ac7acf Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 9 Sep 2024 17:03:03 +0200 Subject: [PATCH 22/37] improve code formatting and update comments for clarity --- lib/APPReinforcementLearning/src/ReadyState.h | 2 +- lib/APPReinforcementLearning/src/TrainingState.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/lib/APPReinforcementLearning/src/ReadyState.h b/lib/APPReinforcementLearning/src/ReadyState.h index 93949f32..bb827e55 100644 --- a/lib/APPReinforcementLearning/src/ReadyState.h +++ b/lib/APPReinforcementLearning/src/ReadyState.h @@ -110,7 +110,7 @@ class ReadyState : public IState /** * Set the selected mode. * - * @return It returns 1 if DrivingMode is selected or 2 if TrainingMode is selected, otherwise 0. + * @return It returns the selected Mode. */ Mode getSelectedMode(); diff --git a/lib/APPReinforcementLearning/src/TrainingState.h b/lib/APPReinforcementLearning/src/TrainingState.h index 737d7a89..a5ad381f 100644 --- a/lib/APPReinforcementLearning/src/TrainingState.h +++ b/lib/APPReinforcementLearning/src/TrainingState.h @@ -131,7 +131,3 @@ class TrainingState : public IState #endif /* TRAINING_STATE_H */ /** @} */ - -/****************************************************************************** - * Local Functions - *****************************************************************************/ From 0260fadac1e9b9d3bd77d17847e1098882a92447 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 9 Sep 2024 17:04:36 +0200 Subject: [PATCH 23/37] Add conditions to verify and identify failed data --- lib/APPReinforcementLearning/src/App.cpp | 32 ++++++++++++++++++------ lib/APPReinforcementLearning/src/App.h | 2 +- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index a4cf8cb9..f7a9e725 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -96,7 +96,9 @@ void App::loop() { Board::getInstance().process(); Speedometer::getInstance().process(); - bool isDataSent = true; + bool isSensorDataSent = true; + bool isModeDataSent = true; + bool isStatusDataSent = true; if (true == m_controlInterval.isTimeout()) { @@ -125,7 +127,7 @@ void App::loop() payload = {SMPChannelPayload::Status::DONE}; } - isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); + isStatusDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); m_statusTimer.restart(); } @@ -134,7 +136,7 @@ void App::loop() if (true == m_sendLineSensorsDataInterval.isTimeout() && (&DrivingState::getInstance() == m_systemStateMachine.getState())) { - isDataSent = sendLineSensorsData(); + isSensorDataSent = sendLineSensorsData(); m_sendLineSensorsDataInterval.restart(); } @@ -149,16 +151,30 @@ void App::loop() SMPChannelPayload::Mode payload = (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; - isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); + isModeDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); m_modeSelectionSent = true; } } - if (false == isDataSent) + if (false == isSensorDataSent) { - /* Failed to send data to the supervisor. Go to error state. */ - ErrorState::getInstance().setErrorMsg("DSF"); + /* Failed to send Senssor data to the supervisor. Go to error state. 
*/ + ErrorState::getInstance().setErrorMsg("SEND_SF"); + m_systemStateMachine.setState(&ErrorState::getInstance()); + } + + if (false == isStatusDataSent) + { + /* Failed to send Status data to the supervisor. Go to error state. */ + ErrorState::getInstance().setErrorMsg("SD_SF"); + m_systemStateMachine.setState(&ErrorState::getInstance()); + } + + if (false == isModeDataSent) + { + /* Failed to send Mode data to the supervisor. Go to error state. */ + ErrorState::getInstance().setErrorMsg("MD_SF"); m_systemStateMachine.setState(&ErrorState::getInstance()); } @@ -208,7 +224,7 @@ bool App::sendLineSensorsData() const uint8_t maxLineSensors = lineSensors.getNumLineSensors(); const uint16_t* lineSensorValues = lineSensors.getSensorValues(); uint8_t lineSensorIdx = 0U; - bool isPayloadSent = true; + bool isPayloadSent = true; LineSensorData payload; if (LINE_SENSOR_CHANNEL_DLC == (maxLineSensors * sizeof(uint16_t))) diff --git a/lib/APPReinforcementLearning/src/App.h b/lib/APPReinforcementLearning/src/App.h index 6fec26c9..c15d3281 100644 --- a/lib/APPReinforcementLearning/src/App.h +++ b/lib/APPReinforcementLearning/src/App.h @@ -151,7 +151,7 @@ class App /** * Send line sensors data via SerialMuxProt. * - * * @return true if data has been sent, false otherwise. + * @return true if data has been sent, false otherwise. */ bool sendLineSensorsData() const; From 2add9fa863045fe234c37b78b3cce95b753edb22 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 9 Sep 2024 17:17:16 +0200 Subject: [PATCH 24/37] refactor: enhance clarity in RL_Supervisor, Agent, and networks code --- webots/controllers/RL_Supervisor/agent.py | 164 +++++++----------- webots/controllers/RL_Supervisor/networks.py | 133 +++++++++++++- .../RL_Supervisor/rl_supervisor.py | 56 +++--- 3 files changed, 212 insertions(+), 141 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index f45ef80a..20ef40ac 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -27,14 +27,15 @@ ################################################################################ # Imports ################################################################################ + import csv import os import struct -import numpy as np # pylint: disable=import-error -import tensorflow as tf # pylint: disable=import-error -import tensorflow_probability as tfp # pylint: disable=import-error -from trajectory_buffer import Memory -from networks import Models +import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp +from trajectory_buffer import Memory +from networks import Models ################################################################################ # Variables @@ -68,6 +69,16 @@ MIN_STD_DEV = 0.01 # Minimum standard deviation STD_DEV_FACTOR = 0.995 # Discounter standard deviation factor +TRANSLATION_FIELD = "translation" +ROTATION_FIELD = "rotation" + +IDLE = "IDLE_STATE" +READY = "READY_STATE" +TRAINING = "TRAINING_STATE" + +DIRECTORY = "logs" +FILE_DIRECTORY = "training_logs.csv" + ################################################################################ # Classes ################################################################################ @@ -94,37 +105,34 @@ def __init__( max_buffer_length=65536, ): self.__serialmux = smp_server - self.__policy_clip = policy_clip self.__chkpt_dir = chkpt_dir self.train_mode = False self.__top_speed = top_speed + self.__std_dev = 0.05 self.__memory = Memory(batch_size, max_buffer_length, 
gamma, gae_lambda) - self.__neural_network = Models(actor_alpha, critic_alpha) + self.__neural_network = Models(actor_alpha, critic_alpha, self.__std_dev, policy_clip) self.__training_index = 0 # Track batch index during training self.__current_batch = None # Saving of the current batch which is in process - self.__std_dev = 0.05 self.n_epochs = 3 self.done = False self.action = None self.value = None self.adjusted_log_prob = None self.num_episodes = 0 - self.state = "IDLE" + self.state = IDLE self.data_sent = True self.unsent_data = [] - self.critic_loss_history = [] - self.actor_loss_history = [] self.reward_history = [] def set_train_mode(self): """Set the Agent mode to train mode.""" self.train_mode = True - self.state = "READY" + self.state = READY def set_drive_mode(self): """Set the Agent mode to drive mode.""" self.train_mode = False - self.state = "READY" + self.state = READY self.num_episodes = 0 def store_transition( @@ -235,11 +243,11 @@ def send_motor_speeds(self, state): right_motor_speed = int(self.__top_speed + speed_difference) control_data = struct.pack("2H", left_motor_speed, right_motor_speed) - self.data_sent = self.__serialmux.send_data("SPEED_SET", control_data) + self.data_sent = self.__serialmux.send_data(SPEED_SET_CHANNEL_NAME, control_data) # Failed to send data. Appends the data to unsent_data List. if self.data_sent is False: - self.unsent_data.append(("SPEED_SET", control_data)) + self.unsent_data.append((SPEED_SET_CHANNEL_NAME, control_data)) def update(self, robot_node): """ @@ -251,19 +259,19 @@ def update(self, robot_node): """ # Checks whether the sequence has ended if it is set to Training mode. - if (self.train_mode is True) and ( - (self.done is True) or (self.__memory.is_memory_full() is True) - ): + if self.train_mode is True and ( + self.done is True or self.__memory.is_memory_full() is True): + cmd_payload = struct.pack("B", CMD_ID_SET_TRAINING_STATE) - self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) + self.data_sent = self.__serialmux.send_data(COMMAND_CHANNEL_NAME, cmd_payload) # Failed to send data. Appends the data to unsent_data List. if self.data_sent is False: - self.unsent_data.append(("CMD", cmd_payload)) + self.unsent_data.append((COMMAND_CHANNEL_NAME, cmd_payload)) # Stopping condition for sequence was reached. self.reinitialize(robot_node) - self.state = "TRAINING" + self.state = TRAINING # Checks whether the sequence has ended if it is set to driving mode. if (self.train_mode is False) and (self.done is True): @@ -272,20 +280,20 @@ def update(self, robot_node): cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) self.data_sent = self.__serialmux.send_data( - "SPEED_SET", motorcontrol + SPEED_SET_CHANNEL_NAME, motorcontrol ) # stop the motors immediately # Failed to send data. Appends the data to unsent_data List if self.data_sent is False: - self.unsent_data.append(("SPEED_SET", motorcontrol)) + self.unsent_data.append((SPEED_SET_CHANNEL_NAME, motorcontrol)) self.reinitialize(robot_node) - self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) + self.data_sent = self.__serialmux.send_data(COMMAND_CHANNEL_NAME, cmd_payload) # Failed to send data. 
Appends the data to unsent_data List if self.data_sent is False: - self.unsent_data.append(("CMD", cmd_payload)) - self.state = "IDLE" + self.unsent_data.append((COMMAND_CHANNEL_NAME, cmd_payload)) + self.state = IDLE def normalize_sensor_data(self, sensor_data): """ @@ -300,8 +308,8 @@ def normalize_sensor_data(self, sensor_data): NumPy array of float32: Normalized Sensor Data """ - sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE - return sensor_data + normalized_sensor_data = np.array(sensor_data) / MAX_SENSOR_VALUE + return normalized_sensor_data def determine_reward(self, sensor_data): """ @@ -315,14 +323,12 @@ def determine_reward(self, sensor_data): Returns ---------- - float: the Resulting Reward + float: The Resulting Reward. """ reward = self.__memory.calculate_reward(sensor_data) return reward - # pylint: disable=too-many-arguments - # pylint: disable=too-many-locals def learn(self, states, actions, old_probs, values, rewards, advantages): """ Perform training to optimize model weights. @@ -336,88 +342,34 @@ def learn(self, states, actions, old_probs, values, rewards, advantages): rewards: The saved rewards received for taking the actions. advantages: the computed advantage values for each state in a given Data size. """ + # scales the sensor data to a range between 0 and 1 + m_states = self.normalize_sensor_data(states) for _ in range(self.n_epochs): - # optimize Actor Network weights - with tf.GradientTape() as tape: - states = tf.convert_to_tensor(states) - actions = tf.convert_to_tensor(actions) - old_probs = tf.convert_to_tensor(old_probs) - - # Forward pass through the actor network to get the action mean - predict_mean = self.__neural_network.actor_network(states) - - # Create the normal distribution with the predicted mean - new_dist = tfp.distributions.Normal(predict_mean, self.__std_dev) - - # Invert the tanh transformation to recover the original actions before tanh - untransformed_actions = tf.atanh(actions) - - new_log_prob = new_dist.log_prob(untransformed_actions) + states = tf.convert_to_tensor(m_states) + actions = tf.convert_to_tensor(actions) + old_probs = tf.convert_to_tensor(old_probs) - # Compute the log of the Jacobian for the tanh transformation - jacobian_log_det = tf.math.log(1 - tf.square(actions)) - adjusted_new_log_prob = new_log_prob - jacobian_log_det - - # The ratio between the new model and the old model’s action log probabilities - prob_ratio = tf.exp(adjusted_new_log_prob - old_probs) - - # If the ratio is too large or too small, it will be - # clipped according to the surrogate function. - weighted_probs = prob_ratio * advantages - clipped_probs = tf.clip_by_value( - prob_ratio, 1 - self.__policy_clip, 1 + self.__policy_clip - ) - weighted_clipped_probs = clipped_probs * advantages - - # Policy Gradient Loss - actor_loss = -tf.reduce_mean( - tf.minimum(weighted_probs, weighted_clipped_probs) - ) - - # calculate gradient - actor_params = self.__neural_network.actor_network.trainable_variables - actor_grads = tape.gradient(actor_loss, actor_params) - self.__neural_network.actor_optimizer.apply_gradients( - zip(actor_grads, actor_params) - ) + # optimize Actor Network weights + self.__neural_network.compute_actor_gradient(states, actions, old_probs, advantages) # optimize Critic Network weights - with tf.GradientTape() as tape: - - # The critical value represents the expected return from state 𝑠𝑡. - # It provides an estimate of how good it is to be in a given state. 
- critic_value = self.__neural_network.critic_network(states) - - # the total discounted reward accumulated from time step 𝑡 - estimate_returns = advantages + values + self.__neural_network.compute_critic_gradient(states, values, advantages) - # Generate loss - critic_loss = tf.math.reduce_mean( - tf.math.pow(estimate_returns - critic_value, 2) - ) - - # calculate gradient - critic_params = self.__neural_network.critic_network.trainable_variables - critic_grads = tape.gradient(critic_loss, critic_params) - self.__neural_network.critic_optimizer.apply_gradients( - zip(critic_grads, critic_params) - ) - self.actor_loss_history.append(actor_loss.numpy()) - self.critic_loss_history.append(critic_loss.numpy()) - self.reward_history.append(sum(rewards)) + # Save the rewards received + self.reward_history.append(sum(rewards)) - # saving logs in a CSV file - self.save_logs_to_csv() + # saving logs in a CSV file + self.save_logs_to_csv() def save_logs_to_csv(self): """Function for saving logs in a CSV file""" # Ensure the directory exists - log_dir = "logs" + log_dir = DIRECTORY os.makedirs(log_dir, exist_ok=True) - log_file = os.path.join(log_dir, "training_logs.csv") + log_file = os.path.join(log_dir, FILE_DIRECTORY) with open(log_file, mode="w", encoding="utf-8", newline="") as file: writer = csv.writer(file) @@ -425,8 +377,8 @@ def save_logs_to_csv(self): for indx, reward in enumerate(self.reward_history): writer.writerow( [ - self.actor_loss_history[indx], - self.critic_loss_history[indx], + self.__neural_network.actor_loss_history[indx], + self.__neural_network.critic_loss_history[indx], reward, ] ) @@ -469,14 +421,14 @@ def perform_training(self): self.__current_batch = None self.done = False self.__memory.clear_memory() - self.state = "IDLE" + self.state = IDLE self.num_episodes += 1 cmd_payload = struct.pack("B", CMD_ID_SET_READY_STATE) - self.data_sent = self.__serialmux.send_data("CMD", cmd_payload) + self.data_sent = self.__serialmux.send_data(COMMAND_CHANNEL_NAME, cmd_payload) # Failed to send data. 
Appends the data to unsent_data List if self.data_sent is False: - self.unsent_data.append(("CMD", cmd_payload)) + self.unsent_data.append((COMMAND_CHANNEL_NAME, cmd_payload)) # Minimize standard deviation until the minimum standard deviation is reached self.__std_dev = self.__std_dev * STD_DEV_FACTOR @@ -490,8 +442,8 @@ def reinitialize(self, robot_node): ---------- robot_node: The Robot interface """ - trans_field = robot_node.getField("translation") - rot_field = robot_node.getField("rotation") + trans_field = robot_node.getField(TRANSLATION_FIELD) + rot_field = robot_node.getField(ROTATION_FIELD) initial_position = POSITION_DATA initial_orientation = ORIENTATION_DATA trans_field.setSFVec3f(initial_position) diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index 1535330c..9deee19b 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -28,35 +28,43 @@ # Imports ################################################################################ -from tensorflow import keras # pylint: disable=import-error -from keras import layers # pylint: disable=import-error -from keras.regularizers import l2 # pylint: disable=import-error +import tensorflow as tf +import tensorflow_probability as tfp +from tensorflow import keras +from keras import layers +from keras.regularizers import l2 ################################################################################ # Variables ################################################################################ +# Constants +NUM_SENSORS = 5 # Assuming 5 sensor inputs ################################################################################ # Classes ################################################################################ -class Models: +class Models: # pylint: disable=too-many-instance-attributes """Class for building networks of actors and critics.""" - def __init__(self, actor_alpha, critic_alpha): + def __init__(self, actor_alpha, critic_alpha, std_dev, policy_clip): self.__actor_learning_rate = actor_alpha self.__critic_learning_rate = critic_alpha self.actor_network = self.build_actor_network() - self.critic_network = self.build_critic_network() + self.critic_network = self.build_actor_network() + self.std_dev = std_dev + self.policy_clip = policy_clip self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate) self.critic_optimizer = keras.optimizers.Adam(self.__critic_learning_rate) + self.critic_loss_history = [] + self.actor_loss_history = [] def build_actor_network(self): """Build Actor Network.""" - state_input = layers.Input(shape=(5,)) # Assuming 5 sensor inputs + state_input = layers.Input(shape=(NUM_SENSORS,)) fc1 = layers.Dense( 64, activation="relu", @@ -90,7 +98,7 @@ def build_actor_network(self): def build_critic_network(self): """Build Critic Network""" - state_input = layers.Input(shape=(5,)) # Assuming 5 sensor inputs + state_input = layers.Input(shape=(NUM_SENSORS,)) fc1 = layers.Dense( 64, activation="relu", @@ -116,6 +124,115 @@ def build_critic_network(self): return keras.models.Model(inputs=state_input, outputs=value) + def compute_critic_gradient(self, states, values, advantages): + """ optimize Critic Network weights. + + Parameters + ---------- + states: The saved states observed during interactions with the environment. + values: The saved estimated values of the observed states. + advantages: the computed advantage values for each state in a given Data size. 
+ """ + + with tf.GradientTape() as tape: + + # The critical value represents the expected return from state 𝑠𝑡. + # It provides an estimate of how good it is to be in a given state. + critic_value = self.critic_network(states) + + # the total discounted reward accumulated from time step 𝑡 + estimate_returns = advantages + values + + # Generate loss + critic_loss = tf.math.reduce_mean( + tf.math.pow(estimate_returns - critic_value, 2) + ) + # calculate gradient + critic_params = self.critic_network.trainable_variables + critic_grads = tape.gradient(critic_loss, critic_params) + self.critic_optimizer.apply_gradients( + zip(critic_grads, critic_params) + ) + + # save the critic Loss + self.critic_loss_history.append(critic_loss.numpy()) + + def calculate_adjusted_log_probability(self, states, actions): + """ The function computes the logarithmic probability of a given action. + + Parameters + ---------- + states: The saved states observed during interactions with the environment. + values: The saved estimated values of the observed states. + + Returns + ---------- + A TensorFlow tensor with the data type float32: The logarithmic probability + of a given action + + """ + # Forward pass through the actor network to get the action mean + predict_mean = self.actor_network(states) + + # Create the normal distribution with the predicted mean + new_dist = tfp.distributions.Normal(predict_mean, self.std_dev) + + # Invert the tanh transformation to recover the original actions before tanh + untransformed_actions = tf.atanh(actions) + + new_log_prob = new_dist.log_prob(untransformed_actions) + + # Compute the log of the Jacobian for the tanh transformation + # adding 1e-6 ensures that the Value remains stable and avoids potential issues + # during computation + jacobian_log_det = tf.math.log(1 - tf.square(actions) + 1e-6) + + adjusted_log_prob = new_log_prob - jacobian_log_det + + return adjusted_log_prob + + def compute_actor_gradient(self, states, actions, old_probs, advantages): + """ optimize Actor Network weights. + + Parameters + ---------- + states: The saved states observed during interactions with the environment. + actions: The saved actions taken in response to the observed states. + old_probs: The saved probabilities of the actions taken, based on the previous policy. + values: The saved estimated values of the observed states. + advantages: the computed advantage values for each state in a given Data size. + """ + + with tf.GradientTape() as tape: + + adjusted_new_log_prob = self.calculate_adjusted_log_probability(states, actions) + + # The ratio between the new model and the old model’s action log probabilities + prob_ratio = tf.exp(adjusted_new_log_prob - old_probs) + + # If the ratio is too large or too small, it will be + # clipped according to the surrogate function. 
+ weighted_probs = prob_ratio * advantages + clipped_probs = tf.clip_by_value( + prob_ratio, 1 - self.policy_clip, 1 + self.policy_clip + ) + weighted_clipped_probs = clipped_probs * advantages + + # Policy Gradient Loss + actor_loss = -tf.reduce_mean( + tf.minimum(weighted_probs, weighted_clipped_probs) + ) + + # calculate gradient + actor_params = self.actor_network.trainable_variables + actor_grads = tape.gradient(actor_loss, actor_params) + self.actor_optimizer.apply_gradients( + zip(actor_grads, actor_params) + ) + + # save the Actor Loss + self.actor_loss_history.append(actor_loss.numpy()) + ################################################################################ # Functions diff --git a/webots/controllers/RL_Supervisor/rl_supervisor.py b/webots/controllers/RL_Supervisor/rl_supervisor.py index b620956a..e257474c 100644 --- a/webots/controllers/RL_Supervisor/rl_supervisor.py +++ b/webots/controllers/RL_Supervisor/rl_supervisor.py @@ -68,6 +68,10 @@ SENSOR_ID_MOST_LEFT = 0 SENSOR_ID_MOST_RIGHT = 4 +IDLE = "IDLE_STATE" +READY = "READY_STATE" +TRAINING = "TRAINING_STATE" + # Path of saved models PATH = "models/" @@ -76,7 +80,7 @@ ################################################################################ -class RobotController: # pylint: disable=too-many-instance-attributes +class RobotController: """Class for data flow control logic.""" def __init__(self, smp_server, tick_size, agent): @@ -86,7 +90,6 @@ def __init__(self, smp_server, tick_size, agent): self.__no_line_detection_count = 0 self.__timestamp = 0 # Elapsed time since reset [ms] self.last_sensor_data = None - self.start_stop_line_detected = False self.steps = 0 def callback_status(self, payload: bytearray) -> None: @@ -102,6 +105,7 @@ def callback_line_sensors(self, payload: bytearray) -> None: sensor_data = struct.unpack("5H", payload) self.steps += 1 + is_start_stop_line_detected = False # Determine lost line condition if all(value == 0 for value in sensor_data): self.__no_line_detection_count += 1 @@ -111,19 +115,18 @@ def callback_line_sensors(self, payload: bytearray) -> None: # Detect start/stop line if ((sensor_data[SENSOR_ID_MOST_LEFT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE) and (sensor_data[SENSOR_ID_MOST_RIGHT] >= LINE_SENSOR_ON_TRACK_MIN_VALUE)): - self.start_stop_line_detected = True + is_start_stop_line_detected = True # Detect Start/Stop Line before Finish Trajectories - if (self.start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS): + if (is_start_stop_line_detected is True) and (self.steps < MIN_NUMBER_OF_STEPS): sensor_data = list(sensor_data) - sensor_data[SENSOR_ID_MOST_LEFT] = 0 + sensor_data[SENSOR_ID_MOST_LEFT] = 0 sensor_data[SENSOR_ID_MOST_RIGHT] = 0 - self.start_stop_line_detected = False + is_start_stop_line_detected = False # sequence stop criterion debounce no line detection and start/stop line detected - if self.__no_line_detection_count >= 30 or ( - (self.start_stop_line_detected is True) and (self.steps >= MIN_NUMBER_OF_STEPS) - ): + if ((self.__no_line_detection_count >= 30) or ((is_start_stop_line_detected is True) + and (self.steps >= MIN_NUMBER_OF_STEPS))): self.__agent.done = True self.__no_line_detection_count = 0 self.steps = 0 @@ -139,9 +142,8 @@ def callback_line_sensors(self, payload: bytearray) -> None: # Start storage The data after the second received sensor data if self.last_sensor_data is not None: - normalized_sensor_data = self.__agent.normalize_sensor_data(self.last_sensor_data) self.__agent.store_transition( - normalized_sensor_data, + 
self.last_sensor_data, self.__agent.action, self.__agent.adjusted_log_prob, self.__agent.value, @@ -151,7 +153,7 @@ def callback_line_sensors(self, payload: bytearray) -> None: self.last_sensor_data = sensor_data # Sends the motor speeds to the robot. - if self.__agent.done is False and self.__agent.state == "READY": + if self.__agent.done is False and self.__agent.state == READY: self.__agent.send_motor_speeds(sensor_data) def callback_mode(self, payload: bytearray) -> None: @@ -191,6 +193,20 @@ def process(self): # process new data (callbacks will be executed) self.__smp_server.process(self.__timestamp) + def manage_agent_cycle(self,robot_node): + """The function controls agent behavior""" + if self.__agent.state == READY: + self.__agent.update(robot_node) + + # Start the training + elif self.__agent.state == TRAINING: + self.last_sensor_data = None + self.__agent.perform_training() + + # save model + if (self.__agent.num_episodes > 1) and (self.__agent.num_episodes % 50 == 0): + self.__agent.save_models() + ################################################################################ # Functions @@ -198,7 +214,6 @@ def process(self): # pylint: disable=duplicate-code -# pylint: disable=too-many-branches # pylint: disable=too-many-statements def main_loop(): """Main loop: @@ -275,20 +290,7 @@ def main_loop(): while supervisor.step(timestep) != -1: controller.process() - if agent.state == "READY": - agent.update(robot_node) - - # Start the training - elif agent.state == "TRAINING": - supervisor.last_sensor_data = None - agent.perform_training() - - print(f"#{agent.num_episodes} actor loss: {agent.actor_loss_history[-1]:.4f}," - f"critic loss: {agent.critic_loss_history[-1]:.4f}") - - # save model - if (agent.num_episodes > 1) and (agent.num_episodes % 50 == 0): - agent.save_models() + controller.manage_agent_cycle(robot_node) # Resent any unsent Data if agent.unsent_data: From 437d25cf5fd805737e7174cb96602229311c4c4d Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 9 Sep 2024 17:19:35 +0200 Subject: [PATCH 25/37] remove unnecessary pylint disable comment from trajectory_buffer --- webots/controllers/RL_Supervisor/trajectory_buffer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index 151dcf52..f39824d8 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -28,7 +28,7 @@ # Imports ################################################################################ -import numpy as np # pylint: disable=import-error +import numpy as np ################################################################################ # Variables @@ -70,7 +70,7 @@ def generate_batches(self): Numpy-Array: Probs Numpy-Array: Vals Numpy-Array: Rewards - Numpy-Array: dones + Numpy-Array: Advantages List: Batches """ From 96c0b4956699b8c10dbf1c7ad94304bd197bac44 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 9 Sep 2024 17:34:07 +0200 Subject: [PATCH 26/37] add pylint disable for import-error due to Python 3.9/3.10 library compatibility --- webots/controllers/RL_Supervisor/agent.py | 6 +++--- webots/controllers/RL_Supervisor/networks.py | 10 +++++----- webots/controllers/RL_Supervisor/trajectory_buffer.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 20ef40ac..b65dc0df 100644 --- 
a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -31,9 +31,9 @@ import csv import os import struct -import numpy as np -import tensorflow as tf -import tensorflow_probability as tfp +import numpy as np # pylint: disable=import-error +import tensorflow as tf # pylint: disable=import-error +import tensorflow_probability as tfp # pylint: disable=import-error from trajectory_buffer import Memory from networks import Models diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index 9deee19b..65d67b89 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -28,11 +28,11 @@ # Imports ################################################################################ -import tensorflow as tf -import tensorflow_probability as tfp -from tensorflow import keras -from keras import layers -from keras.regularizers import l2 +import tensorflow as tf # pylint: disable=import-error +import tensorflow_probability as tfp # pylint: disable=import-error +from tensorflow import keras # pylint: disable=import-error +from keras import layers # pylint: disable=import-error +from keras.regularizers import l2 # pylint: disable=import-error ################################################################################ # Variables diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index f39824d8..3a244b28 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -28,7 +28,7 @@ # Imports ################################################################################ -import numpy as np +import numpy as np # pylint: disable=import-error ################################################################################ # Variables From f496711b427cad1f2f7492f00311682222996bd8 Mon Sep 17 00:00:00 2001 From: Akram Date: Mon, 9 Sep 2024 23:06:25 +0200 Subject: [PATCH 27/37] corrected incorrect variable initialization --- webots/controllers/RL_Supervisor/networks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index 65d67b89..7895eb76 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -53,7 +53,7 @@ def __init__(self, actor_alpha, critic_alpha, std_dev, policy_clip): self.__actor_learning_rate = actor_alpha self.__critic_learning_rate = critic_alpha self.actor_network = self.build_actor_network() - self.critic_network = self.build_actor_network() + self.critic_network = self.build_critic_network() self.std_dev = std_dev self.policy_clip = policy_clip self.actor_optimizer = keras.optimizers.Adam(self.__actor_learning_rate) From 6537e80f85c86c025b4264148db28cfe0ef654e6 Mon Sep 17 00:00:00 2001 From: Gabryel Reyes Date: Tue, 10 Sep 2024 09:54:53 +0200 Subject: [PATCH 28/37] Use a requirements file to cache pip --- .github/workflows/main.yml | 8 ++++---- .github/workflows/requirements.txt | 10 ++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/requirements.txt diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c0652cd0..a0fcdcba 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -209,11 +209,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ 
matrix.python-version }} + cache: 'pip' + cache-dependency-path: .github/workflows/requirements.txt - name: Install dependencies - run: | - pip install pylint "git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt" + run: pip install -r .github/workflows/requirements.txt - name: Analysing the code with pylint - run: | - pylint ./webots/controllers/*/*.py + run: pylint ./webots/controllers/*/*.py diff --git a/.github/workflows/requirements.txt b/.github/workflows/requirements.txt new file mode 100644 index 00000000..26d7047e --- /dev/null +++ b/.github/workflows/requirements.txt @@ -0,0 +1,10 @@ +platformio==6.1.15 +numpy==1.26.4 +tensorflow==2.10.0 +tensorflow-probability==0.15.0 +keras==2.10.0 +matplotlib==3.9.0 +pandas==2.2.2 +pytest==8.3.2 +pylint==3.2.7 +git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt \ No newline at end of file From 51bb66a117999369b35dba6383de22ea5561d00f Mon Sep 17 00:00:00 2001 From: Gabryel Reyes Date: Tue, 10 Sep 2024 10:01:19 +0200 Subject: [PATCH 29/37] Limit python linting to 3.9 and 3.11 --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a0fcdcba..b258d3eb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -198,7 +198,7 @@ jobs: needs: intro strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10"] # Steps represent a sequence of tasks that will be executed as part of the job. steps: From 6463269d19525fdab440c78c6b5733430d96870f Mon Sep 17 00:00:00 2001 From: Gabryel Reyes Date: Tue, 10 Sep 2024 10:04:46 +0200 Subject: [PATCH 30/37] Remove disable pylint import errors --- webots/controllers/RL_Supervisor/agent.py | 10 +++++----- webots/controllers/RL_Supervisor/networks.py | 10 +++++----- webots/controllers/RL_Supervisor/plotting.py | 4 ++-- webots/controllers/RL_Supervisor/trajectory_buffer.py | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index b65dc0df..45769c84 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -31,11 +31,11 @@ import csv import os import struct -import numpy as np # pylint: disable=import-error -import tensorflow as tf # pylint: disable=import-error -import tensorflow_probability as tfp # pylint: disable=import-error -from trajectory_buffer import Memory -from networks import Models +import numpy as np +import tensorflow as tf +import tensorflow_probability as tfp +from trajectory_buffer import Memory +from networks import Models ################################################################################ # Variables diff --git a/webots/controllers/RL_Supervisor/networks.py b/webots/controllers/RL_Supervisor/networks.py index 7895eb76..064995f0 100644 --- a/webots/controllers/RL_Supervisor/networks.py +++ b/webots/controllers/RL_Supervisor/networks.py @@ -28,11 +28,11 @@ # Imports ################################################################################ -import tensorflow as tf # pylint: disable=import-error -import tensorflow_probability as tfp # pylint: disable=import-error -from tensorflow import keras # pylint: disable=import-error -from keras import layers # pylint: disable=import-error -from keras.regularizers import l2 # pylint: disable=import-error +import tensorflow as tf +import 
tensorflow_probability as tfp +from tensorflow import keras +from keras import layers +from keras.regularizers import l2 ################################################################################ # Variables diff --git a/webots/controllers/RL_Supervisor/plotting.py b/webots/controllers/RL_Supervisor/plotting.py index 8efb92b3..2b62ba90 100644 --- a/webots/controllers/RL_Supervisor/plotting.py +++ b/webots/controllers/RL_Supervisor/plotting.py @@ -6,8 +6,8 @@ # mini-batch steps. # Imports -import matplotlib.pyplot as plt # pylint: disable=import-error -import pandas as pd # pylint: disable=import-error +import matplotlib.pyplot as plt +import pandas as pd # Define the path to the CSV file LOG_FILE = "logs/training_logs.csv" diff --git a/webots/controllers/RL_Supervisor/trajectory_buffer.py b/webots/controllers/RL_Supervisor/trajectory_buffer.py index 3a244b28..f39824d8 100644 --- a/webots/controllers/RL_Supervisor/trajectory_buffer.py +++ b/webots/controllers/RL_Supervisor/trajectory_buffer.py @@ -28,7 +28,7 @@ # Imports ################################################################################ -import numpy as np # pylint: disable=import-error +import numpy as np ################################################################################ # Variables From eb7643ca936875223642e3c7fec5c3b93ce325a1 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 10 Sep 2024 13:29:15 +0200 Subject: [PATCH 31/37] Refactored conditional logic by replacing with if-else blocks for better readability --- lib/APPReinforcementLearning/src/App.cpp | 84 ++++++++++++------------ 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index f7a9e725..4833b57b 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -96,9 +96,7 @@ void App::loop() { Board::getInstance().process(); Speedometer::getInstance().process(); - bool isSensorDataSent = true; - bool isModeDataSent = true; - bool isStatusDataSent = true; + bool isDataSent = true; if (true == m_controlInterval.isTimeout()) { @@ -127,55 +125,57 @@ void App::loop() payload = {SMPChannelPayload::Status::DONE}; } - isStatusDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); + isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); m_statusTimer.restart(); } - /* Send periodically line sensor data. */ - if (true == m_sendLineSensorsDataInterval.isTimeout() && - (&DrivingState::getInstance() == m_systemStateMachine.getState())) - { - isSensorDataSent = sendLineSensorsData(); - - m_sendLineSensorsDataInterval.restart(); - } - - /* Send Mode selected to The Supervisor. */ - if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent)) - { - uint8_t mode_options = ReadyState::getInstance().getSelectedMode(); - - if (0U < mode_options) - { - SMPChannelPayload::Mode payload = - (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; - - isModeDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); - - m_modeSelectionSent = true; - } - } - - if (false == isSensorDataSent) - { - /* Failed to send Senssor data to the supervisor. Go to error state. 
*/ - ErrorState::getInstance().setErrorMsg("SEND_SF"); - m_systemStateMachine.setState(&ErrorState::getInstance()); - } - - if (false == isStatusDataSent) + if (false == isDataSent) { /* Failed to send Status data to the supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("SD_SF"); m_systemStateMachine.setState(&ErrorState::getInstance()); } - - if (false == isModeDataSent) + else { - /* Failed to send Mode data to the supervisor. Go to error state. */ - ErrorState::getInstance().setErrorMsg("MD_SF"); - m_systemStateMachine.setState(&ErrorState::getInstance()); + /* Send periodically line sensor data. */ + if (true == m_sendLineSensorsDataInterval.isTimeout() && + (&DrivingState::getInstance() == m_systemStateMachine.getState())) + { + isDataSent = sendLineSensorsData(); + + m_sendLineSensorsDataInterval.restart(); + } + if (false == isDataSent) + { + /* Failed to send Sensor data to the supervisor. Go to error state. */ + ErrorState::getInstance().setErrorMsg("SEND_SF"); + m_systemStateMachine.setState(&ErrorState::getInstance()); + } + else + { + /* Send Mode selected to The Supervisor. */ + if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent)) + { + uint8_t mode_options = ReadyState::getInstance().getSelectedMode(); + + if (0U < mode_options) + { + SMPChannelPayload::Mode payload = + (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; + + isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); + + m_modeSelectionSent = true; + } + } + if (false == isDataSent) + { + /* Failed to send Mode data to the supervisor. Go to error state. */ + ErrorState::getInstance().setErrorMsg("MD_SF"); + m_systemStateMachine.setState(&ErrorState::getInstance()); + } + } } m_smpServer.process(millis()); From 3da1583edc3ddf99bbe2edca2dc59716cc363e0d Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 10 Sep 2024 13:40:15 +0200 Subject: [PATCH 32/37] Refactored conditional logic by replacing with if-else blocks for better readability --- lib/APPReinforcementLearning/src/App.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index 4833b57b..06b4bfe9 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -132,7 +132,7 @@ void App::loop() if (false == isDataSent) { - /* Failed to send Status data to the supervisor. Go to error state. */ + /* Failed to send Status data to the Supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("SD_SF"); m_systemStateMachine.setState(&ErrorState::getInstance()); } @@ -148,7 +148,7 @@ void App::loop() } if (false == isDataSent) { - /* Failed to send Sensor data to the supervisor. Go to error state. */ + /* Failed to send Sensor data to the Supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("SEND_SF"); m_systemStateMachine.setState(&ErrorState::getInstance()); } @@ -171,7 +171,7 @@ void App::loop() } if (false == isDataSent) { - /* Failed to send Mode data to the supervisor. Go to error state. */ + /* Failed to send Mode data to the Supervisor. Go to error state. 
*/ ErrorState::getInstance().setErrorMsg("MD_SF"); m_systemStateMachine.setState(&ErrorState::getInstance()); } From 736d4e88580d8467559ac33622e645f7868ce587 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 10 Sep 2024 15:42:03 +0200 Subject: [PATCH 33/37] Add requiremen.txt for python --- webots/controllers/RL_Supervisor/requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 webots/controllers/RL_Supervisor/requirements.txt diff --git a/webots/controllers/RL_Supervisor/requirements.txt b/webots/controllers/RL_Supervisor/requirements.txt new file mode 100644 index 00000000..f6c65e91 --- /dev/null +++ b/webots/controllers/RL_Supervisor/requirements.txt @@ -0,0 +1,5 @@ +tensorflow +tensorflow-probability +tf-keras + +git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt \ No newline at end of file From 9a0a78b1e8cfafd0d32017f7f6e96bb0958fcc82 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 10 Sep 2024 16:36:18 +0200 Subject: [PATCH 34/37] add fixed package versions in requirements.txt --- webots/controllers/RL_Supervisor/requirements.txt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/webots/controllers/RL_Supervisor/requirements.txt b/webots/controllers/RL_Supervisor/requirements.txt index f6c65e91..478f3dd3 100644 --- a/webots/controllers/RL_Supervisor/requirements.txt +++ b/webots/controllers/RL_Supervisor/requirements.txt @@ -1,5 +1,4 @@ -tensorflow -tensorflow-probability -tf-keras - +numpy==1.26.4 +tensorflow==2.10.0 +tensorflow-probability==0.15.0 git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt \ No newline at end of file From fc76987cb64f1501f59700445a4fd652cfed5ae1 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 10 Sep 2024 16:53:43 +0200 Subject: [PATCH 35/37] Add more dependencies to requirements.txt Python --- webots/controllers/RL_Supervisor/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/webots/controllers/RL_Supervisor/requirements.txt b/webots/controllers/RL_Supervisor/requirements.txt index 478f3dd3..4b1bc18e 100644 --- a/webots/controllers/RL_Supervisor/requirements.txt +++ b/webots/controllers/RL_Supervisor/requirements.txt @@ -1,4 +1,6 @@ +matplotlib==3.9.0 numpy==1.26.4 +pandas==2.2.2 tensorflow==2.10.0 tensorflow-probability==0.15.0 git+https://github.com/gabryelreyes/SerialMuxProt.git#egg=SerialMuxProt&subdirectory=python/SerialMuxProt \ No newline at end of file From 277885c6e782b78dbc59ff6778955f5c95c3b025 Mon Sep 17 00:00:00 2001 From: Akram Date: Tue, 10 Sep 2024 16:55:09 +0200 Subject: [PATCH 36/37] Add a comment in agent.py --- webots/controllers/RL_Supervisor/agent.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/webots/controllers/RL_Supervisor/agent.py b/webots/controllers/RL_Supervisor/agent.py index 45769c84..82dafa93 100644 --- a/webots/controllers/RL_Supervisor/agent.py +++ b/webots/controllers/RL_Supervisor/agent.py @@ -108,7 +108,8 @@ def __init__( self.__chkpt_dir = chkpt_dir self.train_mode = False self.__top_speed = top_speed - self.__std_dev = 0.05 + self.__std_dev = 0.05 # When training without an existing model this should be + # set to 0.9 manually self.__memory = Memory(batch_size, max_buffer_length, gamma, gae_lambda) self.__neural_network = Models(actor_alpha, critic_alpha, self.__std_dev, policy_clip) self.__training_index = 0 # Track batch index during training From 4ae6987d8e0fe679d72524339ff6232207e13519 Mon Sep 17 00:00:00 2001 From: Akram 
Date: Tue, 10 Sep 2024 17:00:57 +0200 Subject: [PATCH 37/37] refactor: change condition to explicit boolean comparison --- lib/APPReinforcementLearning/src/App.cpp | 62 ++++++++++++------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/lib/APPReinforcementLearning/src/App.cpp b/lib/APPReinforcementLearning/src/App.cpp index 06b4bfe9..005193e2 100644 --- a/lib/APPReinforcementLearning/src/App.cpp +++ b/lib/APPReinforcementLearning/src/App.cpp @@ -125,28 +125,25 @@ void App::loop() payload = {SMPChannelPayload::Status::DONE}; } - isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload)); + if (false == m_smpServer.sendData(m_serialMuxProtChannelIdStatus, &payload, sizeof(payload))) + { + /* Failed to send Status data to the Supervisor. Go to error state. */ + ErrorState::getInstance().setErrorMsg("SD_SF"); + m_systemStateMachine.setState(&ErrorState::getInstance()); + } + else + { + m_statusTimer.restart(); + } - m_statusTimer.restart(); + } - if (false == isDataSent) - { - /* Failed to send Status data to the Supervisor. Go to error state. */ - ErrorState::getInstance().setErrorMsg("SD_SF"); - m_systemStateMachine.setState(&ErrorState::getInstance()); - } - else + /* Send periodically line sensor data. */ + if (true == m_sendLineSensorsDataInterval.isTimeout() && + (&DrivingState::getInstance() == m_systemStateMachine.getState())) { - /* Send periodically line sensor data. */ - if (true == m_sendLineSensorsDataInterval.isTimeout() && - (&DrivingState::getInstance() == m_systemStateMachine.getState())) - { - isDataSent = sendLineSensorsData(); - - m_sendLineSensorsDataInterval.restart(); - } - if (false == isDataSent) + if (false == sendLineSensorsData()) { /* Failed to send Sensor data to the Supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("SEND_SF"); @@ -154,27 +151,30 @@ void App::loop() } else { - /* Send Mode selected to The Supervisor. */ - if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent)) - { - uint8_t mode_options = ReadyState::getInstance().getSelectedMode(); + m_sendLineSensorsDataInterval.restart(); + } + } - if (0U < mode_options) - { - SMPChannelPayload::Mode payload = - (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; + /* Send Mode selected to The Supervisor. */ + if (&ReadyState::getInstance() == m_systemStateMachine.getState() && (false == m_modeSelectionSent)) + { + uint8_t mode_options = ReadyState::getInstance().getSelectedMode(); - isDataSent = m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload)); + if (0U < mode_options) + { + SMPChannelPayload::Mode payload = + (1 == mode_options) ? SMPChannelPayload::Mode::DRIVING_MODE : SMPChannelPayload::Mode::TRAINING_MODE; - m_modeSelectionSent = true; - } - } - if (false == isDataSent) + if(false == m_smpServer.sendData(m_serialMuxProtChannelIdMode, &payload, sizeof(payload))) { /* Failed to send Mode data to the Supervisor. Go to error state. */ ErrorState::getInstance().setErrorMsg("MD_SF"); m_systemStateMachine.setState(&ErrorState::getInstance()); } + else + { + m_modeSelectionSent = true; + } } }
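
The App.cpp changes above converge on a single send-and-check pattern: every sendData() call is compared explicitly against false, and a failed transfer immediately puts the application into its error state with a short error code ("SD_SF", "MD_SF"). The following is a minimal, self-contained C++ sketch of that pattern in isolation. MockServer and MockStateMachine are invented stand-ins used only for illustration; they are not the real SerialMuxProt server, StateMachine, or ErrorState classes, and the payloads are placeholders.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <string>

/** Invented stand-in for the serial multiplexer server; not the real API. */
class MockServer
{
public:
    /** Pretend to transmit a payload; change the return value to false to
     *  simulate a failed transfer.
     */
    bool sendData(const std::string& channel, const void* payload, size_t size)
    {
        (void)payload;
        printf("send on %s (%zu bytes)\n", channel.c_str(), size);
        return true;
    }
};

/** Invented stand-in for the application state machine / error state pair. */
class MockStateMachine
{
public:
    void setErrorState(const std::string& msg)
    {
        printf("-> error state: %s\n", msg.c_str());
        m_isError = true;
    }

    bool isError() const
    {
        return m_isError;
    }

private:
    bool m_isError = false;
};

int main()
{
    MockServer       server;
    MockStateMachine sm;
    const uint8_t    statusPayload = 1U;
    const uint8_t    modePayload   = 0U;

    /* Each send is checked directly with an explicit boolean comparison,
     * as in PATCH 37; a failed transfer switches to the error state.
     */
    if (false == server.sendData("STATUS", &statusPayload, sizeof(statusPayload)))
    {
        sm.setErrorState("SD_SF"); /* Status send failed. */
    }

    if (false == server.sendData("MODE", &modePayload, sizeof(modePayload)))
    {
        sm.setErrorState("MD_SF"); /* Mode send failed. */
    }

    return sm.isError() ? 1 : 0;
}

Compared with the shared isDataSent flag introduced in PATCH 31, checking each call directly as PATCH 37 does keeps every error code next to the send that produced it and drops the intermediate flag.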