# testrun_v5_5k.py
import numpy as np
import gym
import tensorflow as tf
from collections import deque
import random
import time
import matplotlib.pyplot as plt
import csv
from PIL import Image
# Hyperparameters
NUM_EPISODES = 5000
MAX_NUM_TIMESTEPS = 1000
MEMORY_SIZE = 50000
BATCH_SIZE = 128
INITIAL_EPSILON = 1.0
MIN_EPSILON = 0.1
EPSILON_DECAY = 0.995
GAMMA = 0.99
NUM_P_AV = 10
NUM_STEPS_FOR_UPDATE = 4
MIN_REPLAY_SIZE = 2000
# Create the Breakout environment
env = gym.make('ALE/Breakout-v5')
# Define the Q-Network
q_network = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 1)),
    tf.keras.layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu'),
    tf.keras.layers.Conv2D(64, (3, 3), strides=(1, 1), activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(env.action_space.n, activation='linear')
])
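# Note: the convolutional stack above follows the layer sizes used in the
# original DQN Atari agent (32 8x8/4, 64 4x4/2, 64 3x3/1, then a 512-unit
# dense layer), applied here to a single 84x84 grayscale frame rather than
# the usual stack of four frames.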
# Define the target network
target_q_network = tf.keras.models.clone_model(q_network)
target_q_network.set_weights(q_network.get_weights())
# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00005)
# Experience replay buffer
memory_buffer = deque(maxlen=MEMORY_SIZE)
def preprocess_state(state):
    """Convert a raw frame to grayscale, resize to 84x84, and normalize using PIL."""
    if isinstance(state, tuple):  # Handle if the state is a tuple (e.g. (observation, info))
        state = state[0]          # Extract the actual image
    image = Image.fromarray(state)              # Convert to PIL Image
    gray_image = image.convert('L')             # Convert to grayscale
    gray_resized = gray_image.resize((84, 84))  # Resize to 84x84
    gray_normalized = np.array(gray_resized, dtype=np.float32) / 255.0  # Normalize to [0, 1]
    return np.expand_dims(gray_normalized, axis=-1)  # Add channel dimension -> (84, 84, 1)
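# Quick sanity sketch (commented out; assumes the env created above):
#   obs = env.reset()                    # (observation, info) in the newer gym API
#   frame = preprocess_state(obs)
#   assert frame.shape == (84, 84, 1) and frame.dtype == np.float32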
def get_action(q_values, epsilon):
    """Epsilon-greedy action selection."""
    if np.random.rand() < epsilon:
        return env.action_space.sample()  # Explore
    else:
        return np.argmax(q_values)        # Exploit
def compute_loss(experiences, gamma, q_network, target_q_network):
    """Calculate the TD loss for a batch of experiences."""
    states, actions, rewards, next_states, done_vals = experiences
    # Max Q-value over actions for each next state, from the target network
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=1)
    # Bellman targets; terminal transitions keep only the immediate reward
    y_targets = rewards + (gamma * max_qsa * (1 - done_vals))
    # Q-values predicted by the online network for the actions actually taken
    q_values = q_network(states)
    indices = tf.stack([tf.range(BATCH_SIZE), actions], axis=1)
    q_values = tf.gather_nd(q_values, indices)
    loss = tf.reduce_mean(tf.square(y_targets - q_values))  # Mean squared error
    return loss
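# Worked example of the target above (gamma = 0.99, single transition):
#   reward = 1.0, max_a' Q_target(s', a') = 2.5, done = 0
#   y = 1.0 + 0.99 * 2.5 * (1 - 0) = 3.475
# For a terminal transition (done = 1) the target reduces to the reward alone.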
def agent_learn(experiences, gamma):
    """Update the weights of the online Q-network with one gradient step."""
    with tf.GradientTape() as tape:
        loss = compute_loss(experiences, gamma, q_network, target_q_network)
    gradients = tape.gradient(loss, q_network.trainable_variables)
    optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))
    return loss
def update_target_network():
    """Copy the weights from the Q-network to the target network."""
    target_q_network.set_weights(q_network.get_weights())
def process_batch(experiences):
    """Unpack a sampled batch and convert each component to a numpy array."""
    states, actions, rewards, next_states, done_vals = zip(*experiences)
    states = np.array(states, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    rewards = np.array(rewards, dtype=np.float32)
    next_states = np.array(next_states, dtype=np.float32)
    done_vals = np.array(done_vals, dtype=np.float32)
    return states, actions, rewards, next_states, done_vals
# Training loop
start = time.time()
reward_data = []
loss_data = []
epsilon = INITIAL_EPSILON
# Save training data to a CSV file
with open('training_data_v5_5k.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Episode', 'Total Rewards', 'Loss'])

    for episode in range(NUM_EPISODES):
        state = env.reset()
        state = preprocess_state(state)
        total_points = 0

        for t in range(MAX_NUM_TIMESTEPS):
            state_qn = np.expand_dims(state, axis=0)  # Add a batch dimension for the model
            q_values = q_network(state_qn)            # Get Q-values for the current state
            action = get_action(q_values, epsilon)    # Epsilon-greedy action

            # Step the environment. The newer gym / ALE API returns
            # (obs, reward, terminated, truncated, info).
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            next_state = preprocess_state(next_state)
            memory_buffer.append((state, action, reward, next_state, float(done)))
            total_points += reward

            # Only start training once the memory buffer has enough samples
            if len(memory_buffer) >= MIN_REPLAY_SIZE and t % NUM_STEPS_FOR_UPDATE == 0:
                experiences = random.sample(memory_buffer, BATCH_SIZE)
                batch = process_batch(experiences)
                loss = agent_learn(batch, GAMMA)
                loss_data.append(loss.numpy())

                # Update the target network periodically
                if t % (NUM_STEPS_FOR_UPDATE * 10) == 0:
                    update_target_network()

            state = next_state
            if done:
                break

        reward_data.append(total_points)
        epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)
        writer.writerow([episode + 1, total_points, loss_data[-1] if loss_data else 0])

        if (episode + 1) % NUM_P_AV == 0:
            avg_points = np.mean(reward_data[-NUM_P_AV:])
            print(f"Episode {episode + 1} - Average Points: {avg_points:.2f}")

        # Save the model if performance improves
        if total_points >= 8.0:
            fname = f"breakout_model_v5_{total_points}_{episode}.h5"
            q_network.save(fname)
tot_time = time.time() - start
print(f"Training completed in {tot_time:.2f} seconds ({tot_time/60:.2f} minutes).")
# Plot Loss and Rewards
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.plot(reward_data)
plt.title('Rewards Over Episodes')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.subplot(1, 2, 2)
plt.plot(loss_data)
plt.title('Loss Over Training Steps')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.show()
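
# --- Optional evaluation sketch (commented out, not part of the training run) ---
# A minimal example of how a saved checkpoint could be reloaded and played
# greedily. The filename is a placeholder and must match a file actually
# produced by the run above.
#
#   model = tf.keras.models.load_model("breakout_model_v5_<score>_<episode>.h5")
#   obs = preprocess_state(env.reset())
#   done = False
#   while not done:
#       q = model(np.expand_dims(obs, axis=0))
#       obs, reward, terminated, truncated, _ = env.step(int(np.argmax(q)))
#       obs = preprocess_state(obs)
#       done = terminated or truncated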