# RL_SAC_main.py
# import argparse
import datetime
# import gym
import numpy as np
import itertools
import torch
# The wildcard import is expected to provide Environment, SAC, and the run
# configuration used below (state0, num_epis, num_prod, num_inj, my_rom,
# latent_dim, u_dim, ROM_RL, Deterministic).
from SAC import *
from torch.utils.tensorboard import SummaryWriter
from replay_memory import ReplayMemory

batch_size = 4        # minibatch size for SAC updates
num_steps = 20        # total number of environment steps to train for
eval_policy = True    # evaluate the policy every 10 episodes
updates_per_step = 1  # gradient updates per environment step

env = Environment(state0, num_epis, num_prod, num_inj, my_rom)

torch.manual_seed(123456)
np.random.seed(123456)
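# Reproducibility sketch (assumes training may run on a GPU): seeding the CUDA
# RNGs as well keeps GPU sampling in step with the CPU seeds above; the call is
# safe even when no GPU is present.
torch.cuda.manual_seed_all(123456)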
# Agent
agent = SAC(latent_dim, u_dim)

# TensorBoard
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
                                                     ROM_RL, Deterministic, "autotune" if False else ""))

# Replay memory (capacity 100, seed 123456)
memory = ReplayMemory(100, 123456)
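# Optional sanity check (a sketch, assuming the custom Environment follows the
# gym-style API the loop below relies on): reset(), step(action) returning
# (next_state, reward, done, info), action_space.sample(), and a
# _max_episode_steps attribute. Failing fast here gives a clearer error than a
# mid-training AttributeError.
for required_attr in ("reset", "step", "action_space", "_max_episode_steps"):
    assert hasattr(env, required_attr), \
        "Environment is missing the expected attribute '{}'".format(required_attr)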
# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if total_numsteps < 100:
            action = env.action_space.sample()  # Sample random action during warm-up
        else:
            action = agent.select_action(state)  # Sample action from policy

        if len(memory) > batch_size:
            # Number of updates per step in environment
            for i in range(updates_per_step):
                # Update parameters of all the networks
                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(memory, batch_size, updates)

                writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)
                writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                writer.add_scalar('entropy_temperature/alpha', alpha, updates)
                updates += 1

        next_state, reward, done, _ = env.step(action)  # Step
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        # Ignore the "done" signal if it comes from hitting the time horizon.
        # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
        mask = 1 if episode_steps == env._max_episode_steps else float(not done)

        memory.push(state, action, reward, next_state, mask)  # Append transition to memory

        state = next_state

    if total_numsteps > num_steps:
        break
    writer.add_scalar('reward/train', episode_reward, i_episode)
    print("Episode: {}, total numsteps: {}, episode steps: {}, reward: {}".format(i_episode, total_numsteps, episode_steps, round(episode_reward, 2)))

    # Periodically evaluate the current policy.
    if i_episode % 10 == 0 and eval_policy:
        avg_reward = 0.
        episodes = 10
        for _ in range(episodes):
            state = env.reset()
            episode_reward = 0
            done = False
            while not done:
                action = agent.select_action(state, evaluate=True)

                next_state, reward, done, _ = env.step(action)
                episode_reward += reward

                state = next_state
            avg_reward += episode_reward
        avg_reward /= episodes

        writer.add_scalar('avg_reward/test', avg_reward, i_episode)

        print("----------------------------------------")
        print("Test Episodes: {}, Avg. Reward: {}".format(episodes, round(avg_reward, 2)))
        print("----------------------------------------")

env.close()
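# Optional wrap-up sketch: flush and close the TensorBoard writer and, if the
# SAC agent exposes a `policy` network (an assumption about the SAC class
# imported above, not something this script guarantees), save its weights for
# later reuse. The output path is illustrative.
writer.flush()
writer.close()
if hasattr(agent, "policy"):
    torch.save(agent.policy.state_dict(), "sac_policy_final.pt")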