Commit b1c2969

multi-processing-PPO
Multiprocessing PPO algorithm

1 parent bf9f33f commit b1c2969

File tree

6 files changed: +527 -0 lines changed

PPO/multi_processing_ppo/PPOModel.py

+271
@@ -0,0 +1,271 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

from core import *


class GlobalNet:
    def __init__(self, state_dim, action_dim):
        """network"""
        self.net_dim = 256
        self.learning_rate = 1e-4
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.act = ActorPPO(state_dim, action_dim, self.net_dim)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)  # betas=(0.5, 0.99)
        self.cri = CriticAdv(state_dim, self.net_dim)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)  # betas=(0.5, 0.99)


class AgentPPO:
    def __init__(self, net):
        max_buffer = 2 ** 11
        self.gamma = 0.99
        self.buffer = BufferTupleOnline(max_buffer)

        self.learning_rate = net.learning_rate
        self.device = net.device
        self.act = net.act.to(self.device)
        self.cri = net.cri.to(self.device)
        self.act.train()
        self.cri.train()
        self.act_optimizer = net.act_optimizer
        self.cri_optimizer = net.cri_optimizer

        self.criterion = nn.SmoothL1Loss()  # a smooth L1 loss, used for the critic

    def select_action(self, states):  # CPU array to GPU tensor to CPU array
        states = torch.tensor(states, dtype=torch.float32, device=self.device)

        a_noise, log_prob = self.act.get__a__log_prob(states)
        a_noise = a_noise.cpu().data.numpy()[0]
        log_prob = log_prob.cpu().data.numpy()[0]
        return a_noise, log_prob  # not tanh()

    def update_buffer(self, env, max_step, reward_scale):
        # collect tuples (reward, mask, state, action, log_prob)
        self.buffer.storage_list = list()  # PPO is an on-policy RL algorithm.
        # PPO (or GAE) should stay on-policy.
        # Don't feed off-policy data to PPO (or GAE): it does not speed up training, it slows it down.

        rewards = list()
        steps = list()

        step_counter = 0
        while step_counter < self.buffer.max_memo:
            state = env.reset()

            reward_sum = 0
            step_sum = 0

            for step_sum in range(max_step):
                # env.render()
                action, log_prob = self.select_action((state,))

                # the Gaussian sample is squashed by tanh only when acting;
                # the stored log_prob refers to the pre-tanh action
                next_state, reward, done, _ = env.step(np.tanh(action))
                reward_sum += reward

                mask = 0.0 if done else self.gamma

                reward_ = reward * reward_scale
                self.buffer.push(reward_, mask, state, action, log_prob)

                if done:
                    break

                state = next_state

            rewards.append(reward_sum)
            steps.append(step_sum)

            step_counter += step_sum
        return np.array(rewards).mean(), steps

    def update_policy(self, batch_size, repeat_times):
        self.act.train()
        self.cri.train()
        clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        lambda_adv = 0.98  # why 0.98? 0.99 does not work well here
        lambda_entropy = 0.01  # could be 0.02
        # repeat_times = 8 could be 2**3 ~ 2**5

        actor_loss = critic_loss = None  # placeholders for the returned losses

        '''the batch for training'''
        max_memo = len(self.buffer)
        all_batch = self.buffer.sample_all()
        all_reward, all_mask, all_state, all_action, all_log_prob = [
            torch.tensor(ary, dtype=torch.float32, device=self.device)
            for ary in (all_batch.reward, all_batch.mask, all_batch.state, all_batch.action, all_batch.log_prob,)
        ]

        # all__new_v = self.cri(all_state).detach_()  # all new values
        with torch.no_grad():
            b_size = 512
            all__new_v = torch.cat(
                [self.cri(all_state[i:i + b_size])
                 for i in range(0, all_state.size()[0], b_size)], dim=0)  # concatenate [tensor1, tensor2, ...] into one long tensor

        '''compute old_v (old policy value), adv_v (advantage value)
        refer: GAE. ICLR 2016. Generalized Advantage Estimation.
        https://arxiv.org/pdf/1506.02438.pdf'''
        all__delta = torch.empty(max_memo, dtype=torch.float32, device=self.device)
        all__old_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # old policy value
        all__adv_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # advantage value

        prev_old_v = 0  # old q value
        prev_new_v = 0  # new q value
        prev_adv_v = 0  # advantage q value
        for i in range(max_memo - 1, -1, -1):
            all__delta[i] = all_reward[i] + all_mask[i] * prev_new_v - all__new_v[i]
            all__old_v[i] = all_reward[i] + all_mask[i] * prev_old_v
            all__adv_v[i] = all__delta[i] + all_mask[i] * prev_adv_v * lambda_adv

            prev_old_v = all__old_v[i]
            prev_new_v = all__new_v[i]
            prev_adv_v = all__adv_v[i]

        all__adv_v = (all__adv_v - all__adv_v.mean()) / (all__adv_v.std() + 1e-5)  # advantage normalization

        '''mini batch sample'''
        sample_times = int(repeat_times * max_memo / batch_size)
        for _ in range(sample_times):
            '''random sample'''
            # indices = rd.choice(max_memo, batch_size, replace=True)  # False)
            indices = rd.randint(max_memo, size=batch_size)

            state = all_state[indices]
            action = all_action[indices]
            advantage = all__adv_v[indices]
            old_value = all__old_v[indices].unsqueeze(1)
            old_log_prob = all_log_prob[indices]

            """Adaptive KL Penalty Coefficient
            loss_KLPEN = surrogate_obj + value_obj * lambda_value + entropy_obj * lambda_entropy
            loss_KLPEN = (value_obj * lambda_value) + (surrogate_obj + entropy_obj * lambda_entropy)
            loss_KLPEN = (critic_loss) + (actor_loss)
            """

            '''critic_loss'''
            new_log_prob = self.act.compute__log_prob(state, action)  # used below for the actor loss
            new_value = self.cri(state)

            critic_loss = (self.criterion(new_value, old_value)) / (old_value.std() + 1e-5)
            self.cri_optimizer.zero_grad()
            critic_loss.backward()
            self.cri_optimizer.step()

            '''actor_loss'''
            # clipped surrogate objective (PPO's clipped version of the TRPO objective)
            ratio = torch.exp(new_log_prob - old_log_prob)
            surrogate_obj0 = advantage * ratio
            surrogate_obj1 = advantage * ratio.clamp(1 - clip, 1 + clip)
            surrogate_obj = -torch.min(surrogate_obj0, surrogate_obj1).mean()
            loss_entropy = (torch.exp(new_log_prob) * new_log_prob).mean()  # policy entropy

            actor_loss = surrogate_obj + loss_entropy * lambda_entropy
            self.act_optimizer.zero_grad()
            actor_loss.backward()
            self.act_optimizer.step()

        self.act.eval()
        self.cri.eval()
        return actor_loss.item(), critic_loss.item()
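
    # The backward loop above implements the GAE recursion from the paper linked in the docstring:
    #     delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    #     A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    # mask_t already stores gamma * (1 - done_t) (0.0 on terminal steps, gamma otherwise), so a single
    # pass over the buffer yields both the advantages and, via all__old_v, the discounted returns
    # used as the critic's regression target.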

    def update_policy_mp(self, batch_size, repeat_times, buffer_total):
        # buffer_total: a list with one entry per worker process; each entry is indexable as
        # (rewards, masks, states, actions, log_probs), e.g. the fields of BufferTupleOnline.sample_all()
        self.act.train()
        self.cri.train()
        clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        lambda_adv = 0.98  # why 0.98? 0.99 does not work well here
        lambda_entropy = 0.01  # could be 0.02
        # repeat_times = 8 could be 2**3 ~ 2**5

        actor_loss = critic_loss = None  # placeholders for the returned losses

        '''the batch for training: merge the buffers collected by all worker processes'''
        [r, m, s, a, log] = [tuple() for _ in range(5)]
        max_memo = 0
        for buffer in buffer_total:
            max_memo += len(buffer[0])
            r += buffer[0]
            m += buffer[1]
            s += buffer[2]
            a += buffer[3]
            log += buffer[4]
        tran = namedtuple('Transition', ('reward', 'mask', 'state', 'action', 'log_prob'))
        all_batch = tran(r, m, s, a, log)

        all_reward, all_mask, all_state, all_action, all_log_prob = [
            torch.tensor(ary, dtype=torch.float32, device=self.device)
            for ary in (all_batch.reward, all_batch.mask, all_batch.state, all_batch.action, all_batch.log_prob,)
        ]

        # all__new_v = self.cri(all_state).detach_()  # all new values
        with torch.no_grad():
            b_size = 512
            all__new_v = torch.cat(
                [self.cri(all_state[i:i + b_size])
                 for i in range(0, all_state.size()[0], b_size)], dim=0)  # concatenate [tensor1, tensor2, ...] into one long tensor

        '''compute old_v (old policy value), adv_v (advantage value)
        refer: GAE. ICLR 2016. Generalized Advantage Estimation.
        https://arxiv.org/pdf/1506.02438.pdf'''
        all__delta = torch.empty(max_memo, dtype=torch.float32, device=self.device)
        all__old_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # old policy value
        all__adv_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # advantage value

        prev_old_v = 0  # old q value
        prev_new_v = 0  # new q value
        prev_adv_v = 0  # advantage q value
        for i in range(max_memo - 1, -1, -1):
            all__delta[i] = all_reward[i] + all_mask[i] * prev_new_v - all__new_v[i]
            all__old_v[i] = all_reward[i] + all_mask[i] * prev_old_v
            all__adv_v[i] = all__delta[i] + all_mask[i] * prev_adv_v * lambda_adv

            prev_old_v = all__old_v[i]
            prev_new_v = all__new_v[i]
            prev_adv_v = all__adv_v[i]

        all__adv_v = (all__adv_v - all__adv_v.mean()) / (all__adv_v.std() + 1e-5)  # advantage normalization

        '''mini batch sample'''
        sample_times = int(repeat_times * max_memo / batch_size)
        for _ in range(sample_times):
            '''random sample'''
            # indices = rd.choice(max_memo, batch_size, replace=True)  # False)
            indices = rd.randint(max_memo, size=batch_size)

            state = all_state[indices]
            action = all_action[indices]
            advantage = all__adv_v[indices]
            old_value = all__old_v[indices].unsqueeze(1)
            old_log_prob = all_log_prob[indices]

            """Adaptive KL Penalty Coefficient
            loss_KLPEN = surrogate_obj + value_obj * lambda_value + entropy_obj * lambda_entropy
            loss_KLPEN = (value_obj * lambda_value) + (surrogate_obj + entropy_obj * lambda_entropy)
            loss_KLPEN = (critic_loss) + (actor_loss)
            """

            '''critic_loss'''
            new_log_prob = self.act.compute__log_prob(state, action)  # used below for the actor loss
            new_value = self.cri(state)

            critic_loss = (self.criterion(new_value, old_value)) / (old_value.std() + 1e-5)
            self.cri_optimizer.zero_grad()
            critic_loss.backward()
            self.cri_optimizer.step()

            '''actor_loss'''
            # clipped surrogate objective (PPO's clipped version of the TRPO objective)
            ratio = torch.exp(new_log_prob - old_log_prob)
            surrogate_obj0 = advantage * ratio
            surrogate_obj1 = advantage * ratio.clamp(1 - clip, 1 + clip)
            surrogate_obj = -torch.min(surrogate_obj0, surrogate_obj1).mean()
            loss_entropy = (torch.exp(new_log_prob) * new_log_prob).mean()  # policy entropy

            actor_loss = surrogate_obj + loss_entropy * lambda_entropy
            self.act_optimizer.zero_grad()
            actor_loss.backward()
            self.act_optimizer.step()

        self.act.eval()
        self.cri.eval()
        return actor_loss.item(), critic_loss.item()
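
The runner that actually spawns the collector processes is not among the files shown in this extract, so the following is only a rough sketch of how update_policy_mp could be driven: one learner process, several torch.multiprocessing workers sharing the GlobalNet modules, and a queue that carries each worker's on-policy buffer back to the learner. The worker function, process count, environment name and queue hand-off are illustrative assumptions, not code from this commit.

# sketch_runner.py -- illustrative only; assumes CPU training to avoid CUDA/fork issues
import gym
import torch.multiprocessing as mp
from PPOModel import GlobalNet, AgentPPO


def worker(net, queue, env_name, max_step, reward_scale):
    env = gym.make(env_name)
    agent = AgentPPO(net)  # reuses the shared actor/critic from GlobalNet
    agent.update_buffer(env, max_step, reward_scale)
    batch = agent.buffer.sample_all()  # Transition of tuples
    queue.put((batch.reward, batch.mask, batch.state, batch.action, batch.log_prob))


if __name__ == '__main__':
    env_name, n_workers = 'Pendulum-v0', 4  # hypothetical choices
    probe_env = gym.make(env_name)
    state_dim = probe_env.observation_space.shape[0]
    action_dim = probe_env.action_space.shape[0]

    net = GlobalNet(state_dim, action_dim)
    net.act.share_memory()  # let worker processes read the same weights
    net.cri.share_memory()
    learner = AgentPPO(net)

    queue = mp.Queue()
    for _ in range(16):  # training iterations
        workers = [mp.Process(target=worker, args=(net, queue, env_name, 200, 1.0))
                   for _ in range(n_workers)]
        [p.start() for p in workers]
        buffer_total = [queue.get() for _ in range(n_workers)]  # one entry per worker, as expected by update_policy_mp
        [p.join() for p in workers]
        actor_loss, critic_loss = learner.update_policy_mp(2 ** 8, 2 ** 4, buffer_total)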
Binary file not shown.
Binary file not shown.

PPO/multi_processing_ppo/core.py

+98
@@ -0,0 +1,98 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
import numpy.random as rd
import torch
import torch.nn as nn
from collections import namedtuple


def layer_norm(layer, std=1.0, bias_const=1e-6):
    torch.nn.init.orthogonal_(layer.weight, std)  # orthogonal initialization of the weight tensor
    torch.nn.init.constant_(layer.bias, bias_const)  # constant initialization of the bias


class BufferTupleOnline:
    def __init__(self, max_memo):
        self.max_memo = max_memo
        self.storage_list = list()
        self.transition = namedtuple(
            'Transition',
            # ('state', 'value', 'action', 'log_prob', 'mask', 'next_state', 'reward')
            ('reward', 'mask', 'state', 'action', 'log_prob')
        )

    def push(self, *args):
        self.storage_list.append(self.transition(*args))

    def extend_memo(self, storage_list):
        self.storage_list.extend(storage_list)

    def sample_all(self):
        return self.transition(*zip(*self.storage_list))

    def __len__(self):
        return len(self.storage_list)

    def update_pointer_before_sample(self):
        pass  # kept for interface compatibility


class ActorPPO(nn.Module):
    def __init__(self, state_dim, action_dim, mid_dim):
        super().__init__()

        def idx_dim(i):
            return int(8 * 1.5 ** i)

        self.net = nn.Sequential(
            nn.Linear(state_dim, mid_dim), nn.ReLU(),
            nn.Linear(mid_dim, mid_dim), nn.ReLU(),
            nn.Linear(mid_dim, action_dim),
        )

        self.a_std_log = nn.Parameter(torch.zeros(1, action_dim) - 0.5, requires_grad=True)  # trainable (carries gradients) and registered among the model's parameters
        self.constant_log_sqrt_2pi = np.log(np.sqrt(2 * np.pi))

        # layer_norm(self.net__mean[0], std=1.0)
        # layer_norm(self.net__mean[2], std=1.0)
        layer_norm(self.net[-1], std=0.01)  # output layer for action

    def forward(self, s):
        a_mean = self.net(s)
        return a_mean.tanh()

    def get__a__log_prob(self, state):
        a_mean = self.net(state)
        a_std = self.a_std_log.exp()
        a_noise = torch.normal(a_mean, a_std)

        # a_delta = (a_noise - a_mean).pow(2) / (2 * a_std.pow(2))
        a_delta = ((a_noise - a_mean) / a_std).pow(2) / 2
        log_prob = -(a_delta + (self.a_std_log + self.constant_log_sqrt_2pi))
        log_prob = log_prob.sum(1)
        return a_noise, log_prob

    def compute__log_prob(self, state, a_noise):
        a_mean = self.net(state)
        a_std = self.a_std_log.exp()

        a_delta = ((a_noise - a_mean) / a_std).pow(2) / 2
        log_prob = -(a_delta + (self.a_std_log + self.constant_log_sqrt_2pi))
        return log_prob.sum(1)
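
    # Both log-prob methods above evaluate the diagonal Gaussian density
    #     log N(a | mu, sigma) = -(a - mu)^2 / (2 * sigma^2) - log(sigma) - log(sqrt(2 * pi)),
    # summed over action dimensions. a_std_log is the state-independent, learned log(sigma);
    # the constant log(sqrt(2*pi)) term cancels in the PPO ratio exp(new_log_prob - old_log_prob)
    # but is kept so the returned value is a proper log-density.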

class CriticAdv(nn.Module):  # 2020-05-05 fix bug
    def __init__(self, state_dim, mid_dim):
        super().__init__()

        def idx_dim(i):
            return int(8 * 1.5 ** i)

        self.net = nn.Sequential(
            nn.Linear(state_dim, mid_dim), nn.ReLU(),
            nn.Linear(mid_dim, mid_dim), nn.ReLU(),
            nn.Linear(mid_dim, 1),
        )

        # layer_norm(self.net[0], std=1.0)
        # layer_norm(self.net[2], std=1.0)
        layer_norm(self.net[-1], std=1.0)  # output layer for the state value

    def forward(self, s):
        q = self.net(s)
        return q
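
For reference, a minimal, self-contained sketch of how the building blocks in core.py fit together, using made-up dimensions and values; it is not part of the commit.

# quick smoke test for core.py -- illustrative values only
import torch
from core import ActorPPO, CriticAdv, BufferTupleOnline

state_dim, action_dim, mid_dim = 3, 1, 256
act = ActorPPO(state_dim, action_dim, mid_dim)
cri = CriticAdv(state_dim, mid_dim)
buffer = BufferTupleOnline(max_memo=2 ** 11)

state = torch.rand(1, state_dim)
a_noise, log_prob = act.get__a__log_prob(state)  # pre-tanh Gaussian action and its log-prob
value = cri(state)                               # state value V(s)

buffer.push(1.0, 0.99, state.numpy()[0], a_noise.detach().numpy()[0], log_prob.detach().numpy()[0])
batch = buffer.sample_all()                      # Transition of tuples, as consumed by AgentPPO
print(len(buffer), batch.reward, value.shape)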

0 commit comments