Commit 7d88cd7

Update DDPG-HER
1 parent 24f5639 commit 7d88cd7

File tree

8 files changed: +810 −3 lines changed


DDPG/DDPG_spinningup/main.py

+2 −2
@@ -41,9 +41,9 @@
     print('Episode:', episode, 'Reward:%i' % int(ep_reward))
     rewardList.append(ep_reward)
 
-painter = Painter(load_csv=True,load_dir='../DDPG_spinningup_PER/compare.csv')
+painter = Painter(load_csv=True,load_dir='../DDPG_spinningup_HER/HER.csv')
 painter.addData(rewardList,'DDPG')
-painter.saveData(save_dir='../DDPG_spinningup_PER/compare.csv')
+painter.saveData(save_dir='../DDPG_spinningup_HER/HER.csv')
 painter.drawFigure()

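Note: the Painter class used above is defined elsewhere in this repository and is not part of this diff. As a rough reference only, here is a minimal sketch of what such a helper could look like, inferred purely from the calls in main.py (the load_csv/load_dir constructor arguments, addData, saveData, drawFigure); the real implementation may differ.

import os
import pandas as pd
import matplotlib.pyplot as plt

class Painter:
    """Hypothetical sketch of a reward-curve logger, inferred from its usage in main.py."""

    def __init__(self, load_csv=False, load_dir=None):
        # Optionally resume from an existing CSV so several runs can be compared in one figure.
        if load_csv and load_dir is not None and os.path.exists(load_dir):
            self.data = pd.read_csv(load_dir)
        else:
            self.data = pd.DataFrame(columns=['episode', 'reward', 'method'])

    def addData(self, reward_list, method):
        # Append one reward curve, tagged with the method name (e.g. 'DDPG').
        new_rows = pd.DataFrame({'episode': range(len(reward_list)),
                                 'reward': reward_list,
                                 'method': method})
        self.data = pd.concat([self.data, new_rows], ignore_index=True)

    def saveData(self, save_dir):
        self.data.to_csv(save_dir, index=False)

    def drawFigure(self):
        # One curve per method, reward vs. episode.
        for method, group in self.data.groupby('method'):
            plt.plot(group['episode'], group['reward'], label=method)
        plt.xlabel('episode')
        plt.ylabel('reward')
        plt.legend()
        plt.show()

With a helper along these lines, the change above simply redirects where the accumulated reward curves are loaded from and saved to: the HER run now keeps its results in ../DDPG_spinningup_HER/HER.csv instead of the PER comparison file.
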
DDPG/DDPG_spinningup_HER/DDPGModel.py

+119
@@ -0,0 +1,119 @@
import numpy as np
from copy import deepcopy
from torch.optim import Adam
import torch
import core as core

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:  # inputs: size; obs_dim, e.g. (3,), unpacked to 3 internally; act_dim, e.g. 3
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32, device=device) for k, v in batch.items()}


class DDPG:
    def __init__(self, obs_dim, act_dim, act_bound, actor_critic=core.MLPActorCritic, seed=0,
                 replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, act_noise=0.1):

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_bound = act_bound
        self.gamma = gamma
        self.polyak = polyak
        self.act_noise = act_noise

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.ac = actor_critic(obs_dim, act_dim, act_limit=2.0).to(device=device)
        self.ac_targ = deepcopy(self.ac).to(device=device)

        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr)
        self.q_optimizer = Adam(self.ac.q.parameters(), lr=q_lr)

        for p in self.ac_targ.parameters():
            p.requires_grad = False

        self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    def compute_loss_q(self, data):  # returns the Q-network loss
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q = self.ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = self.ac_targ.q(o2, self.ac_targ.pi(o2))
            backup = r + self.gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # No minus sign on loss_q, so it is minimized: TD learning fits the function
        # approximator to the backup, and the smaller that error the better.
        return loss_q

    def compute_loss_pi(self, data):
        o = data['obs']
        q_pi = self.ac.q(o, self.ac.pi(o))
        # The minus sign means we maximize q_pi, i.e. the Q value of the action
        # the current policy takes in each state.
        return -q_pi.mean()

    def update(self, data):
        # First run one gradient descent step for Q.
        self.q_optimizer.zero_grad()
        loss_q = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in self.ac.q.parameters():
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

    def get_action(self, o, noise_scale, deterministic=True):
        a = self.ac.act(torch.as_tensor(o, dtype=torch.float32, device=device))
        if not deterministic:
            a += noise_scale * np.random.randn(self.act_dim)
        return np.clip(a, self.act_bound[0], self.act_bound[1])
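
DDPGModel.py above is a plain DDPG implementation; the HER-specific pieces of this commit live in the other changed files, which are not reproduced here. As a hedged illustration only, the sketch below shows one way an agent of this class could be driven with 'final'-strategy goal relabeling. The dict-style environment observations (observation / achieved_goal / desired_goal), the compute_reward function, and the update schedule are assumptions for the example, not code from this repository.

import numpy as np

def run_her_episode(env, agent, compute_reward, max_steps=50):
    """Collect one episode with a DDPG agent (as defined in DDPGModel.py above),
    then store both the original and a relabeled copy of every transition."""
    trajectory = []
    obs = env.reset()  # assumed dict with 'observation', 'achieved_goal', 'desired_goal'
    for _ in range(max_steps):
        # Goal-conditioned input: observation concatenated with the desired goal.
        o = np.concatenate([obs['observation'], obs['desired_goal']])
        a = agent.get_action(o, agent.act_noise, deterministic=False)
        obs2, r, done, _ = env.step(a)
        trajectory.append((obs, a, r, obs2, done))
        obs = obs2
        if done:
            break

    # 'final' relabeling: reuse the goal actually achieved at the end of the episode.
    final_goal = trajectory[-1][3]['achieved_goal']
    for obs, a, r, obs2, done in trajectory:
        # Original transition, conditioned on the true desired goal.
        o = np.concatenate([obs['observation'], obs['desired_goal']])
        o2 = np.concatenate([obs2['observation'], obs2['desired_goal']])
        agent.replay_buffer.store(o, a, r, o2, done)
        # Relabeled transition, pretending the final achieved goal was the target.
        o_her = np.concatenate([obs['observation'], final_goal])
        o2_her = np.concatenate([obs2['observation'], final_goal])
        r_her = compute_reward(obs2['achieved_goal'], final_goal)
        agent.replay_buffer.store(o_her, a, r_her, o2_her, done)

    # A few gradient updates on random minibatches after each episode.
    for _ in range(40):
        agent.update(agent.replay_buffer.sample_batch(batch_size=64))

Only the public surface of the class above is used: get_action for noisy action selection, replay_buffer.store for both the original and the relabeled transitions, and update on minibatches drawn with sample_batch.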
