import numpy as np
from copy import deepcopy
from torch.optim import Adam
import torch
import core as core

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:  # takes the buffer size, the obs dimension (e.g. (3,), unpacked internally by combined_shape), and the act dimension
    """
    A simple FIFO experience replay buffer for DDPG agents.
    """

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs],
                     obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs],
                     rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, dtype=torch.float32, device=device) for k, v in batch.items()}

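
# A minimal usage sketch of the buffer (illustrative only; the dimensions and values
# below are assumptions, not taken from the original code).
def _replay_buffer_demo():
    buf = ReplayBuffer(obs_dim=3, act_dim=1, size=1000)
    # Store a single fake transition.
    buf.store(obs=np.ones(3), act=np.zeros(1), rew=0.5, next_obs=np.ones(3), done=False)
    # sample_batch returns a dict of float32 tensors already moved to `device`.
    batch = buf.sample_batch(batch_size=4)
    print({k: v.shape for k, v in batch.items()})
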
class DDPG:
    def __init__(self, obs_dim, act_dim, act_bound, actor_critic=core.MLPActorCritic, seed=0,
                 replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, act_noise=0.1):

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.act_bound = act_bound
        self.gamma = gamma
        self.polyak = polyak
        self.act_noise = act_noise

        torch.manual_seed(seed)
        np.random.seed(seed)

        # NOTE: act_limit is hard-coded to 2.0 (Pendulum's action bound); act_bound[1] would generalize it.
        self.ac = actor_critic(obs_dim, act_dim, act_limit=2.0).to(device=device)
        self.ac_targ = deepcopy(self.ac).to(device=device)

        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=pi_lr)
        self.q_optimizer = Adam(self.ac.q.parameters(), lr=q_lr)

        # Target networks are never trained directly; they are only updated by polyak averaging.
        for p in self.ac_targ.parameters():
            p.requires_grad = False

        self.replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    def compute_loss_q(self, data):  # returns the Q-network loss (mean squared TD error)
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q = self.ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = self.ac_targ.q(o2, self.ac_targ.pi(o2))
            backup = r + self.gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup) ** 2).mean()

        # No minus sign here, so the loss is minimized: TD learning drives the function
        # approximator Q towards the backup target, and the smaller the error the better.
        return loss_q

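    # In symbols, the target computed above is
    #     y = r + gamma * (1 - d) * Q_targ(s2, pi_targ(s2)),
    # and loss_q is the mean squared TD error E[(Q(s, a) - y)^2] over the sampled minibatch.
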
    def compute_loss_pi(self, data):
        o = data['obs']
        q_pi = self.ac.q(o, self.ac.pi(o))
        # The minus sign means we maximize q_pi, i.e. maximize the Q-value of the
        # action the current policy takes in each sampled state.
        return -q_pi.mean()

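    # Equivalently, the policy step performs gradient ascent on J(pi) = E[Q(s, pi(s))] by
    # descending on -J(pi); during update() the Q-network is frozen while this loss is
    # backpropagated, so only the policy parameters receive gradients.
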
    def update(self, data):
        # First run one gradient descent step for Q.
        self.q_optimizer.zero_grad()
        loss_q = self.compute_loss_q(data)
        loss_q.backward()
        self.q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        self.pi_optimizer.zero_grad()
        loss_pi = self.compute_loss_pi(data)
        loss_pi.backward()
        self.pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in self.ac.q.parameters():
            p.requires_grad = True

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

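    # The loop above implements the soft target update
    #     theta_targ <- polyak * theta_targ + (1 - polyak) * theta,
    # so with polyak = 0.995 the target networks trail the online networks slowly,
    # which keeps the Bellman backup targets stable between updates.
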
    def get_action(self, o, noise_scale, deterministic=True):
        a = self.ac.act(torch.as_tensor(o, dtype=torch.float32, device=device))
        # Add Gaussian exploration noise only when acting non-deterministically,
        # then clip back into the valid action range.
        if not deterministic:
            a += noise_scale * np.random.randn(self.act_dim)
        return np.clip(a, self.act_bound[0], self.act_bound[1])
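

# A minimal end-to-end training-loop sketch (an assumption, not part of the original code):
# it presumes `gym` with the classic 4-tuple step API, the hypothetical choice of the
# Pendulum-v1 environment, and that core.MLPActorCritic.act returns a NumPy action.
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-v1")
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_bound = [float(env.action_space.low[0]), float(env.action_space.high[0])]

    agent = DDPG(obs_dim, act_dim, act_bound)

    o, ep_ret = env.reset(), 0.0
    start_steps, update_after, batch_size = 1000, 1000, 100
    for t in range(20000):
        # Random warm-up actions first, then noisy policy actions for exploration.
        if t < start_steps:
            a = env.action_space.sample()
        else:
            a = agent.get_action(o, agent.act_noise, deterministic=False)

        o2, r, d, _ = env.step(a)
        agent.replay_buffer.store(o, a, r, o2, d)
        o, ep_ret = o2, ep_ret + r

        if d:
            print(f"episode return: {ep_ret:.1f}")
            o, ep_ret = env.reset(), 0.0

        if t >= update_after:
            batch = agent.replay_buffer.sample_batch(batch_size)
            agent.update(batch)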