Solution to problem berkeleydeeprlcourse#4
Abdelrahman Ogail committed Apr 18, 2019
1 parent 0f99079 commit 2919a91
Showing 5 changed files with 37 additions and 10 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -91,3 +91,5 @@ ENV/
*2019*
*log.txt*
.DS_Store

hw2/logs/
23 changes: 23 additions & 0 deletions hw2/README.md
@@ -14,3 +14,26 @@ Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the p
The only file that you need to look at is `train_pg_f18.py`, which you will implement.

See the [HW2 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw2.pdf) for further instructions.

# Answers to Homework Experiments
## Problem 4 (CartPole)
### Summary
The benchmark consisted of multiple experiments sweeping the following settings: [reward-to-go, full-trajectory (Monte Carlo) rewards], [advantage normalization, no advantage normalization], and [large batch size, small batch size]. Each experiment ran for 100 iterations, and every configuration was repeated 3 times to also gauge variance across seeds. General observations (illustrated by the sketch after this list):
- Convergence: reward-to-go converged faster than the full-trajectory (Monte Carlo) reward.
- Variance: increasing the batch size and normalizing advantages both helped reduce variance.
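
For reference, here is a minimal NumPy sketch (separate from `train_pg_f18.py`) of the two return estimators and of advantage normalization being compared; the toy trajectory and the `gamma` value are made up for illustration:

```python
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 1.0, 1.0, 1.0])  # toy per-step rewards for one trajectory

# Full-trajectory ("Monte Carlo") estimator: every timestep is credited with the
# discounted return of the entire trajectory.
total_return = sum(gamma**t * r for t, r in enumerate(rewards))
q_mc = np.full(len(rewards), total_return)

# Reward-to-go estimator: timestep t is only credited with rewards from t onward,
# which drops terms that do not depend on the action taken at t and lowers variance.
q_rtg = np.array([
    sum(gamma**(t2 - t) * r for t2, r in enumerate(rewards[t:], start=t))
    for t in range(len(rewards))
])

# Advantage normalization: rescale to zero mean / unit std before the update
# (no baseline here, so the advantages are just the Q estimates).
adv = (q_rtg - q_rtg.mean()) / (q_rtg.std() + 1e-8)

print(q_mc)
print(q_rtg)
print(adv)
```

Reward-to-go keeps only the rewards that the action at timestep t can still influence, which is where the faster, lower-variance convergence in the plots below comes from.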

### Plots

![](fig/lb_CartPole-v0.png)

![](fig/sb_CartPole-v0.png)

### Answers
Q1- Which gradient estimator has better performance without advantage-centering: the trajectory-centric one, or the one using reward-to-go?
> Reward-to-go is better because it has lower variance.

Q2- Did advantage centering help?
> Yes, it reduced variance and sped up convergence slightly.

Q3- Did the batch size make an impact?
> Yes. Larger batch sizes produce lower-variance gradient estimates while the estimator stays unbiased, so the learning curves are smoother.
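
As a back-of-the-envelope illustration of the batch-size answer (a toy sketch, not part of the assignment code; the batch sizes 1000 and 5000 are assumed here to stand for the `sb`/`lb` runs), the policy-gradient estimate is a sample mean, so growing the batch leaves its expectation unchanged while shrinking its variance roughly as 1/N:

```python
import numpy as np

rng = np.random.default_rng(0)

def gradient_estimate_stats(batch_size, n_trials=2000):
    # Stand-in for noisy per-trajectory gradient contributions with true mean 1.0.
    samples = rng.normal(loc=1.0, scale=5.0, size=(n_trials, batch_size))
    estimates = samples.mean(axis=1)           # one batch-averaged estimate per trial
    return estimates.mean(), estimates.var()   # expectation stays ~1.0, variance drops with N

for batch_size in (1000, 5000):                # "small" vs. "large" batch
    mean, var = gradient_estimate_stats(batch_size)
    print(f"batch={batch_size}: mean ~ {mean:.3f}, variance ~ {var:.6f}")
```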
Binary file added hw2/fig/lb_CartPole-v0.png
Binary file added hw2/fig/sb_CartPole-v0.png
22 changes: 12 additions & 10 deletions hw2/train_pg_f18.py
@@ -202,12 +202,14 @@ def sample_action(self, policy_parameters):
if self.discrete:
sy_logits_na = policy_parameters
# YOUR_CODE_HERE
_, sy_sampled_ac = tf.nn.top_k(sy_logits_na)
sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=[1])
assert sy_sampled_ac.shape.as_list() == [sy_logits_na.shape.as_list()[0]]
else:
sy_mean, sy_logstd = policy_parameters
# YOUR_CODE_HERE
# reparameterization: sample = mean + std * unit Gaussian noise
sy_sampled_ac = sy_mean + tf.multiply(tf.math.exp(sy_logstd),
tf.random_normal(shape=sy_mean.shape))
assert sy_sampled_ac.shape.as_list() == sy_mean.shape.as_list()
return sy_sampled_ac

#========================================================================================#
@@ -241,13 +243,16 @@ def get_log_prob(self, policy_parameters, sy_ac_na):
# YOUR_CODE_HERE
sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na,
logits=sy_logits_na)
assert sy_logprob_n.shape.as_list() == [sy_logits_na.shape.as_list()[0]]
else:
sy_mean, sy_logstd = policy_parameters
# YOUR_CODE_HERE
# initialize a single self.ac_dim-variate Gaussian.
mvn = tf.contrib.distributions.MultivariateNormalDiag(
loc=sy_mean, scale_diag=tf.math.exp(sy_logstd))
mvn = tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean,
scale_diag=tf.math.exp(sy_logstd))
sy_logprob_n = mvn.log_prob(sy_ac_na)

assert sy_logprob_n.shape.as_list() == [sy_mean.shape.as_list()[0]]  # log_prob reduces over the action dimension
return sy_logprob_n

def build_computation_graph(self):
@@ -290,7 +295,7 @@ def build_computation_graph(self):
#========================================================================================#
# YOUR CODE HERE
# EXPERIMENT use * instead of tf.multiply operator
self.loss = tf.reduce_mean(tf.multiply(self.sy_logprob_n, self.sy_adv_n))
self.loss = tf.reduce_mean(self.sy_logprob_n * self.sy_adv_n)
self.update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

# create tf summaries
@@ -433,12 +438,7 @@ def sum_of_rewards(self, re_n):
if self.reward_to_go:
for traj_re in re_n:
for t in range(len(traj_re)):
# rtg = 0
# for t_bar, r in enumerate(traj_re):
# rtg += self.gamma**(t_bar-t) * r
# q_n.append(rtg)
q_n.append(
sum([self.gamma**(t_bar - t) * r for t_bar, r in enumerate(traj_re)]))
q_n.append(sum([self.gamma**(t_ - t) * r for t_, r in enumerate(traj_re[t:], start=t)]))
else:
for traj_re in re_n:
q_n.extend([sum([self.gamma**t * r for t, r in enumerate(traj_re)])] * len(traj_re))
@@ -478,6 +478,8 @@ def compute_advantage(self, ob_no, q_n):
adv_n = q_n - b_n
else:
adv_n = q_n.copy()

assert len(adv_n) == len(q_n)
return adv_n

def estimate_return(self, ob_no, re_n):
