BlackJack_MC.py
import numpy as np
from collections import defaultdict
from lib import plotting
# Cards are drawn with replacement (effectively an infinite deck):
# 1 is an ace, and 10 appears four times so that 10/J/Q/K all count as ten
deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]
class BlackJack(object):
"""Create an environment of a Black Jack Game
dealer = list - dealer list card in his deck
player = list - player list card in his deck
done = bool - True if the game is done, and False otherwise
this class only can be accessed from act(), reset(), state()
act() = PARAMS : 1 if hit, 0 if stick
RETURN : (state), done_status, reward
reset() = PARAMS : None
RETURN : None
state() = PARAMS : None
RETURN : (state)
(state) is a tuple of player score, dealers score, usable ace condition
Black Jack Refferences:
[1] https://webdocs.cs.ualberta.ca/~sutton/book/ebook/node51.html (Example 5.1)
[2] http://www.bicyclecards.com/how-to-play/blackjack/
"""
def __init__(self):
self.reset()
    def reset(self):
        # Each side starts with a single card in this simplified setup
        self.dealer = [self.draw()]
        self.player = [self.draw()]
        self.done = False
    def natural(self, hand):  # check for a natural/blackjack: exactly an ace and a ten-card
        return sorted(hand) == [1, 10]
    def draw(self):  # draw one card (with replacement)
        return np.random.choice(deck)
    def usable(self, hand):  # check for a usable ace (an ace counted as 11 without busting)
        return 1 in hand and sum(hand) + 10 <= 21
    def busted(self, hand):  # check whether the hand is busted (score over 21)
        return self.sum_hand(hand) > 21
    def sum_hand(self, hand):
        if self.usable(hand):
            return sum(hand) + 10
        else:
            return sum(hand)
def state(self):
return self.sum_hand(self.player), self.sum_hand(self.dealer), \
self.usable(self.player)
def act(self, hit):
if not self.done:
if hit:
self.hit()
if self.busted(self.player):
self.done = True
return self.state(), self.done, -1
else:
return self.state(), self.done, 0
else:
return self.stick()
def hit(self):
self.player.append(self.draw())
def stick(self):
self.done = True
        # The dealer hits while their score is below 17
        # (see reference [2])
        while self.sum_hand(self.dealer) < 17:
            self.dealer.append(self.draw())
        # The player cannot be busted at this point, so just sum the hand
player_score = self.sum_hand(self.player)
dealer_score = -1 if self.busted(self.dealer) else self.sum_hand(self.dealer)
if self.natural(self.player) and self.natural(self.dealer):
reward = 1
elif self.natural(self.player):
reward = 1.5
elif dealer_score > player_score:
reward = -1
elif dealer_score < player_score:
reward = 1
else:
reward = 0
return self.state(), self.done, reward
env = BlackJack()
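# Illustrative usage sketch (not called anywhere below): a minimal example of
# the environment API above, relying only on the act()/reset()/state()
# interface documented in the BlackJack docstring.
def demo_random_episode(environment):
    """Play one episode with uniformly random hit/stick and return the outcome."""
    environment.reset()
    state = environment.state()
    done, reward = False, 0
    while not done:
        # 0 = stick, 1 = hit, chosen uniformly at random
        state, done, reward = environment.act(np.random.choice([0, 1]))
    return state, reward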
def policy_0(pl_score, de_score, use_ace):
    """Policy-0 : always "hit" unless the player score is >= 20, then "stick"
    """
    # Return action probabilities [P(stick), P(hit)] instead of an action
    # index, for consistency with policy_epsilon below
    return np.array([1.0, 0.0]) if pl_score >= 20 else np.array([0.0, 1.0])
def policy_epsilon(Q, epsilon, state):
    """Policy-epsilon :
    - always hit when the player score < 12 (hitting can never bust below 12)
    - otherwise:
        - with probability epsilon choose a random action
        - with probability 1-epsilon choose the action with the maximum Q value
    """
    # Always hit while the score is below 12
    if state[0] < 12:
        return np.array([0.0, 1.0])
if np.random.rand() <= epsilon:
# Explore
return np.array([0.5, 0.5])
else:
# Exploit
best_action = np.argmax(Q[state])
A = np.zeros(2)
A[best_action] = 1
return A
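# Illustrative only: both policies above return a probability vector indexed
# as [P(stick), P(hit)]. The helper below just shows the sampling pattern used
# in the training loops; it is a sketch and is not called anywhere else.
def sample_action(act_prob):
    # Draw 0 (stick) or 1 (hit) according to the given probabilities
    return np.random.choice(np.arange(len(act_prob)), p=act_prob)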
def mc_prediction(policy, n_episodes, alfa=0.05, discount=1.0, env=env):
    # Dictionaries with default value 0.0
    V = defaultdict(float)
    rewardsum = defaultdict(float)
    counter = defaultdict(float)
for e in xrange(n_episodes):
        # An episode is a list of tuples:
        # (state, reward_after_following_policy)
episode = []
env.reset()
now_state = env.state()
terminate = False
# Generate one episode
        while not terminate:
            # Ask the policy for action probabilities, then sample an action
            # (the probabilities must be computed before sampling from them)
            act_prob = policy(*now_state)
            action = np.random.choice(np.arange(len(act_prob)), p=act_prob)
            # Take the action
            next_state, done, reward = env.act(action)
            # Save this state and its immediate reward
            episode.append((now_state, reward))
            # Move to the next state
            now_state = next_state
            if done:
                terminate = True
# Uncomment block below for
# Monte-carlo First-Visit updates for non-stationary problem
# for i, data in enumerate(episode):
# state = data[0]
# G = sum(data[1]*(discount**ix) for ix,data in enumerate(episode[i:]))
#
# # Incremental average
# V[state] = V[state] + alfa*(G - V[state])
# Uncomment block below for
# Monte-carlo First-Visit updates for stationary problem
        for i, data in enumerate(episode):
            state = data[0]
            # data[1] is the immediate reward received after this state
            G = sum(d[1]*(discount**ix) for ix, d in enumerate(episode[i:]))
            # In Blackjack a state can only be visited once per episode,
            # so there is no need to check for earlier visits of the same state
            counter[state] += 1
            rewardsum[state] += G
            # Sample average (Sutton's book style)
            V[state] = rewardsum[state] / counter[state]
            # Below is David Silver's style using an incremental average
            # V[state] = V[state] + (1.0/counter[state])*(G - V[state])
return V
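# Recap of the quantities computed in mc_prediction (first-visit MC, ref [1]):
#   return from step i:  G_i = r_(i+1) + discount*r_(i+2) + discount^2*r_(i+3) + ...
#   sample average:      V(s) = rewardsum[s] / counter[s]
#   incremental form:    V(s) <- V(s) + (1/counter[s]) * (G_i - V(s))
# Both averages give the same estimate; the incremental form avoids storing the sums.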
def mc_control(policy, n_episodes, epsilon=0.1, discount=1.0, env=env):
    # Dictionaries with default value 0.0 (Q holds one value per action)
    Q = defaultdict(lambda: [0.0, 0.0])
    rewardsum = defaultdict(float)
    counter = defaultdict(float)
for e in xrange(n_episodes):
        # An episode is a list of tuples:
        # (state, action_based_on_policy, reward_after_following_action)
        # The chosen action is now stored because we estimate Q-values
episode = []
env.reset()
now_state = env.state()
terminate = False
# Generate one episode
while not terminate:
# Chosen action
act_prob = policy(Q, epsilon, now_state)
action = np.random.choice(np.arange(len(act_prob)), p=act_prob)
# Take action
next_state, done, reward = env.act(action)
# Save this state
episode.append((now_state, action, reward))
# Move to the next state
now_state = next_state
if done:
terminate = True
        # MC control: Q is updated after every episode, without waiting for
        # all episodes to finish; the update itself works like MC policy evaluation
        for i, data in enumerate(episode):
            state = data[0]
            action = data[1]
            # data[2] is the immediate reward received after this (state, action)
            G = sum(d[2]*(discount**ix) for ix, d in enumerate(episode[i:]))
            # Average returns per (state, action) pair rather than per state,
            # otherwise returns of different actions would be mixed together
            counter[(state, action)] += 1
            rewardsum[(state, action)] += G
            # Sample average
            Q[state][action] = rewardsum[(state, action)] / counter[(state, action)]
return Q, policy
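# Illustrative only: once Q has been estimated, a purely greedy (epsilon = 0)
# policy can be read out of it directly. This helper is a sketch and is not
# used below (the simulation at the bottom keeps epsilon = 0.1 instead).
def greedy_action(Q, state):
    # 0 = stick, 1 = hit; always hit below 12, mirroring policy_epsilon
    if state[0] < 12:
        return 1
    return int(np.argmax(Q[state]))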
"""Block code below evaluate a policy and return a plotted V-value
"""
print "MONTE-CARLO EVALUATE POLICY_0"
V = mc_prediction(policy_0, n_episodes=10000)
# Drop states with a player score below 12 to match the plots in reference [1]
new_V = defaultdict(float)
for key, data in V.iteritems():
if key[0] >= 12:
new_V[key] = data
# Uses the plotting helper from Denny Britz's reinforcement-learning repo
plotting.plot_value_function(new_V, title="Policy_0 Evaluation")
"""Block code below optimize Q by a policy and return Q and plotted V-value
"""
print "MONTE-CARLO CONTROL OPTIMIZE THE POLICY AND Q-VALUE"
Q, policy = mc_control(policy_epsilon, n_episodes=100000)
# For plotting purposes, derive the V-value from the Q-value: V(s) = max_a Q(s, a)
V = defaultdict(float)
for state, actions in Q.iteritems():
action_value = max(actions)
V[state] = action_value
# Drop states with a player score below 12 to match the plots in reference [1]
new_V = defaultdict(float)
for key, data in V.iteritems():
if key[0] >= 12:
new_V[key] = data
# Uses the plotting helper from Denny Britz's reinforcement-learning repo
plotting.plot_value_function(new_V, title="Optimal Value Function")
"""Block code below using optimized Q-value and policy before,
Then run it on a game
"""
print "SIMULATE THE OPTIMIZED POLICY AND Q-VALUE"
def print_state(pl_score, de_score, use_ace, reward=0):
    if env.done:
        print "== Game Over =="
        print "Reward: {}".format(reward)
    print "Player: {} | Dealer: {} | Usable Ace: {}".format(
        pl_score, de_score, use_ace)
    # Printing the hands is only for inspection; in a real game the player
    # would not get to see the dealer's full hand
    print "Player Deck: {}".format(env.player)
    print "Dealer Deck: {}".format(env.dealer)
def act(hit, env=env):
state, done, reward = env.act(hit)
pl_score, de_score, use_ace = state
print_state(pl_score, de_score, use_ace, reward)
return state, done, reward
def reset(env=env):
env.reset()
pl_score, de_score, use_ace = env.state()
print_state(pl_score, de_score, use_ace)
return pl_score, de_score, use_ace
state = reset()
done = False
while not done:
    print ""
    # Keep a small exploration rate (epsilon=0.1) during the simulated game,
    # the same value used during MC control above
    act_prob = policy(Q, 0.1, state)
    action = np.random.choice(np.arange(len(act_prob)), p=act_prob)
    print "action: HIT" if action == 1 else "action: STICK"
    state, done, reward = act(action)