-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmedium_program_1.py
148 lines (127 loc) · 5.44 KB
/
medium_program_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import sys
import networkx as nx
import pandas as pd
import numpy as np
import math
import time
import copy
import ntpath
import matplotlib
import matplotlib.pyplot as plt
import pdb
from collections import defaultdict
start_time = time.time()
def Q_learning(discount_factor, alpha, inputfilename):
data = pd.read_csv(inputfilename)
#Initialize Q_s_a Dictionary
Q_s_a = defaultdict(list)
#Get list of unique states from data
u_s = data.s.unique()
#Get list of unique actions available from data
u_a = data.a.unique()
#pdb.set_trace()
#Iterate over each row in the data
for loop in range(1,5):
for t in range(0,len(data[:]["s"])):
#Initialize the state to the state in the row 't' in the data.
state = data['s'][t]
#Get action from row 't'
action = data['a'][t]
#Get Reward from row 't'
R = data['r'][t]
#Get next state from row 't'
next_state = data['sp'][t]
#state and action Index for the Q_s_a dictionary
index_s_a = str(state) + "_" + str(action)
#Get maximum Q_s_a for the next_state from all available actions
#Note: Q_s_a will be 0 in iterations where this was not computed.
# Will run the for 't' loop some iterations where Q_s_a will appear initialized.
#Get the action for next state that gives maximum Q_s_a
max_Q_ns_action = max(u_a, key=lambda a: return_Q_ns_a(next_state, a, Q_s_a))
#print("ACTION IS", max_Q_ns_action)
#Create next_state and action index that gives maximum Q_s_a
index_max_Q_ns_a = str(next_state) + "_" + str(max_Q_ns_action)
#Find maximum Q_s_a value using the next_state and action
if str(index_max_Q_ns_a) not in Q_s_a.keys():
#print("Initializing Q_s_a for next state and action to 0 as key not found yet")
Q_s_a[index_max_Q_ns_a] = 0
max_Q_ns_a = 0
else:
#print("index_max_Q_ns_a Key present")
max_Q_ns_a = Q_s_a[index_max_Q_ns_a]
#print("max_Q_ns_a is", max_Q_ns_a)
if str(index_s_a) not in Q_s_a.keys():
#print("Initializing Q_s_a for state and action to 0 as key not found yet")
Q_s_a[index_s_a] = 0
#Track a counter to ensure all keys are present and initialized
#Find factorized reward to be added to Q_s_a for given state and action.
factorized_reward = alpha*(R + discount_factor*max_Q_ns_a - Q_s_a[index_s_a])
#Update Q_s_a for given state and action pair
Q_s_a[index_s_a] = Q_s_a[index_s_a] + factorized_reward
#pdb.set_trace()
#print("returning Q_s_a", Q_s_a)
return Q_s_a, u_s, u_a
def return_Q_ns_a(next_state, a, Q_s_a):
ind_a = str(next_state) + "_" + str(a)
if str(ind_a) not in Q_s_a.keys():
#print("Returning 0 as key not found in dictionary yet")
return 0
#print("Returning Q_s_a", ind_a, Q_s_a[ind_a])
return Q_s_a[ind_a]
def best_policy(Q_s_a, u_s, u_a, filename,num_states):
"""Given an MDP and a utility function U, determine the best policy,
as a mapping from state to action"""
pi = {}
u_s=sorted(u_s)
#pdb.set_trace()
new_u_s = defaultdict(list)
k = 1
#Fill new_u_s with adjacent available states to account for missing states
while k <= num_states:
for s in u_s:
#if k == int(s):
# continue
#print("US S", s)
#print("val of k", k)
new_u_s[k] = s
k = k + 1
if k > num_states:
#print("Breaking")
break
break
#print("While loop exited")
#print("LENGTH", len(new_u_s))
with open(filename, 'w') as f:
for st in range(1,len(new_u_s)+1,1):
#print("CURRENT STATE, i", new_u_s[st], i)
if st in u_s:
#print("Writing", st)
s = str(st)
else:
s = str(new_u_s[st])
#pi[s] = s
#print("Writing to file", pi[s])
#f.write("{}\n".format(pi[s]))
#continue
#print("SSSS", s, st)
pi[s] = max(u_a, key=lambda a: return_Q_ns_a(s, a, Q_s_a))
#print("Best Policy", s,pi[s])
f.write("{}\n".format(pi[s]))
f.close()
return pi
def main():
inputfilename = "C:/Users/mink_/AA228Student/workspace/project2/medium.csv"
outfilename = "C:/Users/mink_/AA228Student/workspace/project2/medium.policy"
print("INPUT ", inputfilename)
gamma = 1
alpha = 0.1
train_time_start = time.time()
Q_s_a, u_s, u_a = Q_learning(gamma, alpha, inputfilename)
train_time_end = time.time()
print("Train time is %s seconds ---" % (train_time_end - train_time_start))
num_states = 50000
best_policy(Q_s_a, u_s, u_a, outfilename,num_states)
print("DONEE")
print("Time taken to get the output %s seconds ---" % (time.time() - start_time))
if __name__ == '__main__':
main()