forked from npbool/kddcup
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextfea.py
148 lines (123 loc) · 3.4 KB
/
extfea.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pickle
import datetime
# Id lookup tables, presumably written by an earlier preprocessing step
# (TODO confirm). In the code visible here only their sizes are used, to
# compute the index offsets below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point these paths at files you produced yourself.
# Pickle data must be read in binary mode: text mode fails on Python 3 and
# can corrupt the stream on platforms that translate line endings.
with open("umap", 'rb') as f:
    usermap = pickle.load(f)
with open("cmap", 'rb') as f:
    coursemap = pickle.load(f)
with open("omap", 'rb') as f:
    objmap = pickle.load(f)

# Operation name -> dense class id. The keys are looked up verbatim against
# the operation column of the log, so the spelling "nagivate" presumably
# mirrors the raw data -- do not "correct" it without checking the logs.
op_map = {"nagivate":0, "access":1, "problem":2, "page_close":3, "discussion":4, "video":5, "wiki":6}
op_clsnum = len(op_map)

# Start offsets of the user / course / object id ranges when the three id
# spaces are laid out back to back.
user_st = 0
course_st = len(usermap)
obj_st = course_st + len(coursemap)
# Formatted explicitly so the output is the same on Python 2 and Python 3.
print("%d %d %d" % (user_st, course_st, obj_st))
def todate(s):
    """Convert a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Raises ValueError if ``s`` does not split into exactly three
    '-'-separated integer fields that form a valid date.
    """
    y, m, d = s.split('-')
    return datetime.date(int(y), int(m), int(d))


def extfea(logcol, operations=None):
    """Build the sparse feature string for one group of log lines.

    Each line is a comma-separated record such as
    ``1,0,0,2014-06-14T09:43:40,server,problem,4``. Only three columns are
    read: column 3 (timestamp, ``<date>T<time>``), column 4 (event source)
    and column 5 (operation name).

    Parameters:
        logcol: iterable of raw log lines (trailing newlines are stripped).
            The date span is computed from the first and last line, so the
            lines are assumed to be in chronological order -- TODO confirm
            against the input files.
        operations: optional mapping of operation name -> class id, where
            the ids are ``0 .. len(operations) - 1``. Defaults to the
            module-level ``op_map``.

    Returns:
        A string of ``" index:value"`` pairs (every pair, including the
        first, is preceded by a single space) in this order, with n being
        ``len(operations)``:
            0               number of log lines
            1 .. n          count per operation
            n+1 .. 2n       count per operation where source == "server"
            2n+1 .. 3n      count per operation for every other source
            3n+1            number of distinct log dates
            3n+2            days between the first and the last log date

    Raises:
        ValueError: if ``logcol`` is empty or a timestamp/date is malformed.
        KeyError: if a line names an operation missing from ``operations``.
    """
    if operations is None:
        operations = op_map

    rows = [line.strip().split(',') for line in logcol]
    if not rows:
        raise ValueError("extfea() requires at least one log line")

    n_ops = len(operations)
    total_counts = [0] * n_ops
    server_counts = [0] * n_ops
    browser_counts = [0] * n_ops
    log_dates = []
    for fields in rows:
        op_id = operations[fields[5]]
        logdate, _ = fields[3].split('T')
        log_dates.append(logdate)
        total_counts[op_id] += 1
        # Anything that is not explicitly "server" is counted as browser.
        if fields[4] == "server":
            server_counts[op_id] += 1
        else:
            browser_counts[op_id] += 1

    active_days = len(set(log_dates))
    date_span = (todate(log_dates[-1]) - todate(log_dates[0])).days

    # NOTE: the original code had one-hot user/course/object id features
    # disabled inside a string literal; they are not emitted here either.
    values = [len(rows)]
    values.extend(total_counts)
    values.extend(server_counts)
    values.extend(browser_counts)
    # The distinct-day count used to be silently dropped (its formatted text
    # was computed and discarded); it is now part of the output.
    values.append(active_days)
    values.append(date_span)
    return "".join(" %d:%d" % (index, value) for index, value in enumerate(values))
def ext_file(logfile):
    """Extract one feature string per enrollment from a log file.

    The first comma-separated column of every line is the enrollment id.
    Consecutive lines sharing that id are handed to ``extfea`` as one group,
    so the file is expected to be grouped by enrollment id; if an id shows
    up in several separate runs, the last run wins.

    Parameters:
        logfile: path of the comma-separated log file.

    Returns:
        dict mapping ``int(enrollment id)`` to the string returned by
        ``extfea``. An empty file (or one holding only blank lines) yields
        an empty dict.
    """
    feadic = {}
    group_id = None
    group_lines = []
    # ``with`` guarantees the handle is closed even if extfea() raises.
    with open(logfile, 'r') as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            enroll_id = line.split(',', 1)[0]
            if enroll_id != group_id:
                if group_lines:
                    feadic[int(group_id)] = extfea(group_lines)
                group_id = enroll_id
                group_lines = []
            group_lines.append(line)
    # Flush the final group; skipped entirely when the file had no records.
    if group_lines:
        feadic[int(group_id)] = extfea(group_lines)
    return feadic
def write_train(truthfile, feadic, outputfile):
    """Write the labelled training file.

    Parameters:
        truthfile: path of a CSV with exactly two columns per line,
            ``enrollment_id,label``.
        feadic: dict mapping int enrollment id -> feature string.
        outputfile: path to write; one ``"<label> <features>\\n"`` line is
            produced per enrollment found in ``feadic``.

    Enrollment ids missing from ``feadic`` are printed to stdout and
    skipped. Blank lines in ``truthfile`` are ignored.
    """
    # ``with`` closes both files even when a malformed line raises.
    with open(truthfile) as tf, open(outputfile, 'w') as train:
        for line in tf:
            line = line.strip()
            if not line:
                continue
            eid, res = line.split(',')
            key = int(eid)
            if key not in feadic:
                # Report enrollments that have no log-derived features.
                print(eid)
                continue
            train.write("%s %s\n" % (res, feadic[key]))
def write_test(testfile, feadic, outputfile):
    """Write the test file with a constant placeholder label of 0.

    Parameters:
        testfile: path of a CSV whose first column is the enrollment id;
            any remaining columns are ignored.
        feadic: dict mapping int enrollment id -> feature string.
        outputfile: path to write; one ``"0 <features>\\n"`` line is
            produced per enrollment found in ``feadic``.

    Enrollment ids missing from ``feadic`` are printed to stdout and
    skipped. Blank lines in ``testfile`` are ignored.
    """
    # ``with`` closes both files even when a malformed line raises.
    with open(testfile) as tf, open(outputfile, 'w') as test:
        for line in tf:
            line = line.strip()
            if not line:
                continue
            eid = line.split(',', 1)[0]
            key = int(eid)
            if key not in feadic:
                # Report enrollments that have no log-derived features.
                print(eid)
                continue
            test.write("0 %s\n" % feadic[key])
if __name__ == "__main__":
    # Training split: log-derived features joined with the truth labels.
    train_features = ext_file("clean/log_train.csv")
    write_train("clean/truth_train.csv", train_features, "fea/train")

    # Test split: same features, written with a placeholder label.
    test_features = ext_file("clean/log_test.csv")
    write_test("clean/enrollment_test.csv", test_features, "fea/test")