clean.py
import numpy as N
import pandas as pd
from math import log, exp
import logging


def getRadarLength(TimeToEnd):
    """
    Returns an n-tuple (n1, n2, ...)
    where ni is the number of measurements from the ith radar.
    To add a dataframe column do:
    df['RadarLength'] = df['TimeToEnd'].apply(getRadarLength)
    """
    tlist = map(float, TimeToEnd.split())
    nlist = [0, ]
    previous_time = 9999999999
    current_radar = 0
    for it in tlist:
        # TimeToEnd decreases within one radar, so a jump back up marks a new radar
        if it > previous_time:
            current_radar += 1
            nlist.append(0)
            # assert(current_radar < 2)  # it seems that there are cases with >2 radars
        nlist[current_radar] += 1
        previous_time = it
    return tuple(nlist)
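
# Illustrative example (hypothetical values): with two radars whose TimeToEnd
# sequences are "60 50 40" and "55 45", the jump from 40 back up to 55 marks the
# start of the second radar, so:
#
#   >>> getRadarLength("60 50 40 55 45")
#   (3, 2)
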
def separate_listInColumn(x):
    """
    Returns a tuple where the measurements are separated by radar.
    Input:
    x : a pandas Series (one DataFrame row)
        - First element must be the tuple of radar lengths
          (as produced by getRadarLength)
        - Second element must contain the values to separate
    """
    # Determine if there is more than one time step.
    if type(x.iloc[1]) == float:
        # There is a single time step, and pandas read a float
        listrads = [x.iloc[1]]
    elif type(x.iloc[1]) == str:
        # There are multiple time steps, and pandas read a string
        listrads = map(float, x.iloc[1].split())
    # The list is then sliced by radar, using the lengths given in the first element of x:
    # x.iloc[0] is a tuple with the length of each radar measurement, i.e.
    # x.iloc[0][0] is the number of measurements from the 1st radar (x.iloc[0][1] for the 2nd radar).
    # For the two-radar case, the lines below could be rewritten more clearly
    # (but less efficiently?) like this:
    # nrad1, nrad2 = x.iloc[0]
    # rad1, rad2 = listrads[:nrad1], listrads[nrad1:nrad1 + nrad2]
    by_rads = [listrads[:x.iloc[0][0]], ]
    for idx in range(len(x.iloc[0]))[:-1]:
        by_rads.append(listrads[sum(x.iloc[0][:idx + 1]):sum(x.iloc[0][:idx + 2])])
    return tuple(by_rads)
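
# Illustrative example (hypothetical values): for a row whose radar-length tuple
# is (3, 2) and whose measurement string is "10 8 6 12 9", the call below splits
# the five values into one list per radar:
#
#   >>> row = pd.Series([(3, 2), "10 8 6 12 9"])
#   >>> separate_listInColumn(row)
#   ([10.0, 8.0, 6.0], [12.0, 9.0])
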
def getIthRadar(x, iradar=1):
    """
    Returns a tuple of measurements for the ith radar.
    Returns None if there is no ith radar.
    Input:
    x : a pandas Series (one DataFrame row)
        - First element must be the tuple of radar lengths
          (as produced by getRadarLength)
        - Second element must contain the values to separate
    iradar : the ith radar to return the data from (default=1st)
    """
    if len(x.iloc[0]) < iradar:
        return None  # Or should it be NA?
    # The longer but clearer way
    try:
        listrads = map(float, x.iloc[1].split())
    except AttributeError:
        return (0, )
    rad_start_index = sum(x.iloc[0][:iradar - 1])
    rad_stop_index = rad_start_index + x.iloc[0][iradar - 1]
    return tuple(listrads[rad_start_index:rad_stop_index])
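
# Illustrative example (hypothetical values): reusing the (3, 2) / "10 8 6 12 9"
# row from above, asking for the 2nd radar returns its two measurements, while
# asking for a 3rd radar (which does not exist here) returns None:
#
#   >>> getIthRadar(row, iradar=2)
#   (12.0, 9.0)
#   >>> getIthRadar(row, iradar=3)
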
def getListReductions(x):
    """
    Returns the mean, range (max - min) and number of values of the given list.
    Input:
    x : should be a list or tuple
        (or something that can be turned into a numpy array)
    """
    xarray = N.array(x)
    return xarray.mean(), xarray.ptp(axis=0), len(xarray)
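
# Illustrative example (hypothetical values):
#
#   >>> getListReductions((1.0, 3.0, 2.0))
#   (2.0, 2.0, 3)
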
def kdp(rr3):
    """Returns the kdp value computed from rr3.
    see
    https://www.kaggle.com/c/how-much-did-it-rain/forums/t/11500/kdp-0-for-all-datasets
    """
    try:
        return N.sign(rr3) * exp(log(abs(rr3) / 40.6) / 0.866)
    except ValueError:
        return 0
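
# Illustrative worked example (hypothetical values): RR3 = 40.6 maps to kdp = 1,
# and RR3 = 0 triggers the ValueError from log(0) and yields 0:
#
#   >>> kdp(40.6)
#   1.0
#   >>> kdp(0)
#   0
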
def getKdpFromRR3(x):
    """Returns the mean and range of the kdp values.
    Input:
    x : should be a list or tuple of RR3 values
        (or something that can be turned into a numpy array)
    """
    xarray = N.array(map(kdp, x))
    return xarray.mean(), xarray.ptp(axis=0)
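
# Illustrative example (hypothetical values): both RR3 values map to kdp = 1,
# so the mean is 1 and the range is 0:
#
#   >>> getKdpFromRR3([40.6, 40.6])
#   (1.0, 0.0)
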
def getStringReductions(x):
    """
    Returns the mean, range (max - min) and number of values of the given string.
    Input:
    x : should be a space-separated value string
    """
    xarray = N.array(map(float, x.split()))
    # Exclude the error codes (-99900, -99901, -99903), which all fall below -99000
    xarray = xarray[xarray > -99000]
    return xarray.mean(), xarray.ptp(axis=0), len(xarray)
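
# Illustrative example (hypothetical values): the -99901 error code is dropped
# before computing the statistics, leaving three valid values:
#
#   >>> getStringReductions("1 2 3 -99901")
#   (2.0, 2.0, 3)
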
def get_dataframe_with_split_multiple_radars(input_df):
    """
    Separates the rows containing multiple radar time series into multiple rows, introducing a new
    column identifying the group the new rows came from.
    Input:
        raw pandas dataframe, as read directly from the training dataset.
    Output:
        new pandas dataframe, with each row corresponding to an individual radar.
    """
    # Get a list of all the columns that will have to be split by radar.
    columns_to_split = list(input_df.columns)
    columns_to_split.remove('Id')        # not a time series!
    columns_to_split.remove('Expected')  # not a time series!
    # Create an array of all the column names, in the same order as in the
    # original data.
    right_column_order = ['unique_Id']
    for col in input_df.columns:
        right_column_order.append(col)
    # Append a new column that will represent how many radars were in the set
    # from which a given row came.
    right_column_order.append('number_of_radars')
    # Loop over all rows of the input dataframe, splitting multiple-radar rows
    # as we go along.
    # Note about the algorithm below:
    # It seems pretty ugly to loop on an index (not very pythonic).
    # However, since I don't know ahead of time how many rows I'll have,
    # a stackoverflow comment suggests that creating a list of dictionaries is faster.
    # See: http://stackoverflow.com/questions/10715965/add-one-row-in-a-pandas-dataframe
    #
    # list of dictionaries, which will become our new dataframe
    list_new_dataframe_dict = []
    id_counter = -1  # unique identifier for our split radar entries
    num_lines = len(input_df)
    for index in range(num_lines):
        if index % 100 == 0:
            print 'doing row %i of %i ...' % (index, num_lines)
        # Create a copy of the row, so we can manipulate
        # it without polluting the initial dataframe.
        row = input_df.loc[index].copy()
        ID = row['Id']
        expected = row['Expected']
        # We don't want to pollute the input dataframe,
        # so we only hack the copy here.
        row['RadarCounts'] = getRadarLength(row['TimeToEnd'])
        number_of_radars = len(row['RadarCounts'])
        # list of dictionaries spawned by this row
        list_newrows_dict = []
        # Initialize all relevant dictionaries with the "family" data,
        # i.e. stuff that is the same for all radars in this row.
        for i in row['RadarCounts']:
            id_counter += 1
            list_newrows_dict.append({'unique_Id': id_counter,
                                      'Id': ID,
                                      'number_of_radars': number_of_radars,
                                      'Expected': expected})
        # Populate the dictionaries just created above with the data from every column.
        for col in columns_to_split:
            # get the subrow so we can apply the splitting methods
            subrow = row[['RadarCounts', col]]
            # fill the dictionaries with the split data
            for array, newrow_dict in zip(separate_listInColumn(subrow), list_newrows_dict):
                newrow_dict[col] = N.array(array)
        # Extend the main list of dictionaries with the entries from this row.
        list_new_dataframe_dict.extend(list_newrows_dict)
    # Create the new dataframe from the list of dictionaries.
    output_df = pd.DataFrame(list_new_dataframe_dict)[right_column_order].set_index('unique_Id')
    return output_df
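
# Sketch of intended usage (the file name 'train_2013.csv' is a placeholder for
# whatever raw Kaggle training file is available, not something this script defines):
#
#   >>> raw_df = pd.read_csv('train_2013.csv')
#   >>> split_df = get_dataframe_with_split_multiple_radars(raw_df)
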
def get_clean_array(array):
    """
    Remove all error codes and NaN values.
    """
    error_codes = [-99900.0, -99901.0, -99903.0, 999.0]
    # take out the NaN
    float_array = array[N.where(N.isfinite(array))]
    I = N.ones_like(float_array)
    for code in error_codes:
        I *= float_array != code
    left_over_array = float_array[N.where(I)]
    return left_over_array
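
# Illustrative example (hypothetical values): the NaN and the -99901.0 error code
# are filtered out, leaving only the two valid measurements:
#
#   >>> get_clean_array(N.array([1.5, N.nan, -99901.0, 3.5]))
#   array([1.5, 3.5])
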
def get_clean_average(array):
    """
    Remove all error codes and NaN values before taking the average.
    If nothing is left, yield NaN.
    """
    left_over_array = get_clean_array(array)
    if len(left_over_array) == 0:
        return N.NaN
    else:
        return N.average(left_over_array)

def get_clean_range(array):
    """
    Remove all error codes and NaN values before taking the range.
    If nothing is left, yield NaN.
    """
    left_over_array = get_clean_array(array)
    if len(left_over_array) == 0:
        return N.NaN
    else:
        return N.ptp(left_over_array)

def get_clean_average_and_range_dataframe(input_df):
    """
    Computes the averages and ranges of the time series, removing missing data (or error codes).
    When this is impossible, the missing value is replaced by the column average.
    Input:
        pandas dataframe from function "get_dataframe_with_split_multiple_radars"
    Output:
        new pandas dataframe, with averages and ranges replacing the time series
    """
    # Get a list of all the columns that will have to be split by radar.
    columns_to_split = list(input_df.columns)
    dict_averages = {}
    for key in ['Id', 'Expected', 'number_of_radars']:
        columns_to_split.remove(key)  # not a time series!
        dict_averages[key] = input_df[key].values
    for col in columns_to_split:
        print 'Averaging column %s ...' % col
        # compute the average of the time series
        avg = input_df[col].apply(get_clean_average).values
        # replace missing values by the column average
        I_nan = N.where(N.isnan(avg))[0]
        if len(I_nan) > 0:
            print ' - There are %i NaN averages' % len(I_nan)
            I_finite = N.where(N.isfinite(avg))[0]
            print ' - There are %i finite averages' % len(I_finite)
            finite_average = N.average(avg[I_finite])
            avg[I_nan] = finite_average
            # print col, I_nan.shape, finite_average
        dict_averages['avg_%s' % col] = avg
        print 'Computing range for column %s ...' % col
        # compute the range of the time series
        rng = input_df[col].apply(get_clean_range).values
        # replace missing values by the average of the finite ranges
        I_nan = N.where(N.isnan(rng))[0]
        if len(I_nan) > 0:
            print ' - There are %i NaN ranges' % len(I_nan)
            I_finite = N.where(N.isfinite(rng))[0]
            print ' - There are %i finite ranges' % len(I_finite)
            finite_range = N.average(rng[I_finite])
            rng[I_nan] = finite_range
            # print col, I_nan.shape, finite_range
        dict_averages['range_%s' % col] = rng
    # Create the new dataframe from the dictionary of columns.
    output_df = pd.DataFrame(dict_averages)
    return output_df
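
# Sketch of intended usage, chained with the splitting step above (again with a
# placeholder file name):
#
#   >>> raw_df = pd.read_csv('train_2013.csv')
#   >>> split_df = get_dataframe_with_split_multiple_radars(raw_df)
#   >>> clean_df = get_clean_average_and_range_dataframe(split_df)
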
if __name__ == "__main__":
    # print getRadarLength("5 4 3 2 1")
    # print getRadarLength("5 4 3 7 1")
    # The following lines show how to use the helpers above on a small toy dataframe,
    # where column 'a' plays the role of a measurement string and column 'b' the
    # role of the radar-length tuple.
    data = {"a": ["5 4 3 2 1", "5 4 3 7 1", "6 7 7", "3 5 6 1", "1 2 3 4 5 6 7 8"],
            "b": [(3, 2), (2, 3), (1, 2), (2, 2), (2, 3, 2)]}
    df = pd.DataFrame(data)
    print df
    # df['z'] = df['a'].apply(getRadarLength)
    # print df[['b', 'a']].apply(separate_listInColumn, axis=1)
    # df['r1'], df['r2'] = zip(*df[['b', 'a']].apply(separate_listInColumn, axis=1))
    # print '\n\n\n'
    # print df
    df['a1'] = df[['b', 'a']].apply(getIthRadar, axis=1)
    print list(df['a1'])
    # print zip(*df['a'].apply(getListReductions))