-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPreprocessing.py
159 lines (144 loc) · 5.97 KB
/
Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas
import json
import os
import numpy as np
from functools import reduce
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
class Preprocessing(object):
    """
    Loading and feature-preparation helpers for the Google Analytics data set.

    Transformers fitted on the training data are stored on the instance
    (``pageview_scaler`` and ``device_encoder``) so that the same
    transformation can be re-applied later with ``training=False``.
    """

    TRAIN_PATH = 'C:\\Users\\nsadeh\\Documents\\Kaggle Projects\\Data Sets\\Google Analytics\\train.csv'
    TEST_PATH = 'DUMMY'
    # Columns of the raw CSV whose cells contain JSON documents.
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    # Values that keep their own category in process_device; any other value
    # is replaced by the catch-all category of to_categorical.
    DEVICE_OS = ['Windows', 'Macintosh', 'Android', 'iOS', 'Linux', 'Chrome OS']
    DEVICE_BROWSER = ['Chrome', 'Safari', 'Firefox', 'Internet Explorer', 'Edge', 'Opera Mini', 'Opera']

    def __init__(self,
                 train_path=TRAIN_PATH,
                 test_path=TEST_PATH):
        """
        :param train_path: path of the training CSV
        :param test_path: path of the testing CSV
        """
        self.train_path = train_path
        self.test_path = test_path
        # Set by process_totals / process_device when called with training=True.
        self.pageview_scaler = None
        self.device_encoder = None

    def load_data(self,
                  training=True):
        """
        Loads the training/testing CSV and expands its JSON columns
        :param training: boolean: whether to load the train or the test file
        :return: 5 pandas.DataFrame objects: the raw table without its JSON
                 columns, then device, geoNetwork, totals and trafficSource
        :raises FileNotFoundError: if the selected path does not exist
        """
        path = self.train_path if training else self.test_path
        # Explicit check instead of assert: asserts are stripped under -O.
        if not os.path.exists(path):
            raise FileNotFoundError('Path to {} file does not exist'.format('training'
                                                                            if training
                                                                            else 'testing'))
        print('Loading data')
        # fullVisitorId is read as a string so it is not parsed as a number.
        train_raw = pandas.read_csv(path,
                                    dtype={'fullVisitorId': 'str'})
        device = self.read_json(train_raw, 'device')
        geo_network = self.read_json(train_raw, 'geoNetwork')
        totals = self.read_json(train_raw, 'totals')
        traffic_source = self.read_json(train_raw, 'trafficSource')
        # The JSON columns now live in their own frames; drop the raw text.
        train_raw.drop(self.JSON_COLUMNS,
                       axis=1,
                       inplace=True)
        print('Data loaded')
        return train_raw, device, geo_network, totals, traffic_source

    def read_json(self, data, column):
        """
        Parses a column of JSON strings into its own table
        :param data: pandas.DataFrame with the column
        :param column: name of the column holding JSON strings
        :return: pandas.DataFrame with one row per input row and one column
                 per JSON key
        :raises TypeError: if data is not a DataFrame or column is not a str
        :raises ValueError: if column is not a column of data
        """
        if not isinstance(data, pandas.DataFrame):
            raise TypeError('Wrong inputs in read_json: data must be a pandas.DataFrame')
        if not isinstance(column, str):
            raise TypeError('Wrong inputs in read_json: column must be a str')
        if column not in data.columns:
            raise ValueError('Wrong inputs in read_json: no column named {!r}'.format(column))
        records = data[column].map(json.loads).tolist()
        return pandas.DataFrame.from_records(records)

    @staticmethod
    def remove_consts(data):
        """
        Drops the columns that hold a single value (NaN counts as a value)
        :param data: pandas.DataFrame
        :return: new pandas.DataFrame without the constant columns
        :raises TypeError: if data is not a DataFrame
        """
        if not isinstance(data, pandas.DataFrame):
            raise TypeError('remove_consts expects a pandas.DataFrame')
        constants = [col
                     for col in data.columns
                     if data[col].nunique(dropna=False) == 1]
        return data.drop(constants, axis=1)

    @staticmethod
    def month(date):
        """
        Transforms a string of the format yyyymmdd to one of the calendar months
        :param date: string (or anything whose str() is) of form yyyymmdd
        :return: month as a name
        :raises ValueError: if characters 5-6 are not a month number in 1..12
        """
        months = ['January',
                  'February',
                  'March',
                  'April',
                  'May',
                  'June',
                  'July',
                  'August',
                  'September',
                  'October',
                  'November',
                  'December']
        month = int(str(date)[4:6])
        # Without this check month 00 would index -1 and return 'December'.
        if not 1 <= month <= len(months):
            raise ValueError('Invalid month in date {!r}'.format(date))
        return months[month - 1]

    def process_totals(self, totals, training=True):
        """
        Shrink number range into something more manageable
        :param totals: dataFrame with pageviews and transactionRevenue columns
        :param training: whether to fit a transformation or use an existing one
        :return: log of transactionRevenue (target), then the min-max scaled
                 log of pageviews as a 1-D array
        :raises RuntimeError: if training is False and no scaler was fitted
        TODO: still have NaN issue - missing values stay NaN in both outputs
        """
        totals = self.remove_consts(totals)
        # Values parsed from JSON may be strings; convert before taking logs.
        # The scaler expects a 2-D (n_samples, n_features) array.
        pageviews = np.log(pandas.to_numeric(totals['pageviews']).values).reshape(-1, 1)
        if training:
            scaler = MinMaxScaler()
            # fit() alone returns the scaler, not the scaled values.
            normed_pageviews = scaler.fit_transform(pageviews)
            self.pageview_scaler = scaler
        else:
            if self.pageview_scaler is None:
                raise RuntimeError('pageview scaler not initialized')
            normed_pageviews = self.pageview_scaler.transform(pageviews)
        revenue = np.log(pandas.to_numeric(totals['transactionRevenue']).values)
        return revenue, normed_pageviews.ravel()

    def process_device(self, device, training=True):
        """
        One-hot encodes the browser, isMobile and operatingSystem columns
        :param device: DataFrame with at least those three columns
        :param training: whether to fit a new encoder or use the stored one
        :return: pandas.DataFrame with the encoded columns, indexed like device
        :raises RuntimeError: if training is False and no encoder was fitted
        """
        columns = ['browser', 'isMobile', 'operatingSystem']
        allowed = [self.DEVICE_BROWSER, ['True', 'False'], self.DEVICE_OS]
        # Work on a copy: to_categorical assigns into the frame it is given.
        device = device[columns].copy()
        # The allowed isMobile values are strings; casting lets boolean values
        # (e.g. as produced by json.loads) match them.
        device['isMobile'] = device['isMobile'].astype(str)
        for column, values in zip(columns, allowed):
            device = self.to_categorical(device, column, values)
        if training:
            encoder = OneHotEncoder()
            encoded_device = encoder.fit_transform(device.values)
            self.device_encoder = encoder
        else:
            if self.device_encoder is None:
                raise RuntimeError('device encoder not initialized')
            encoded_device = self.device_encoder.transform(device.values)
        # The encoder may return a scipy sparse matrix; densify it so that the
        # DataFrame holds one column per encoded category.
        if hasattr(encoded_device, 'toarray'):
            encoded_device = encoded_device.toarray()
        return pandas.DataFrame(encoded_device, index=device.index)

    @staticmethod
    def to_categorical(df,
                       var,
                       allowed_vals,
                       replace='Other'):
        """
        Converts DataFrame column to categorical with allowed values
        :param df: DataFrame (the column is replaced in place)
        :param var: column in DataFrame
        :param allowed_vals: values allowed
        :param replace: value substituted for anything not in allowed_vals
        :return: the same DataFrame, where column var is categorical with at
                 most len(allowed_vals) + 1 cats
        """
        updated_vals = [val
                        if val in allowed_vals
                        else replace
                        for val in df[var].values]
        # Reuse df's index: a Series with a fresh RangeIndex would be aligned
        # by label on assignment and turn into NaN for any other index.
        df[var] = pandas.Series(updated_vals, index=df.index).astype('category')
        return df
if __name__ == '__main__':
    # Script entry point: load the training data from the default path and
    # print the parsed trafficSource table.
    pre = Preprocessing()
    train_raw, device, geo_network, totals, traffic_source = pre.load_data()
    print(traffic_source)