#================================
# SuperLearner predict script
#================================
# Use a pre-trained SuperLearner
# ML model to make predictions.
#================================
# Dependencies
import argparse
import pickle
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# json and sklearn.metrics are used only by the optional
# high/low split evaluation that is commented out below.
import json
import sklearn.metrics
#=======================================
# Main execution
#=======================================
if __name__ == '__main__':
    #===========================
    # Command line inputs
    #===========================
    print("Parsing SuperLearner predict arguments...")
    parser = argparse.ArgumentParser()
    # Accept arbitrary --key value pairs: register every unknown
    # flag and then re-parse so all of them land on args.
    parsed, unknown = parser.parse_known_args()
    for arg in unknown:
        if arg.startswith(("-", "--")):
            parser.add_argument(arg)
            print(arg)
    args = parser.parse_args()
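    # Example invocation (a hedged sketch; the flag names match the
    # attributes read below, but the paths and values are hypothetical):
    #
    #   python predict.py \
    #       --model_dir ./sl_model \
    #       --predict_data ./grid_points \
    #       --predict_var respiration_rate \
    #       --num_inputs 10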
    #===========================================================
    # Load the SuperLearner models
    #===========================================================
    model_dir = args.model_dir
    # SuperLearner train.py always saves these files in model_dir:
    train_data = model_dir+'/train.csv'
    test_data = model_dir+'/test.csv'
    predict_data_csv = args.predict_data+'.csv'
    predict_data_ixy = args.predict_data+'.ixy'
    predict_output_file = model_dir+"/sl_predictions.csv"
    sys.path.append(model_dir)
    # Insert an extra slash in case it is missing on the command line
    with open(model_dir+"/"+'SuperLearners.pkl','rb') as file_object:
        superlearner = pickle.load(file_object)
    # For a given output variable, list the models:
    predict_var = args.predict_var
    print("Submodels within SuperLearner and their weights:")
    list_models = list(superlearner[predict_var].named_estimators_.keys())
    print(list_models)
    # The following only works for the scipy.optimize.nnls
    # stacking regressor, not the sklearn stacking regressors.
    print(superlearner[predict_var].final_estimator_.weights_)
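    # A hedged readability sketch: pair each submodel name with its
    # stacking weight. This assumes weights_ aligns one-to-one with
    # named_estimators_, which holds for the nnls stacker noted above;
    # the hasattr guard skips this for other stacking regressors.
    if hasattr(superlearner[predict_var].final_estimator_, 'weights_'):
        for name, weight in zip(
                list_models,
                superlearner[predict_var].final_estimator_.weights_):
            print(name+": "+str(weight))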
    #===========================================================
    # Load the train and test data to make a plot and estimate
    # prediction errors.
    #===========================================================
    num_inputs = int(args.num_inputs)
    train_df = pd.read_csv(train_data).astype(np.float32)
    X_train = train_df.values[:, :num_inputs]
    Y_train = train_df.values[:, num_inputs:]
    test_df = pd.read_csv(test_data).astype(np.float32)
    X_test = test_df.values[:, :num_inputs]
    Y_test = test_df.values[:, num_inputs:]
    all_df = pd.concat((train_df,test_df),axis=0)
    X_all = all_df.values[:, :num_inputs]
    Y_all = all_df.values[:, num_inputs:]
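    # Sanity check (a hedged assumption about the train.py CSV layout:
    # the first num_inputs columns are features and the remaining
    # column(s) are targets, as the slicing above expects):
    assert train_df.shape[1] > num_inputs, \
        "expected target column(s) after the first num_inputs feature columns"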
    #===========================================================
    # Make some predictions with the testing data
    #===========================================================
    Y_hat_train = superlearner[predict_var].predict(X_train)
    Y_hat_test = superlearner[predict_var].predict(X_test)
    # Compute lines of best fit between targets and predictions
    # for the test and train sets
    test_line = np.polynomial.polynomial.Polynomial.fit(
        np.squeeze(Y_test),
        np.squeeze(Y_hat_test),1)
    test_xy = test_line.linspace(n=100,domain=[Y_all.min(),Y_all.max()])
    train_line = np.polynomial.polynomial.Polynomial.fit(
        np.squeeze(Y_train),
        np.squeeze(Y_hat_train),1)
    train_xy = train_line.linspace(n=100,domain=[Y_all.min(),Y_all.max()])
    # Compute some additional statistics.
    # We want to see to what extent Y_hat_test can predict the error,
    # since downstream we will use Y_hat to estimate its own errors.
    # That is exactly what the error in the mean and the error in the
    # predictions, s_y and s_p respectively, are getting at.
    # Equations based on McClave & Dietrich,
    # _Statistics_, 6th Ed., pp. 672, 682, 707.
    n_sample_size = np.size(Y_test)
    x_bar = np.mean(Y_test)
    y_bar = np.mean(Y_hat_test)
    ssxx = np.sum((np.squeeze(Y_test)-np.mean(Y_test))**2)
    ssyy = np.sum((np.squeeze(Y_hat_test)-np.mean(Y_hat_test))**2)
    ssxy = np.sum((np.squeeze(Y_hat_test)-np.mean(Y_hat_test))*(np.squeeze(Y_test)-np.mean(Y_test)))
    # Root squared errors (equivalent to absolute errors)
    rse = np.sqrt((np.squeeze(Y_test)-Y_hat_test)**2)
    # When predicting the error from Y_hat,
    # sse_error = sum((error_i - mean(error))^2),
    # which expands algebraically to:
    sse_error = ssxx + ssyy - 2.0*ssxy
    # Estimator for the standard error of the regression between
    # true targets and predicted targets.
    sse = sse_error # Use this when the test predictions estimate the error
    s = np.sqrt(sse/(n_sample_size-2))
    # Estimate of the sampling distribution of the predictions
    # of the mean value of the targets at a specific value.
    # This is nice, but it is very flat -> it does not change
    # much with Y_test.
    ssxx = ssyy # (Overwrite ssxx so Y_hat_test, not Y_test, is the error predictor.)
    s_y = 2*s*np.sqrt((1/n_sample_size) + ((np.squeeze(Y_test)-np.mean(Y_test))**2)/ssxx)
    s_p = 2*s*np.sqrt(1+(1/n_sample_size) + ((np.squeeze(Y_test)-np.mean(Y_test))**2)/ssxx)
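    # For reference, the two intervals above are the standard simple
    # linear regression forms evaluated with 2*s in place of the
    # t-statistic (so roughly a 95% interval):
    #   error in the mean:  s_y = 2*s*sqrt(1/n + (x - x_bar)^2/SSxx)
    #   prediction error:   s_p = 2*s*sqrt(1 + 1/n + (x - x_bar)^2/SSxx)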
    fig, ax = plt.subplots()
    ax.plot(Y_test,rse,'ko')
    ax.plot(Y_test,s_y,'ro')
    ax.plot(Y_test,s_p,'co')
    ax.plot(Y_test,s*np.ones(np.shape(Y_test)),'r.')
    ax.set_ylabel('Error metric [mg O2/L/h]')
    ax.set_xlabel('Target respiration rate [mg O2/L/h]')
    ax.legend(['RSE','Error in mean','Prediction Error','Regression Error'],loc='lower left')
    ax.grid()
    plt.savefig(model_dir+'/sl_error.png')
    # Print summary statistics and correlations for the error estimates:
    print("RSE: "+str(np.mean(rse))+" +/- "+str(np.std(rse)))
    print("mean.error: "+str(np.mean(s_y))+" +/- "+str(np.std(s_y)))
    print("predict.error: "+str(np.mean(s_p))+" +/- "+str(np.std(s_p)))
    print("RSE vs mean.error: "+str(np.corrcoef(rse,s_y)))
    print("RSE vs predict.error: "+str(np.corrcoef(rse,s_p)))
    # Crashes the workflow and is not very useful, so commented out:
    #print("RSE vs s: "+str(np.corrcoef(rse,s*np.ones(np.shape(Y_test)))))
    # Save the data used in the scatter plot below so it can be recreated
    df_train_scatter_out = pd.DataFrame(data=np.squeeze(Y_train), columns=['target'])
    df_train_scatter_out['predicted'] = np.squeeze(Y_hat_train)
    df_train_scatter_out.to_csv(model_dir+'/sl_scatter_train.csv')
    df_test_scatter_out = pd.DataFrame(data=np.squeeze(Y_test), columns=['target'])
    df_test_scatter_out['predicted'] = np.squeeze(Y_hat_test)
    df_test_scatter_out.to_csv(model_dir+'/sl_scatter_test.csv')
    # Evaluate using a high/low split.
    # Commented out for now since this can be run offline with the
    # archived data. For some datasets, this code will crash if there
    # are no data points in either the high or the low category. To
    # reactivate it, first add a check on the number of points in
    # each category.
    #print("evaluating hold out on a high/low split")
    #threshold = -500
    #ho_metrics = {}
    #sl_scatter_test_high = np.delete(df_test_scatter_out, np.where(df_test_scatter_out["predicted"] >= threshold)[0], axis=0)
    #sl_scatter_test_low = np.delete(df_test_scatter_out, np.where(df_test_scatter_out["predicted"] < threshold)[0], axis=0)
    #sl_scatter_test_high = pd.DataFrame(sl_scatter_test_high, columns = df_test_scatter_out.columns)
    #sl_scatter_test_low = pd.DataFrame(sl_scatter_test_low, columns = df_test_scatter_out.columns)
    #ho_metrics["r2_high"] = sklearn.metrics.r2_score(sl_scatter_test_high["target"], sl_scatter_test_high["predicted"])
    #ho_metrics["r2_low"] = sklearn.metrics.r2_score(sl_scatter_test_low["target"], sl_scatter_test_low["predicted"])
    #with open(args.model_dir + '/hold-out-metrics-high-low-split.json', 'w') as json_file:
    #    json.dump(ho_metrics, json_file, indent = 4)
    #===========================================================
    # Make an evaluation plot
    #===========================================================
    fig, ax = plt.subplots(figsize=(10,10))
    ax.plot(Y_train,np.squeeze(Y_hat_train),'ko',markersize=10)
    ax.plot(Y_test,np.squeeze(Y_hat_test),'ko',markersize=10,fillstyle='none')
    ax.plot(test_xy[0],test_xy[1],'r-')
    ax.plot(train_xy[0],train_xy[1],'r--')
    ax.grid()
    ax.set_xlabel('Original target values, mg O2/L/h')
    ax.set_ylabel('Model predictions, mg O2/L/h')
    # Predict and plot with the individual submodels.
    # (The same per-submodel predictions are available via the
    # built-in superlearner[predict_var].transform(X), but they
    # are computed explicitly here; see the check just below.)
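    # A hedged check of the built-in alternative mentioned above: per
    # the comment, transform(X) returns the same per-submodel
    # predictions, one column per submodel in the order of list_models.
    submodel_preds_test = superlearner[predict_var].transform(X_test)
    print("transform(X_test) shape: "+str(np.shape(submodel_preds_test)))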
    for model_name in list_models:
        model_object = superlearner[predict_var].named_estimators_[model_name]
        Y_hat_train_mod = model_object.predict(X_train)
        Y_hat_test_mod = model_object.predict(X_test)
        # Color coded dots
        ax.plot(np.concatenate((Y_train,Y_test),axis=0),
                np.concatenate((np.squeeze(Y_hat_train_mod),np.squeeze(Y_hat_test_mod)),axis=0),
                '.',markersize=5)
    # One-to-one line
    ax.plot(Y_all,Y_all,'k')
    # Set zoom
    #ax.set_xlim([-45,0])
    #ax.set_ylim([-45,0])
    # Legend
    ax.legend(['Stacked TRAIN','Stacked TEST','TEST corr','TRAIN corr']+list_models+['one-to-one'])
    # Save figure to file
    plt.savefig(model_dir+'/sl_scatter.png')
    #===========================================================
    # Make predictions with a large data set
    #===========================================================
    predict_df = pd.read_csv(predict_data_csv).astype(np.float32)
    # Impute any missing feature values with the column means
    predict_df.fillna(predict_df.mean(),inplace=True)
    X = predict_df.values
    Y_predict = superlearner[predict_var].predict(X)
    # Estimate the errors from the held-out test statistics computed
    # above, evaluated at the new predictions (note that ssxx still
    # holds ssyy, so Y_hat is the error predictor):
    Y_hat_error = 2*s*np.sqrt((1/n_sample_size) + ((np.squeeze(Y_predict)-np.mean(Y_test))**2)/ssxx)
    Y_hat_pred_error = 2*s*np.sqrt(1+(1/n_sample_size) + ((np.squeeze(Y_predict)-np.mean(Y_test))**2)/ssxx)
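    # A hedged sketch of how these could be used: with the 2*s factor,
    # the band Y_predict +/- Y_hat_pred_error approximates a 95%
    # prediction interval under the regression error model above, e.g.
    # (hypothetical names):
    #   lower_bound = Y_predict - Y_hat_pred_error
    #   upper_bound = Y_predict + Y_hat_pred_error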
    #===========================================================
    # Write output file
    #===========================================================
    # Join the predictions with the lon/lat data separated beforehand.
    # The .ixy file may also include dropped-feature columns when
    # running an FPI analysis, so pick out only the ID, longitude,
    # and latitude for use here.
    output_df = pd.read_csv(predict_data_ixy,
        usecols=['Sample_ID', 'Sample_Longitude', 'Sample_Latitude'])
    output_df[predict_var] = pd.Series(Y_predict)
    output_df['mean.error'] = pd.Series(Y_hat_error)
    output_df['predict.error'] = pd.Series(Y_hat_pred_error)
    output_df.to_csv(
        predict_output_file,
        index=False,
        na_rep='NaN')
    print("Done!")