# ada_boost_dem_random_forests.py
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt
import utilities as ut

''' Finds the average classification error of a fitted classifier on a
validation set.
'''
def get_avg_error(clf, points, y):
    predictions = clf.predict(points)
    error = sum(y != predictions)
    return error / float(len(points))
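
# A minimal usage sketch for get_avg_error (an assumption for illustration:
# `val_x`/`val_y` are a hypothetical held-out split, not variables defined in
# this script):
#
#   from sklearn.tree import DecisionTreeClassifier
#   tree = DecisionTreeClassifier().fit(train_x, train_y)
#   print(get_avg_error(tree, val_x, val_y))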

''' Experiments with different parameters to minimize error on the voter data.
Tries one of two early-stopping criteria over the range of ns specified:
trial_type 0 sets min_samples_leaf = n and trial_type 1 sets max_depth = n.
Uses k-fold cross-validation and plots the results.
Returns the best n and the validation error for that n.
'''
def Trials(data, ns, trial_type, crit='gini', k=5):
    x, y = data
    val_errors = []
    train_errors = []
    bestn = None
    best_err = np.inf
    for n in ns:
        if trial_type == 0:
            forest = RandomForestClassifier(100, criterion=crit, min_samples_leaf=n)
        else:
            forest = RandomForestClassifier(100, criterion=crit, max_depth=n)
        clf = AdaBoostClassifier(forest)
        clf = clf.fit(x, y)
        # cross_val_score returns per-fold accuracies, so convert to error rates.
        val_err = 1 - np.mean(cross_val_score(clf, x, y, cv=k, scoring='accuracy'))
        train_err = 1 - clf.score(x, y)
        if val_err < best_err:
            bestn = n
            best_err = val_err
        val_errors.append(val_err)
        train_errors.append(train_err)
    print(100 * (1 - min(val_errors)))  # best validation accuracy, in percent
    plt.figure(trial_type)
    stop = "Min Samples Leaf" if trial_type == 0 else "Max Depth"
    plt.title("Error vs n %s Stopping Criterion using AdaBoost random forest" % stop)
    plt.xlabel("n")
    plt.ylabel("Error")
    plt.plot(ns, train_errors, label="Training Error")
    plt.plot(ns, val_errors, label="Validation Error")
    plt.legend(loc="best")
    plt.show()
    return bestn, best_err
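
# A minimal sketch of running both early-stopping experiments (assumes the same
# (train_x, train_y) data loaded in the __main__ block below):
#
#   best_leaf, leaf_err = Trials((train_x, train_y), range(1, 5), trial_type=0)
#   best_depth, depth_err = Trials((train_x, train_y), range(1, 5), trial_type=1)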

''' Try a range of parameters with the model and then train a final model with
whatever n was best.
'''
if __name__ == '__main__':
    train_x, train_y, test = ut.import_data()
    n, err = Trials((train_x, train_y), range(1, 5), trial_type=1)
    # Retrain the same AdaBoost-over-random-forest model evaluated in Trials,
    # using the best max_depth found.
    clf = AdaBoostClassifier(RandomForestClassifier(100, max_depth=n))
    clf = clf.fit(train_x, train_y)
    result = clf.predict(test)
    print(n, err)
    ut.write_output_file(result, "ada_boost_100_forest.csv")