\c 20 100
\l funq.q
\l iris.q
\l weather.q
\l winequality.q
/ http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
/ http://www.saedsayad.com/decision_tree.htm
/ Paper_3-A_comparative_study_of_decision_tree_ID3_and_C4.5.pdf
/ https://www.jair.org/media/279/live-279-1538-jair.pdf
/ http://www.ams.org/publicoutreach/feature-column/fc-2014-12
/ http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_hpsplit_details06.htm
-1"load weather data, remove the day column and move Play to front";
show t:weather.t
-1"use the id3 algorithm to build a decision tree";
-1 .ml.ptree[0] tr:.ml.id3[();::] t;
`:tree.dot 0: .ml.pgraph tr
-1"the tree is built with triplets.";
-1"the first value is the decision feature,";
-1"and the second value is operator to use on the feature";
-1"and the third value is a dictionary representing the leaves";
-1"we can then use the (p)redict (d)ecission (t)ree function to classify our data";
avg t.Play=p:.ml.pdt[tr] t / accuracy
-1"since the test and training data are the same, it is no surprise we have 100% accuracy";
-1".ml.pdt does not fail on missing features. it digs deeper into the tree";
.ut.assert[.71428571428571431] avg t.Play=p:.ml.pdt[.ml.id3[();::] (1#`Outlook) _ t] t
-1"id3 only handles discrete features. c4.5 handles continues features";
-1".ml.q45 implements many of the features of c4.5 including:";
-1"* information gain normalized by split info";
-1"* handling of continuous features";
-1"* use of Minumum Description Length Principal (MDL) ";
-1" to penalize features with many distinct continuous values";
-1"* pre-prunes branches that create branches with too few leaves";
-1"* post-prunes branches that overfit by given confidence value";
-1"we can test this feature by changing humidity into a continuous variable";
show s:@[t;`Humidity;:;85 90 78 96 80 70 65 95 70 80 70 90 75 80f]
-1"we can see how id3 creates a bushy tree";
-1 .ml.ptree[0] .ml.id3[();::] s;
-1"while q45 picks a single split value";
z:@[{.qml.nicdf x};.0125;2.241403]; / z-score for the pruning confidence (precomputed fallback if qml is not loaded)
-1 .ml.ptree[0] tr:.ml.prune[.ml.perr[z]] .ml.q45[();::] s;
.ut.assert[1f] avg s.Play=p:.ml.pdt[tr] s / accuracy
-1"we can still handle null values by using the remaining features";
.ut.assert[`Yes] .ml.pdt[tr] d:`Outlook`Temperature`Humidity`Wind!(`Rain;`Hot;85f;`)
-1"we can even can handle nulls in the training data by propagating them down the tree";
s:update Temperature:` from s where Humidity=70f
-1 .ml.ptree[0] tr:.ml.q45[();::] s;
.ut.assert[`No] .ml.pdt[tr] d
-1 "we can also use the Gini impurity instead of entropy (faster with similar behavior)";
-1 .ml.ptree[0] tr:.ml.dt[.ml.gr;.ml.ogr;.ml.wgini;();::] t;
d:`Outlook`Temperature`Humidity`Wind!(`Rain;`Hot;`High;`) / remove null
.ut.assert[`No] .ml.pdt[tr] d
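/ a sketch of the Gini impurity (standard formula; .ml.wgini may differ in form):
gini:{1-sum p*p:(count each group x)%count x}
gini t.Play / impurity of the target column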
-1 "we can also create an aid tree when the target is numeric";
-1 .ml.ptree[0] tr:.ml.aid[(1#`minsl)!1#3;::] update "e"$`Yes=Play from t; / regression tree
.ut.assert[.2] .ml.pdt[tr] d
-1 "we can also create a thaid tree for classification";
-1 .ml.ptree[0] tr:.ml.thaid[(1#`minsl)!1#3;::] t; / classification tree
.ut.assert[`Yes] .ml.pdt[tr] d
-1 "we can now split the iris data into training and test batches (w/ stratification)";
w:`train`test!3 1 / 3:1 train:test split
show d:.ut.part[w;iris.t.species] iris.t
-1 "note that stratification can work on any type of list or table";
.ut.part[w;;iris.t] count[iris.t]?5;
.ut.part[w;select species from iris.t] iris.t;
-1 "next we confirm relative frequencies of species are the same";
.ut.assert[1b] .ml.identical value count each group d.train.species
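/ e.g. peek at the per-class counts in each partition:
show {count each group x} each d[;`species]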
-1 "then create a classification tree";
-1 .ml.ptree[0] tr:.ml.ct[();::] `species xcols d`train;
-1 "testing the tree on the test set produces an accuracy of:";
avg d.test.species=p:.ml.pdt[tr] d`test
-1 "we can save the decision tree into graphviz compatible format";
`:tree.dot 0: .ml.pgraph tr;
-1 "using graphviz to convert the .dot file into a png";
@[system;"dot -Tpng -o tree.png tree.dot";0N!]; / trap and print the error if graphviz is unavailable
-1 "we can predict iris petal lengths with a regression tree";
-1 "first we need to one-hot encode the species";
t:"f"$.ut.onehot iris.t
-1 "then split the data into training and test batches"
show d:.ut.part[w;0N?] t
-1 "and generate a regression tree";
-1 .ml.ptree[0] tr:.ml.rt[();::] `plength xcols d`train;
-1 "we now compute the root mean square error (rmse)";
.ml.rms d.test.plength-p:.ml.pdt[tr] d`test
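/ sanity check (assuming the standard root-mean-square definition):
{sqrt avg x*x} d.test.plength-p / should match .ml.rms above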
-1 "using breiman algorithm, compute pruning alphas";
dtf:.ml.ct[();::]
ef:.ml.wmisc
/ http://mlwiki.org/index.php/Cost-Complexity_Pruning
t:([]z:`b`b`b`b`w`w`w`w`w`w`b`b`w`w`b`b)
t:t,'([]x:1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4)
t:t,'([]y:1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 )
-1 .ml.ptree[0] tr:dtf t;
.ut.assert[0 0.125 0.125 0.25] first atr:flip .ml.dtmina[ef] scan (0f;tr)
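/ the weakest-link (cost-complexity) idea from the mlwiki link above:
/ for each internal node, alpha=(err[node as leaf]-err[subtree])%(count[subtree leaves]-1),
/ and the node with the smallest alpha is collapsed first,
/ yielding a nested sequence of ever-smaller subtrees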
-1 "we then pick the alpha (and therefore subtree) with cross validation";
b:sqrt (1_a,0w)*a:atr 0 / geometric means of adjacent alphas (0w for the last)
I:.ut.part[(k:10)#1;0N?] til count t / shuffle row indices into k equal folds
show e:avg each t[`z][I]=p:.ml.dtcv[dtf;ef;b;t] .ml.kfold I
-1 .ml.ptree[0] atr[1] 0N!.ml.imax 0N!avg e;
-1 "the old (deprecated) interface splits the data before iterating";
.ut.assert[e] avg each ts[;`z]=p:.ml.dtxv[dtf;ef;b;ts:t@I] peach til k
-1 "returning to the iris data, we can grow and prune that too";
-1 .ml.ptree[0] tr:dtf t:iris.t;
.ut.assert[0 .01 .02 .02 .04 .88 1f] 3*first atr:flip .ml.dtmina[ef] scan (0f;tr)
b:sqrt (1_a,0w)*a:atr 0 / geometric mean
I:.ut.part[(k:10)#1;0N?] til count t
show e:avg each t[`species][I]=p:.ml.dtcv[dtf;ef;b;t] .ml.kfold I
-1 .ml.ptree[0] atr[1] 0N!.ml.imax 0N!avg e;
-1 "or even grow and prune a regression tree with wine quality data";
d:.ut.part[`train`test!1 1;0N?] winequality.red.t
dtf:.ml.rt[();::]
ef:.ml.wmse
-1 "the fully grown tree has more than 200 leaves!";
.ut.assert[1b] 200<0N!count .ml.leaves tr:dtf d`train
-1 "we can improve this by performing k-fold cross validation";
-1 "first we find the list of critical alphas";
atr:flip .ml.dtmina[ef] scan (0f;tr)
b:sqrt (1_a,0w)*a:atr 0 / geometric mean
I:.ut.part[(k:5)#1;0N?] til count t:d`train
-1 "then we compute the accuracy of each of these alphas with k-fold cv";
show e:avg each e*e:t[`quality][I]-p:(.ml.dtcv[dtf;ef;b;t]) .ml.kfold I
-1 "finally, we pick the tree whose alpha had the min error";
-1 .ml.ptree[0] btr:atr[1] 0N!.ml.imin 0N!avg e;
-1 "the pruned tree has less than 25 leaves";
.ut.assert[1b] 25>0N!count .ml.leaves btr
-1 "and an rms less than .73";
.ut.assert[1b] .73>0N!.ml.rms d.test.quality - .ml.pdt[btr] d`test