"""This module includes methods for training and predicting using decision trees."""
import numpy as np
from statistics import mode, StatisticsError
def calculate_information_gain(data, labels):
    """
    Computes the information gain for each feature in data
    :param data: d x n matrix of d features and n examples
    :type data: ndarray
    :param labels: n x 1 vector of class labels for n examples
    :type labels: array
    :return: d x 1 vector of information gain for each feature
    :rtype: array
    """
    all_labels = np.unique(labels)
    num_classes = len(all_labels)
    class_count = np.zeros(num_classes)
    d, n = data.shape
    parent_entropy = 0
    for c in range(num_classes):
        class_count[c] = np.sum(labels == all_labels[c])
        if class_count[c] > 0:
            class_prob = class_count[c] / n
            parent_entropy -= class_prob * np.log(class_prob)
    # print("Parent entropy is %d\n" % parent_entropy)
    gain = parent_entropy * np.ones(d)  # initialization of gains for every attribute
    # we use a matrix dot product to sum to make it more compatible with sparse matrices
    num_x = data.dot(np.ones(n))  # number of examples containing each feature
    prob_x = num_x / n  # fraction of examples containing each feature
    prob_not_x = 1 - prob_x
    for c in range(num_classes):
        # print("Computing contribution of class %d." % c)
        num_y = np.sum(labels == all_labels[c])
        # this next line sums across the rows of data, multiplied by the
        # indicator of whether each column's label is c. It counts the number
        # of times each feature is on among examples with label c.
        # We again use the dot product for sparse-matrix compatibility
        data_with_label = data[:, labels == all_labels[c]]
        num_y_and_x = data_with_label.dot(np.ones(data_with_label.shape[1]))
        # Prevents Python from outputting a divide-by-zero warning
        with np.errstate(invalid='ignore'):
            prob_y_given_x = num_y_and_x / (num_x + 1e-8)  # probability of observing class c for each feature
            prob_y_given_x[num_x == 0] = 0
        nonzero_entries = prob_y_given_x > 0
        if np.any(nonzero_entries):
            with np.errstate(invalid='ignore', divide='ignore'):
                children_entropy = - np.multiply(np.multiply(prob_x, prob_y_given_x), np.log(prob_y_given_x))
            gain[nonzero_entries] -= children_entropy[nonzero_entries]
        # The next lines compute the probability of y being c given x = 0 by
        # subtracting the quantities we've already counted
        # num_y - num_y_and_x is the number of examples with label y that
        # don't have each feature, and n - num_x is the number of examples
        # that don't have each feature
        with np.errstate(invalid='ignore'):
            prob_y_given_not_x = (num_y - num_y_and_x) / ((n - num_x) + 1e-8)
            prob_y_given_not_x[n - num_x == 0] = 0
        nonzero_entries = prob_y_given_not_x > 0
        if np.any(nonzero_entries):
            with np.errstate(invalid='ignore', divide='ignore'):
                children_entropy = - np.multiply(np.multiply(prob_not_x, prob_y_given_not_x), np.log(prob_y_given_not_x))
            gain[nonzero_entries] -= children_entropy[nonzero_entries]
    return gain
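
# Illustrative sketch (hypothetical toy data, not part of the original module):
# with two binary features over four examples, the feature that perfectly
# separates the classes should receive the full parent entropy (ln 2 ≈ 0.693)
# as its gain, while a feature that is independent of the labels should gain
# nothing.
def _example_information_gain():
    """Tiny worked example for calculate_information_gain (assumed usage)."""
    toy_data = np.array([[1, 1, 0, 0],   # feature 0 mirrors the labels exactly
                         [1, 0, 1, 0]])  # feature 1 is uninformative
    toy_labels = np.array([1, 1, 0, 0])
    gain = calculate_information_gain(toy_data, toy_labels)
    print(gain)  # expected to be roughly [0.693, 0.0]
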
def decision_tree_train(train_data, train_labels, params):
    """Train a decision tree to classify data using the entropy decision criterion.
    :param train_data: d x n numpy matrix (ndarray) of d binary features for n examples
    :type train_data: ndarray
    :param train_labels: length n numpy vector with integer labels
    :type train_labels: array_like
    :param params: learning algorithm parameter dictionary. Must include a 'max_depth' value
    :type params: dict
    :return: dictionary encoding the learned decision tree
    :rtype: dict
    """
    max_depth = params['max_depth']
    labels = np.unique(train_labels)
    num_classes = labels.size
    model = recursive_tree_train(train_data, train_labels, depth=0, max_depth=max_depth, num_classes=num_classes)
    return model
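
# Hypothetical illustration of the dictionary encoding returned above: on the
# toy data used in _example_information_gain, a tree with max_depth 2 splits
# once on feature 0 and each child node immediately stores a class prediction.
def _example_tree_structure():
    """Show the nested-dictionary encoding of a learned tree (assumed toy data)."""
    toy_data = np.array([[1, 1, 0, 0], [1, 0, 1, 0]])
    toy_labels = np.array([1, 1, 0, 0])
    tree = decision_tree_train(toy_data, toy_labels, {'max_depth': 2})
    # For this toy example the result should look like:
    #   {'Prediction': None, 'Test': 0,
    #    'Left':  {'Prediction': 1, 'Test': None, 'Left': None, 'Right': None},
    #    'Right': {'Prediction': 0, 'Test': None, 'Left': None, 'Right': None}}
    print(tree)
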
def recursive_tree_train(data, labels, depth, max_depth, num_classes):
    """Helper function to recursively build a decision tree by splitting the data by a feature.
    :param data: d x n numpy matrix (ndarray) of d binary features for n examples
    :type data: ndarray
    :param labels: length n numpy array with integer labels
    :type labels: array_like
    :param depth: current depth of the decision tree node being constructed
    :type depth: int
    :param max_depth: maximum depth to expand the decision tree to
    :type max_depth: int
    :param num_classes: number of classes in the classification problem
    :type num_classes: int
    :return: dictionary encoding the learned decision tree node
    :rtype: dict
    """
    n = np.shape(data)[1]
    node = {
        "Prediction": None,
        "Test": None,
        "Left": None,
        "Right": None
    }
    if depth == max_depth or num_classes <= 1:
        node["Prediction"] = mode(labels)
        return node
    w = np.argmax(calculate_information_gain(data, labels))
    node["Test"] = w
    table_true, table_false = [], []
    for i in range(n):
        if data[w, i]:
            table_true.append(i)
        else:
            table_false.append(i)
    data2 = np.delete(data, w, 0)
    data_l = np.delete(data2, table_false, 1)  # examples where the attribute is true
    labels_l = [labels[i] for i in table_true]  # labels for the examples selected above
    remaining_l = np.unique(labels_l)
    data_r = np.delete(data2, table_true, 1)  # examples where the attribute is false
    labels_r = [labels[i] for i in table_false]  # labels for the examples selected above
    remaining_r = np.unique(labels_r)
    if len(labels_l) > 0:
        node["Left"] = recursive_tree_train(data_l, labels_l, depth + 1, max_depth, remaining_l.size)
    if len(labels_r) > 0:
        node["Right"] = recursive_tree_train(data_r, labels_r, depth + 1, max_depth, remaining_r.size)
    return node
def decision_tree_predict(data, model):
    """Predict most likely label given computed decision tree in model.
    :param data: d x n ndarray of d binary features for n examples.
    :type data: ndarray
    :param model: learned decision tree model
    :type model: dict
    :return: length n numpy array of the predicted class labels
    :rtype: array_like
    """
    labels = np.empty(data.shape[1])
    for i, x in enumerate(data.T):
        labels[i] = getPrediction(x, model)
    return labels
def getPrediction(example, model):
    """Recursively follow the tree's tests to find the prediction for a single example."""
    prediction = model["Prediction"]
    if prediction is None:
        # Internal node: apply its feature test, then drop that feature so the
        # remaining indices line up with the child subtrees built during training.
        test = example[model["Test"]]
        nex = np.delete(example, model["Test"])
        if test:
            prediction = getPrediction(nex, model["Left"])
        else:
            prediction = getPrediction(nex, model["Right"])
    return prediction
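
# Minimal end-to-end sketch (synthetic data and parameters assumed purely for
# illustration): train a shallow tree on a toy set and check that it
# reproduces the training labels.
if __name__ == "__main__":
    demo_data = np.array([[1, 1, 0, 0],
                          [1, 0, 1, 0]])
    demo_labels = np.array([1, 1, 0, 0])
    demo_model = decision_tree_train(demo_data, demo_labels, {'max_depth': 2})
    demo_predictions = decision_tree_predict(demo_data, demo_model)
    print(demo_predictions)  # expected: [1. 1. 0. 0.]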