Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modifications for python3 #59

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion skfeature/function/information_theoretical_based/FCBF.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def fcbf(X, y, **kwargs):
delta = 0

# t1[:,0] stores index of features, t1[:,1] stores symmetrical uncertainty of features
t1 = np.zeros((n_features, 2), dtypes='object')
t1 = np.zeros((n_features, 2), dtype='object')
for i in range(n_features):
f = X[:, i]
t1[i, 0] = i
Expand Down
10 changes: 2 additions & 8 deletions skfeature/function/similarity_based/reliefF.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ def reliefF(X, y, **kwargs):
near_miss_term[label] = np.zeros(n_features)
for ele in miss_list:
near_miss_term[label] = np.array(abs(self_fea-X[ele, :]))+np.array(near_miss_term[label])
score += near_miss_term[label]/(k*p_dict[label])
score -= near_hit_term/k
score = score+near_miss_term[label]/(k*p_dict[label])
score = score-near_hit_term/k
return score


Expand All @@ -110,9 +110,3 @@ def feature_ranking(score):
"""
idx = np.argsort(score, 0)
return idx[::-1]






62 changes: 42 additions & 20 deletions skfeature/function/wrapper/decision_tree_backward.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score


def decision_tree_backward(X, y, n_selected_features):
def decision_tree_backward(X, y,metric):
"""
This function implements the backward feature selection algorithm based on decision tree

Expand All @@ -14,8 +16,8 @@ def decision_tree_backward(X, y, n_selected_features):
input data
y: {numpy array}, shape (n_samples,)
input class labels
n_selected_features : {int}
number of selected features

metric: metric to be used while performing backward selection

Output
------
Expand All @@ -24,36 +26,56 @@ def decision_tree_backward(X, y, n_selected_features):
"""

n_samples, n_features = X.shape
# using 10 fold cross validation
cv = KFold(n_samples, n_folds=10, shuffle=True)
# using 5 fold stratified cross validation
cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=1)
# choose decision tree as the classifier
clf = DecisionTreeClassifier()

# selected feature set, initialized to contain all features
F = range(n_features)
F = list(range(n_features))
count = n_features
acc = 0
#Finding the f1-score/error of the initial set of features
for train, test in cv.split(X,y):
clf.fit(X[train], y[train])
y_predict = clf.predict(X[test])
if metric == "log-loss":
acc_tmp = log_loss(y[test], y_predict)
else:
acc_tmp = f1_score(y[test],y_predict,average="micro")
acc += acc_tmp
max_acc = float(acc)/5
#This loop will keep on iterating till we find a set of features beyond which the f1-score/error does not improve
while True:

while count > n_selected_features:
max_acc = 0
idx = -1
for i in range(n_features):
if i in F:
F.remove(i)
X_tmp = X[:, F]
acc = 0
for train, test in cv:
#Finding the f1-score/error after removing a particular feature
for train, test in cv.split(X_tmp,y):
clf.fit(X_tmp[train], y[train])
y_predict = clf.predict(X_tmp[test])
acc_tmp = accuracy_score(y[test], y_predict)
if metric == "log-loss":
acc_tmp = log_loss(y[test], y_predict)
else:
acc_tmp = f1_score(y[test],y_predict,average="micro")
acc += acc_tmp
acc = float(acc)/10
acc = float(acc)/5
F.append(i)
# record the feature which results in the largest accuracy
if acc > max_acc:
# record the feature, removing which results in the largest f1-score or the smallest error
if metric == "log-loss" and acc < max_acc:
max_acc = acc
idx = i
# delete the feature which results in the largest accuracy
F.remove(idx)
count -= 1
return np.array(F)


elif metric == "f1-score" and acc>max_acc:
max_acc = acc
idx = i
# delete the feature, removing which results in the largest f1-score or the smallest error
if idx!=-1:
F.remove(idx)
count -= 1
else:
break
return (np.array(F),max_acc)
58 changes: 38 additions & 20 deletions skfeature/function/wrapper/decision_tree_forward.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss


def decision_tree_forward(X, y, n_selected_features):
def decision_tree_forward(X, y, metric):
"""
This function implements the forward feature selection algorithm based on decision tree

Expand All @@ -14,44 +15,61 @@ def decision_tree_forward(X, y, n_selected_features):
input data
y: {numpy array}, shape (n_samples, )
input class labels
n_selected_features: {int}
number of selected features

metric: metric to be used while performing forward selection
Output
------
F: {numpy array}, shape (n_features,)
index of selected features
"""

n_samples, n_features = X.shape
# using 10 fold cross validation
cv = KFold(n_samples, n_folds=10, shuffle=True)
# using 5 fold stratified cross validation
cv = StratifiedKFold(n_splits=5, shuffle=True)
# choose decision tree as the classifier
clf = DecisionTreeClassifier()

# selected feature set, initialized to be empty
F = []
count = 0
while count < n_selected_features:
max_acc = 0
max_acc = 0
max_error = 100
#This loop will keep on iterating till we find a set of features beyond which the f1-score/error does not improve
while True:
idx = -1
for i in range(n_features):
if i not in F:
F.append(i)
X_tmp = X[:, F]
acc = 0
for train, test in cv:
error = 0
#Finding the f1-score/error after adding a particular feature
for train, test in cv.split(X_tmp,y):
clf.fit(X_tmp[train], y[train])
y_predict = clf.predict(X_tmp[test])
acc_tmp = accuracy_score(y[test], y_predict)
acc += acc_tmp
acc = float(acc)/10
if metric == "log-loss":
acc_tmp = log_loss(y[test], y_predict)
error += acc_tmp
else:
acc_tmp = f1_score(y[test], y_predict,average="micro")
acc += acc_tmp
acc = float(acc)/5
error = float(error)/5
F.pop()
# record the feature which results in the largest accuracy
if acc > max_acc:
# record the feature adding which results in the largest f1-score or the lowest error
if metric == "log-loss" and error<max_error:
max_error = error
idx = i
elif metric == "f1-score" and acc > max_acc:
max_acc = acc
idx = i
# add the feature which results in the largest accuracy
F.append(idx)
count += 1
return np.array(F)

# add the feature adding which results in the largest f1-score or the lowest error
if idx!=-1:
F.append(idx)
count += 1
else:
break
if metric =="log-loss":
return (np.array(F),max_error)
else:
return (np.array(F),max_acc)
8 changes: 3 additions & 5 deletions skfeature/function/wrapper/svm_backward.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def svm_backward(X, y, n_selected_features):

n_samples, n_features = X.shape
# using 10 fold cross validation
cv = KFold(n_samples, n_folds=10, shuffle=True)
cv = KFold(n_splits=10, shuffle=True)
# choose SVM as the classifier
clf = SVC()

# selected feature set, initialized to contain all features
F = range(n_features)
F = list(range(n_features))
count = n_features

while count > n_selected_features:
Expand All @@ -40,7 +40,7 @@ def svm_backward(X, y, n_selected_features):
F.remove(i)
X_tmp = X[:, F]
acc = 0
for train, test in cv:
for train, test in cv.split(X_tmp):
clf.fit(X_tmp[train], y[train])
y_predict = clf.predict(X_tmp[test])
acc_tmp = accuracy_score(y[test], y_predict)
Expand All @@ -55,5 +55,3 @@ def svm_backward(X, y, n_selected_features):
F.remove(idx)
count -= 1
return np.array(F)


6 changes: 3 additions & 3 deletions skfeature/function/wrapper/svm_forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def svm_forward(X, y, n_selected_features):

n_samples, n_features = X.shape
# using 10 fold cross validation
cv = KFold(n_samples, n_folds=10, shuffle=True)
cv = KFold(n_splits=10, shuffle=True)
# choose SVM as the classifier
clf = SVC()

Expand All @@ -39,7 +39,7 @@ def svm_forward(X, y, n_selected_features):
F.append(i)
X_tmp = X[:, F]
acc = 0
for train, test in cv:
for train, test in cv.split(X_tmp):
clf.fit(X_tmp[train], y[train])
y_predict = clf.predict(X_tmp[test])
acc_tmp = accuracy_score(y[test], y_predict)
Expand All @@ -53,4 +53,4 @@ def svm_forward(X, y, n_selected_features):
# add the feature which results in the largest accuracy
F.append(idx)
count += 1
return np.array(F)
return np.array(F)