-
Notifications
You must be signed in to change notification settings - Fork 5
/
sim_genex_1.py
59 lines (50 loc) · 2.4 KB
/
sim_genex_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split, cross_val_score
# from tpot.config.classifier_nn import classifier_config_nn
from sklearn.pipeline import make_pipeline
# from tpot.config import classifier_config_dict_light
from tpot.config import classifier_config_dict
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os
import glob
# Search-space configuration handed to TPOT.
# Alternative base configuration (needs the commented-out import above):
# personal_config = dict(classifier_config_dict_light)
#
# A shallow copy is taken so that adding the extra operator below does not
# mutate the dictionary object owned by the tpot.config module itself.
personal_config = dict(classifier_config_dict)

# Register the DatasetSelector operator with its parameter grid:
# 'subset_list' points at the subset definition file and 'sel_subset'
# offers the candidate subset indices 0..18.
personal_config['tpot.builtins.DatasetSelector'] = {
    'subset_list': ['subsets.csv'],
    'sel_subset': range(19)
}
# Run-wide settings and the result accumulator.
accuracy_ls = []  # one [training CV score, testing score] row is appended per seed
n_gen = 100  # passed to TPOT as `generations`
n_pop = 100  # passed to TPOT as `population_size`
dat_name = 'simulatedGenex'  # prefix of the exported pipeline file names

# Load the data set and separate the predictors from the 'class' label column.
tpot_data = pd.read_csv('simulatedGenex.csv')
Xdata = tpot_data.loc[:, tpot_data.columns != 'class']
Xdata = Xdata.drop(Xdata.columns[0], axis=1)  # simvar1 is in feature list subset S5; it's okay to drop
Ydata = tpot_data['class']

# Gather every feature name listed in any subset; each 'Features' cell is
# treated as a ';'-separated list of names.
subset_df = pd.read_csv('subsets.csv')
all_features = ";".join(subset_df['Features'].tolist())
uniq_features = set(all_features.split(';'))  # unique features in all subsets

# Keep only the predictors that appear in at least one subset. The filter walks
# the data frame's own columns so the resulting column order is the same on
# every run; iterating a set of strings instead would give an order that
# depends on Python's per-process hash randomization.
overlap_features = [col for col in Xdata.columns if col in uniq_features]
X_subset = Xdata[overlap_features]

# Single fixed 75/25 split shared by every seed of the search.
X_train, X_test, y_train, y_test = train_test_split(
    X_subset, Ydata, random_state=1618, train_size=0.75, test_size=0.25)

# The full tables are not used again; drop the references before the TPOT runs.
del X_subset
del Xdata
del Ydata
del tpot_data
# Create the output folders up front so that a missing directory cannot abort
# the script after a TPOT search has already completed.
os.makedirs('pipelines', exist_ok=True)
os.makedirs('accuracies', exist_ok=True)

# Run one TPOT search per random seed, always on the same train/test split.
# (A per-seed re-split was previously tried here and is intentionally disabled.)
for seed in range(100):
    tpot = TPOTClassifier(
        generations=n_gen,
        config_dict=personal_config,
        population_size=n_pop,
        verbosity=2,
        random_state=seed,
        early_stop=10,
        # Alternative template: 'DatasetSelector-CombineDFs-Transformer-Classifier'
        template='DatasetSelector-Transformer-Classifier')
    tpot.fit(X_train, y_train)

    # Score the held-out data once and reuse the value for both the console
    # output and the results table instead of scoring twice.
    test_score = tpot.score(X_test, y_test)
    print(test_score)

    # Export the best pipeline found for this seed as a standalone script.
    tpot.export('pipelines/' + dat_name + str(seed) + '.py')
    accuracy_ls.append([tpot._optimized_pipeline_score, test_score])

# NOTE(review): the original indentation was lost, so it is assumed that the
# summary table is written once after all seeds have finished (the file name
# then carries the last seed value) - confirm against the original script.
accuracy_mat = pd.DataFrame(accuracy_ls, columns=['Training CV Accuracy', 'Testing Accuracy'])
accuracy_mat.to_csv("accuracies/" + str(n_gen) + '_' + str(n_pop) + '_' + str(seed) + ".tsv", sep="\t")