-
Notifications
You must be signed in to change notification settings - Fork 0
/
hyperparameter_tuning.py
128 lines (102 loc) · 5.79 KB
/
hyperparameter_tuning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import cvae_general
import pandas as pd
from tensorflow.keras.optimizers import Adam
import itertools
import numpy as np
import helper
import random
import os
import argparse
def load_frame(path, csv_index_col, default_index_col):
    """Load a DataFrame from .pickle, .csv / .csv.gz, or tab-delimited text.

    Parameters:
        path: file path; the extension selects the reader.
        csv_index_col: index_col passed for comma-separated input.
        default_index_col: index_col passed for tab-separated input.

    Returns the loaded pandas DataFrame.
    """
    ext = os.path.splitext(path)[1]
    if ext == '.pickle':
        return pd.read_pickle(path)
    # endswith() is safe for dot-less paths; the previous
    # path.split('.', 1)[1] raised IndexError when no '.' was present.
    if ext == '.csv' or path.endswith('.csv.gz'):
        return pd.read_table(path, sep=',', index_col=csv_index_col)
    return pd.read_table(path, index_col=default_index_col)


if __name__ == '__main__':
    # Hyperparameter grid: layer-width exponent n, layout shape index,
    # activation, latent dimension, learning rate, Adam epsilon.
    ns = [8, 9, 10, 11]
    activation_functions = ['relu', 'sigmoid', 'tanh']
    latent_space_dimensions = [2, 4, 8]
    learning_rates = [0.001, 0.01]
    epsilons = [1e-7, 1e-5, 1e-3, 1e-1]
    layout_index = [0, 1, 2, 3, 4]
    lists = [ns, layout_index, activation_functions, latent_space_dimensions, learning_rates, epsilons]
    lists = list(itertools.product(*lists))
    # Prune combinations that are known not to train well
    # (presumably from earlier runs — the filters are empirical).
    lists = [x for x in lists if not (x[2] == 'relu' and x[4] == 0.01)]
    lists = [x for x in lists if not (x[2] == 'sigmoid' and (x[0] in [10, 11] or x[1] in [3, 4]))]
    lists = [x for x in lists if not (x[0] == 11 and x[1] == 4)]
    print('Loading arguments')
    parser = argparse.ArgumentParser(prog="Hyperparameter Tuning Script", description="Trains a CVAE model based on a hyperparameter combination and saves the performance to a file")
    parser.add_argument('combo_averages', help="Path to .pickle, .csv, or .tsv for observed combination mean samples", type=str)
    parser.add_argument('training_data', help="Path to .pickle, .csv, or .tsv for individual training samples", type=str)
    parser.add_argument('t_start', help="Position of first one-hot-encoded tissue in the training data", type=int)
    parser.add_argument('t_end', help="Position of last one-hot-encoded tissue in the training data", type=int)
    parser.add_argument('s_start', help="Position of first one-hot-encoded species in the training data", type=int)
    parser.add_argument('s_end', help="Position of last one-hot-encoded species in the training data", type=int)
    parser.add_argument('d_start', help="Position of first probe in the training data", type=int)
    parser.add_argument('index', help="Index of the hyperparameter combination", type=int)
    parser.add_argument('output_dir', help="Path to output where hyperparameter combination performances will be scored", type=str)
    parser.add_argument('--val_seed', help="Random seed for selecting the validation dataset", default=-1, type=int)
    args = parser.parse_args()
    # The combo-average table is indexed by (tissue, species) when CSV.
    combo_averages = load_frame(args.combo_averages, [0, 1], 0)
    print('Observed combination mean sample dimensions: ' + str(combo_averages.shape))
    training = load_frame(args.training_data, 0, 0)
    training = training.dropna(axis=1)  # drop columns with missing values
    print('Training data dimensions: ' + str(training.shape))
    index = args.index - 1  # CLI index is 1-based (e.g. a job-array task id)
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)
    result_name = 'best' + str(index + 1) + '.txt'
    # Idempotency: skip work if this combination's result already exists.
    if result_name in os.listdir(args.output_dir):
        print('Hyperparameter combination already trained')
        raise SystemExit  # exit() depends on the site module; SystemExit does not
    tissue_index = training.columns.values[args.t_start:args.t_end + 1]
    species_index = training.columns.values[args.s_start:args.s_end + 1]
    train_data, val_data = helper.get_training_val_datasets(training, tissue_index, args.t_start, args.t_end, species_index, args.s_start, args.s_end, args.val_seed)
    # Columns before d_start are the one-hot condition labels (y),
    # columns from d_start on are the probe measurements (X).
    Xtrain = train_data[train_data.columns[args.d_start:]]
    ytrain = train_data[train_data.columns[:args.d_start]]
    Xval = val_data[val_data.columns[args.d_start:]]
    yval = val_data[val_data.columns[:args.d_start]]
    print('Training and validation data dimensions')
    print(Xtrain.shape)
    print(ytrain.shape)
    print(Xval.shape)
    print(yval.shape)
    n = lists[index][0]
    # Candidate encoder layouts as (depth, units-per-layer); widths are
    # powers of two derived from n, optionally tapering toward the output.
    layouts = [(1, [2**n]), (2, [2**n, 2**n]), (3, [2**n, 2**n, 2**n]), (2, [2**(n+1), 2**n]), (3, [2**(n+2), 2**(n+1), 2**n])]
    layout = layouts[lists[index][1]]
    activation_function = lists[index][2]
    latent_space = lists[index][3]
    learning_rate = lists[index][4]
    epsilon = lists[index][5]
    print(layout)
    print(activation_function)
    print(latent_space)
    print(learning_rate)
    print(epsilon)
    max_performance = -1
    max_model = None
    rand_seed = random.randint(1, 10000)
    cvae, encoder, decoder = cvae_general.define_cvae(Xtrain, ytrain, latent_space, layout[0], layout[1], activation_function, rand_seed)
    trained_cvae = cvae_general.train_cvae(cvae, Xtrain, ytrain, Xval, yval, 32, 50, Adam(learning_rate=learning_rate, epsilon=epsilon), 5)
    # Only score the model if training converged (final loss is not NaN).
    if not np.isnan(trained_cvae.history['loss'][-1]):
        # NOTE(review): this passes t_end+1/s_end+1 while
        # get_training_val_datasets above gets t_end/s_end — looks
        # intentional (inclusive vs. exclusive bounds) but confirm in helper.
        predictions = helper.predict_group_mean_normal(val_data, tissue_index, args.t_start, args.t_end + 1, species_index, args.s_start, args.s_end + 1, args.d_start, decoder, latent_space)
        result = helper.combo_mean_samplewise_performance_pearson(predictions, combo_averages)
        print(result)
        if result > max_performance:
            max_performance = result
            max_model = [layout, activation_function, latent_space, learning_rate, epsilon, rand_seed]
    print(index + 1)
    print(max_model)
    print(max_performance)
    # Write the score (and, if training converged, the hyperparameters);
    # 'with' guarantees the file is closed even if a write fails.
    with open(os.path.join(args.output_dir, result_name), 'w') as output_file:
        output_file.write(str(max_performance) + '\n')
        if max_model is not None:
            output_file.write(str(max_model[0][0]) + '\n')
            output_file.write(str(max_model[0][1]) + '\n')
            for i in range(1, 6):
                output_file.write(str(max_model[i]) + '\n')