experiment.py (forked from chu-data-lab/CleanML)
"""Run experiments"""
import datetime
import logging
import numpy as np
import time
from clean import clean
import config
from init import init
from preprocess import preprocess
from train import train_and_evaluate
import utils


def one_search_experiment(dataset, error_type, train_file, model, seed, n_jobs=1, hyperparams=None, skip_test_files=[]):
    """One experiment on the dataset given an error type, a train file, a model and a random-search seed.

    Args:
        dataset (dict): dataset dict in config.py
        error_type (string): error type
        train_file (string): filename of training set (dirty or clean)
        model (dict): ml model dict in model.py
        seed (int): seed for this experiment
        n_jobs (int): number of parallel jobs used for training
        hyperparams (dict): previously found best hyperparameters to reuse; None to run the search
        skip_test_files (list): test files to exclude from evaluation
    """
    np.random.seed(seed)
    # generate random seeds for down sample and training
    down_sample_seed, train_seed = np.random.randint(1000, size=2)
    # load and preprocess data
    X_train, y_train, X_test_list, y_test_list, test_group_memberships, test_files = \
        preprocess(dataset, error_type, train_file, normalize=True, down_sample_seed=down_sample_seed)
    test_files = list(set(test_files).difference(set(skip_test_files)))
    # train and evaluate
    result = train_and_evaluate(X_train, y_train, X_test_list, y_test_list, test_group_memberships,
                                test_files, model, n_jobs=n_jobs, seed=train_seed, hyperparams=hyperparams)
    return result
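

# A minimal usage sketch (not part of the original pipeline): how one_search_experiment
# might be invoked directly for a single configuration. The concrete argument values
# ("missing_values", "dirty_train") and the existence of config.datasets are assumptions
# for illustration only; in this module the function is normally driven by
# one_split_experiment below.
#
#   dataset = config.datasets[0]      # hypothetical dataset dict from config.py
#   model = config.models[0]          # model dict, as iterated over in one_split_experiment
#   res = one_search_experiment(dataset, "missing_values", "dirty_train", model, seed=42)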


def one_split_experiment(dataset, n_retrain=5, seed=1, n_jobs=1, nosave=True, error_type=None):
    """Run experiments on one dataset for one split.

    Args:
        dataset (dict): dataset dict in config.py
        n_retrain (int): number of repeated experiments per configuration
        seed (int): experiment seed
        n_jobs (int): number of parallel jobs used for training
        nosave (bool): if True, do not save results
        error_type (string): if given, only run experiments for this error type
    """
    # generate seeds for n experiments
    np.random.seed(seed)
    seeds = np.random.randint(10000, size=n_retrain)
    # load result dict
    result = utils.load_result(dataset['data_dir'])
    result2019 = utils.load_result2019(dataset['data_dir'])
    # run experiments
    for error in dataset["error_types"]:
        if error_type is not None and error != error_type:
            continue
        for train_file in utils.get_train_files(error):
            for model in config.models:
                for seed in seeds:
                    version = utils.get_version(utils.get_dir(dataset, error, train_file))
                    key = "/".join((dataset['data_dir'], 'v' + str(version), error, train_file, model['name'], str(seed)))
                    if key in result.keys():
                        print("Ignore experiment {} that has been completed before.".format(key))
                        continue
                    if key in result2019.keys():
                        hyperparams = result2019[key]["best_params"]
                        # strip the "_test_acc" suffix to recover the test file names
                        # (str.rstrip removes a set of characters, not a suffix, so slicing is used instead)
                        skip_test_files = [k[:-len("_test_acc")] for k in result2019[key].keys() if k.endswith("_test_acc")]
                    else:
                        hyperparams = None
                        skip_test_files = []
print("{} Processing {}".format(datetime.datetime.now(), key))
res = one_search_experiment(dataset, error, train_file, model, seed, n_jobs=n_jobs, hyperparams=hyperparams, skip_test_files=skip_test_files)
if key in result2019.keys():
res = {**result2019[key], **res}
if not nosave:
utils.save_result(dataset['data_dir'], key, res)
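

# Result keys built above have the form
#   "<data_dir>/v<version>/<error_type>/<train_file>/<model_name>/<seed>"
# e.g. (values hypothetical) "Titanic/v1/missing_values/dirty_train/logistic_regression/4217".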


def experiment(datasets, log=False, n_jobs=1, nosave=False, error_type=None, arg_seeds=None):
    """Run experiments on all datasets for all splits."""
    # set logger for experiments
    if log:
        logging.captureWarnings(False)
        logging.basicConfig(filename='logging_{}.log'.format(datetime.datetime.now()), level=logging.DEBUG)
    # set seeds for experiments
    np.random.seed(config.root_seed)
    split_seeds = np.random.randint(10000, size=config.n_resplit)
    experiment_seed = np.random.randint(10000)
    # run experiments
    for dataset in datasets:
        if log:
            logging.debug("{}: Experiment on {}".format(datetime.datetime.now(), dataset['data_dir']))
        for i, seed in enumerate(split_seeds):
            if arg_seeds is not None and i not in arg_seeds:
                continue
            if utils.check_completed(dataset, seed, experiment_seed):
                print("Ignore {}-th experiment on {} that has been completed before.".format(i, dataset['data_dir']))
                continue
            tic = time.time()
            init(dataset, seed=seed, max_size=config.max_size)
            clean(dataset, error_type)
            one_split_experiment(dataset, n_retrain=config.n_retrain, n_jobs=n_jobs, nosave=nosave,
                                 seed=experiment_seed, error_type=error_type)
            toc = time.time()
            t = (toc - tic) / 60
            remaining = t * (len(split_seeds) - i - 1)
            if log:
                logging.debug("{}: {}-th experiment takes {} min. Estimated remaining time: {} min".format(
                    datetime.datetime.now(), i, t, remaining))
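

# A minimal sketch of how this module might be driven, assuming config.py exposes a list of
# dataset dicts (the name config.datasets is an assumption; the original repository may drive
# experiment() from a separate entry script instead).
if __name__ == "__main__":
    # run every dataset across all error types, with logging enabled and results saved
    experiment(config.datasets, log=True, n_jobs=1, nosave=False)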