"""
In this script we show how to test and plot RL results.
"""
import sys

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# global font settings for the plots
font = {'family': 'normal',
        'size': 70}
matplotlib.rc('font', **font)

sys.path.append('../')
from rl_stats.tests import run_test, compute_central_tendency_and_error
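
# Input format note (illustrative addition, not from the original script): each data
# file is expected to be a whitespace-separated text matrix readable by np.loadtxt,
# with one row per evaluation step and one column per random seed, i.e. shape
# (n_steps, n_seeds). The numbers below are made-up examples, e.g. 3 steps, 2 seeds:
#   0.1  0.3
#   0.5  0.4
#   0.9  1.1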

def run_test_and_plot(data1,                    # array of performance of dimension (n_steps, n_seeds) for alg 1
                      data2,                    # array of performance of dimension (n_steps, n_seeds) for alg 2
                      point_every=1,            # evaluation frequency, one datapoint every X steps/episodes
                      test_id="welch",          # choose among ['t_test', 'welch', 'mann_whitney', 'ranked_t_test',
                                                # 'bootstrap', 'permutation']. Welch recommended, see paper.
                      confidence_level=0.01,    # confidence level alpha of the test
                      id_central='median',      # id of the central tendency ('mean' or 'median')
                      id_error=80,              # id of the error areas ('std', 'sem', or percentiles in ]0, 100])
                      legends='alg 1/alg 2',    # labels of the two input vectors, 'legend1/legend2'
                      xlabel='training steps',  # label of the x axis
                      save=True,                # save in ./plot.png if True
                      downsampling_fact=5       # downsampling factor on the x-axis for visualization (increase for smoother plots)
                      ):
    # if paths are given, load the files as numpy arrays
    if isinstance(data1, str):
        data1 = np.loadtxt(data1)
    if isinstance(data2, str):
        data2 = np.loadtxt(data2)
    assert data1.ndim == 2, "data1 should be an array of dimension 2 (n_steps, n_seeds)"
    assert data2.ndim == 2, "data2 should be an array of dimension 2 (n_steps, n_seeds)"

    # x-axis values: one point every point_every steps/episodes, covering the longest run
    nb_steps = max(data1.shape[0], data2.shape[0])
    steps = np.arange(nb_steps) * point_every
    sample_size1 = data1.shape[1]
    sample_size2 = data2.shape[1]

    # downsample the x-axis for visualization purposes
    sub_steps = np.arange(0, nb_steps, downsampling_fact)
    steps = steps[sub_steps]
    nb_steps = steps.size

    # handle arrays of different lengths by padding the shorter one with nans
    sample1 = np.full([nb_steps, sample_size1], np.nan)
    sample2 = np.full([nb_steps, sample_size2], np.nan)
    sub_steps1 = sub_steps[sub_steps < data1.shape[0]]
    sub_steps2 = sub_steps[sub_steps < data2.shape[0]]
    sample1[:sub_steps1.size, :] = data1[sub_steps1, :]
    sample2[:sub_steps2.size, :] = data2[sub_steps2, :]

    # run the statistical test at every step
    sign_diff = np.zeros(len(steps))
    for i in range(len(steps)):
        sign_diff[i] = run_test(test_id, sample1[i, :], sample2[i, :], alpha=confidence_level)
    central1, low1, high1 = compute_central_tendency_and_error(id_central, id_error, sample1)
    central2, low2, high2 = compute_central_tendency_and_error(id_central, id_error, sample2)

    # plot central tendencies and error areas
    fig, ax = plt.subplots(1, 1, figsize=(20, 10))
    lab1 = plt.xlabel(xlabel)
    lab2 = plt.ylabel('performance')
    plt.plot(steps, central1, linewidth=10)
    plt.plot(steps, central2, linewidth=10)
    plt.fill_between(steps, low1, high1, alpha=0.3)
    plt.fill_between(steps, low2, high2, alpha=0.3)
    legend_labels = legends.split('/')
    leg = ax.legend((legend_labels[0], legend_labels[1]), frameon=False)

    # mark steps where the difference is statistically significant with black dots
    idx = np.argwhere(sign_diff == 1).flatten()
    y = max(np.nanmax(high1), np.nanmax(high2))
    plt.scatter(steps[idx], y * 1.05 * np.ones(idx.size), s=100, c='k', marker='o')

    # style
    for line in leg.get_lines():
        line.set_linewidth(10.0)
    ax.spines['top'].set_linewidth(5)
    ax.spines['right'].set_linewidth(5)
    ax.spines['bottom'].set_linewidth(5)
    ax.spines['left'].set_linewidth(5)

    if save:
        plt.savefig('./plot.png', bbox_extra_artists=(leg, lab1, lab2), bbox_inches='tight', dpi=100)
    plt.show()
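
# The sketch below is an illustrative addition, not part of the original script: it
# shows how run_test_and_plot could be called directly, assuming you already have two
# (n_steps, n_seeds) performance arrays in memory. The helper name and the synthetic
# data are made up for the example; it is not called anywhere.
def _example_with_synthetic_data():
    rng = np.random.default_rng(0)
    # two fake learning curves: 100 evaluation points, 10 seeds each
    fake_perfs1 = np.cumsum(rng.normal(1.0, 1.0, size=(100, 10)), axis=0)
    fake_perfs2 = np.cumsum(rng.normal(0.8, 1.0, size=(100, 10)), axis=0)
    run_test_and_plot(fake_perfs1, fake_perfs2, point_every=1,
                      legends='fake alg 1/fake alg 2', save=False)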

if __name__ == '__main__':
    import argparse

    # default data: sub-sample 20 random seeds from the SAC and TD3 performance logs
    data1 = np.loadtxt('./data/sac_hc_all_perfs.txt')
    data2 = np.loadtxt('./data/td3_hc_all_perfs.txt')
    sample_size = 20
    data1 = data1[:, np.random.randint(0, data1.shape[1], sample_size)]
    data2 = data2[:, np.random.randint(0, data2.shape[1], sample_size)]

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add = parser.add_argument
    add('--data1', type=str, default=data1, help='path to a text file containing an array of performance of dimension '
                                                 '(n_steps, n_seeds) for alg 1. Can also receive the array directly.')
    add('--data2', type=str, default=data2, help='path to a text file containing an array of performance of dimension '
                                                 '(n_steps, n_seeds) for alg 2. Can also receive the array directly.')
    add('--point_every', type=int, default=1, help='evaluation frequency, one datapoint every X steps/episodes')
    add('--test_id', type=str, default="Welch t-test", help='choose in [t-test, Welch t-test, Mann-Whitney, Ranked t-test, '
                                                            'bootstrap, permutation], Welch recommended (see paper)')
    add('--confidence_level', type=float, default=0.01, help='confidence level alpha of the test')
    add('--id_central', type=str, default='median', help="id of the central tendency ('mean' or 'median')")
    add('--id_error', default=80, help="id of the error areas ('std', 'sem', or percentiles in ]0, 100])")
    add('--legends', type=str, default='SAC/TD3', help='labels of the two input vectors "legend1/legend2"')
    add('--xlabel', type=str, default='training episodes', help='label of the x axis, usually episodes or steps')
    # note: argparse's type=bool treats any non-empty string as True, so parse the flag explicitly
    add('--save', type=lambda x: str(x).lower() in ('true', '1', 'yes'), default=True, help='save in ./plot.png if True')
    add('--downsampling_fact', type=int, default=5, help='factor of downsampling on the x-axis for visualization purposes '
                                                         '(increase for smoother plots)')

    kwargs = vars(parser.parse_args())
    run_test_and_plot(**kwargs)
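
# Example invocation (using the default data files referenced above; adjust the paths
# to your own (n_steps, n_seeds) performance logs as needed):
#   python run_tests.py --data1 ./data/sac_hc_all_perfs.txt --data2 ./data/td3_hc_all_perfs.txt --legends SAC/TD3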