-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
311 lines (272 loc) · 13.6 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
import numpy as np
from prompt import AQPPrompt
import logging
logger = logging.getLogger(__name__)
def get_join_group_sql(dataset):
sql=""
if dataset=='tpch':
sql='''
select c_nationkey,avg(c_acctbal),avg(s_acctbal),sum(c_acctbal),sum(s_acctbal)
from tpch_35g.customer join tpch_35g.supplier on c_nationkey=s_nationkey
group by c_nationkey order by c_nationkey
'''
elif dataset=='tpch-5':
sql='''
select n_name,avg(c_acctbal),sum(c_acctbal)
from tpch_35g.customer join tpch_35g.nation on c_nationkey=n_nationkey
group by n_name order by n_name
'''
elif dataset=='tpcds':
sql='''
select ss_promo_sk,avg(ss_wholesale_cost),avg(ss_list_price),avg(ws_wholesale_cost),avg(ws_list_price),
sum(ss_wholesale_cost),sum(ss_list_price),sum(ws_wholesale_cost),sum(ws_list_price)
from tpcds_0_6667g.store_sales join tpcds_0_6667g.web_sales on ss_promo_sk=ws_promo_sk
group by ss_promo_sk order by ss_promo_sk
'''
elif dataset=='tpcds-2':
sql='''
select s_store_name,avg(ss_wholesale_cost),avg(ss_list_price),avg(ss_sales_price),avg(ss_ext_sales_price), sum(ss_wholesale_cost),sum(ss_list_price), sum(ss_sales_price),sum(ss_ext_sales_price)
from tpcds_0_6667g.store_sales join tpcds_0_6667g.store on ss_store_sk=s_store_sk
group by s_store_name order by s_store_name
'''
elif dataset=='movielen':
sql='''
select r_movieid,avg(rating),sum(rating)
from movielen_1m.ratings join movielen_1m.genome_scores on r_movieid=m_movieid
group by r_movieid order by r_movieid
'''
elif dataset=='genome':
sql='''
select r_movieid,avg(rating),sum(rating)
from movielen_25m.ratings join movielen_25m.genome_scores on r_movieid=g_movieid
group by r_movieid order by r_movieid
'''
elif dataset=='sdr':
sql='''
select prot_category,avg(l4_ul_throughput),avg(l4_dw_throughput),
avg(l4_ul_packets),avg(l4_dw_packets),
sum(l4_ul_throughput),sum(l4_dw_throughput),
sum(l4_ul_packets),sum(l4_dw_packets)
from sdr_5m.sdr_flow_with_outlier
join sdr_5m.dim_sub_prot
on prot_category=protocol_id
group by prot_category
order by prot_category
'''
elif dataset=='flights':
sql='''
select a_unique_carrier, sum(a_taxi_out), avg(a_taxi_out), sum(a_air_time), avg(a_air_time), sum(a_distance), avg(a_distance)
from flights_300k.flight_a join flights_300k.flight_b on a_unique_carrier = b_unique_carrier
group by a_unique_carrier order by a_unique_carrier
'''
elif dataset=='flights-2':
sql='''
select a_unique_carrier, sum(a_taxi_out), avg(a_taxi_out), sum(a_air_time), avg(a_air_time), sum(a_distance), avg(a_distance)
from flights_300k.flight_a join flights_300k.flight_b on a_origin_state_abr = b_origin_state_abr
group by a_unique_carrier order by a_unique_carrier
'''
elif dataset=='census':
sql='''
select a_education_num, sum(a_age), avg(a_age), sum(a_hours_per_week), avg(a_hours_per_week), sum(a_fnlwgt), avg(a_fnlwgt)
from census_150k.adult_a join census_150k.adult_b on a_education_num = b_education_num
group by a_education_num order by a_education_num
'''
elif dataset=='census-2':
sql='''
select a_relationship, sum(a_age), avg(a_age), sum(a_hours_per_week), avg(a_hours_per_week), sum(a_fnlwgt), avg(a_fnlwgt)
from census_150k.adult_a join census_150k.adult_b on a_education_num = b_education_num
group by a_relationship order by a_relationship
'''
elif dataset=='imdb':
sql='''
select a_tconst,avg(a_average_rating),sum(a_average_rating),avg(a_num_votes),sum(a_num_votes)
from imdb_100000k.ratings a join imdb_100000k.ratings b on a_tconst = b_tconst
group by a_tconst order by a_tconst
'''
if dataset=='test':
sql='''
select c_nationkey,avg(c_acctbal),avg(s_acctbal),
sum(c_acctbal),sum(s_acctbal)
from test.customer_test join tpch_1m.supplier on c_nationkey=s_nationkey
group by c_nationkey order by c_nationkey
'''
return sql
def get_group_sql(dataset):
sql=""
if dataset=='tpch':
sql='''
select c_nationkey,avg(c_acctbal),sum(c_acctbal)
from tpch_1m.customer
group by c_nationkey order by c_nationkey
'''
elif dataset=='tpcds':
sql='''
select ss_promo_sk,avg(ss_wholesale_cost),avg(ss_list_price),
sum(ss_wholesale_cost),sum(ss_list_price)
from tpcds_1m.store_sales
group by ss_promo_sk order by ss_promo_sk
'''
elif dataset=='movielen' or dataset=='genome':
sql='''
select r_movieid,avg(rating),sum(rating)
from movielen_1m.ratings
group by r_movieid order by r_movieid
'''
elif dataset=='sdr':
sql='''
select prot_category,avg(l4_ul_throughput),avg(l4_dw_throughput),
avg(l4_ul_packets),avg(l4_dw_packets),
sum(l4_ul_throughput),sum(l4_dw_throughput),
sum(l4_ul_packets),sum(l4_dw_packets)
from sdr_1m.sdr_flow_with_outlier
group by prot_category
order by prot_category
'''
return sql
def get_model_join_option(dataset,k,rate):
option=""
if dataset=='tpch':
option=' with k={} model-{}-./config/train/tpch_customer_torch_cvae.json model-0.1-./config/train/tpch_supplier_torch_cvae.json'.format(k,"%.3f" % (rate*10))
elif dataset=='tpch-5':
option=' with k={} model-{}-./config/train/tpch_customer_torch_cvae.json model-0.1-./config/train/tpch_nation_torch_cvae.json'.format(k,"%.3f" % (rate*10))
elif dataset=='tpcds':
option=' with k={} model-{}-./config/train/tpcds_ssales_torch_cvae.json model-0.5-./config/train/tpcds_wsales_torch_cvae.json'.format(k,"%.3f" % (rate*2))
elif dataset=='tpcds-2':
option=' with k={} model-{}-./config/train/tpcds_ssales_torch_cvae.json model-1.0-./config/train/tpcds_store_torch_cvae.json'.format(k,"%.3f" % (rate))
elif dataset=='movielen':
option=' with k={} model-{}-./config/train/movielen_ratings_torch_cvae.json model-1.0-./config/train/movielen_movies_torch_cvae.json'.format(k,"%.3f" % (rate))
elif dataset=='sdr':
option=' with k={} model-{}-./config/train/outlier_sdr_flow_torch_cvae.json model-0.1-./config/train/dim_torch_cvae.json'.format(k,"%.3f" % (rate*10))
elif dataset=='genome':
option=' with k={} model-{}-./config/train/movielen_ratings_torch_cvae.json model-0.5-./config/train/movielen_genome_torch_cvae.json'.format(k,"%.3f" % (rate*2))
elif dataset=='census':
option=' with k={} model-{}-./config/train/census_a_torch_cvae.json model-0.1-./config/train/census_b_torch_cvae.json'.format(k,"%.3f" % (rate*10))
elif dataset=='census-2':
option=' with k={} model-{}-./config/train/census2_a_torch_cvae.json model-0.1-./config/train/census2_b_torch_cvae.json'.format(k,"%.3f" % (rate*10))
elif dataset=='flights':
option=' with k={} model-{}-./config/train/flights_a_torch_cvae.json model-0.1-./config/train/flights_b_torch_cvae.json'.format(k,"%.3f" % (rate*10))
elif dataset=='flights-2':
option=' with k={} model-{}-./config/train/flights2_a_torch_cvae.json model-0.1-./config/train/flights2_b_torch_cvae.json'.format(k,"%.3f" % (rate*10))
return option
def get_model_group_option(dataset,k,rate):
option=""
if dataset=='tpch':
option=' with k={} model-{}-./config/train/tpch_customer_torch_cvae.json'.format(k,"%.3f" % (rate))
elif dataset=='tpcds':
option=' with k={} model-{}-./config/train/tpcds_ssales_torch_cvae.json'.format(k,"%.3f" % (rate))
elif dataset=='movielen':
option=' with k={} model-{}-./config/train/movielen_ratings_torch_cvae.json'.format(k,"%.3f" % (rate))
elif dataset=='sdr':
option=' with k={} model-{}-./config/train/outlier_sdr_flow_torch_cvae.json'.format(k,"%.3f" % (rate))
elif dataset=='genome':
option=' with k={} model-{}-./config/train/movielen_ratings_torch_cvae.json'.format(k,"%.3f" % (rate))
return option
def get_ground_truth_path(dataset):
ground_truth_path=""
if dataset=='tpch':
ground_truth_path = "./ground_truth/tpch-1m/cs_truth.csv"
elif dataset=='tpcds':
ground_truth_path = "./ground_truth/tpcds-1m/sw_truth.csv"
elif dataset=='movielen':
ground_truth_path = "./ground_truth/movielen-1m/rm_truth.csv"
elif dataset=='sdr':
ground_truth_path = "./ground_truth/sdr-1m/sdr_outlier_truth.csv"
elif dataset=='genome':
ground_truth_path = "./ground_truth/movielen-1m/rg_truth.csv"
return ground_truth_path
def get_join_group_ground_truth_path(dataset):
ground_truth_path=""
if dataset=='tpch':
ground_truth_path = "./ground_truth/tpch-35g/cs_truth.csv"
elif dataset=='tpch-5':
ground_truth_path = "./ground_truth/tpch-35g/cn_truth.csv"
elif dataset=='tpcds':
ground_truth_path = "./ground_truth/tpcds-0.6667g/sw_truth.csv"
elif dataset=='tpcds-2':
ground_truth_path = "./ground_truth/tpcds-0.6667g/ss_truth.csv"
elif dataset=='movielen':
ground_truth_path = "./ground_truth/movielen-1m/rm_truth.csv"
elif dataset=='sdr':
ground_truth_path = "./ground_truth/sdr-1m/sdr_outlier_truth.csv"
elif dataset=='genome':
ground_truth_path = "./ground_truth/movielen-1m/rg_truth.csv"
elif dataset=='census':
ground_truth_path = "./ground_truth/census/census_truth.csv"
elif dataset=='census-2':
ground_truth_path = "./ground_truth/census/census2_truth.csv"
elif dataset=='flights':
ground_truth_path = "./ground_truth/flights/flights_truth.csv"
elif dataset=='flights-2':
ground_truth_path = "./ground_truth/flights/flights2_truth.csv"
if dataset=='test':
ground_truth_path = "./ground_truth/tpch-1m/test.csv"
return ground_truth_path
def get_group_ground_truth_path(dataset):
ground_truth_path=""
if dataset=='tpch':
ground_truth_path = "./ground_truth/tpch-1m/csnj_truth.csv"
elif dataset=='tpcds':
ground_truth_path = "./ground_truth/tpcds-1m/swnj_truth.csv"
elif dataset=='movielen':
ground_truth_path = "./ground_truth/movielen-1m/rmnj_truth.csv"
elif dataset=='sdr':
ground_truth_path = "./ground_truth/sdr-1m/sdr_outliernj_truth.csv"
elif dataset=='genome':
ground_truth_path = "./ground_truth/movielen-1m/rgnj_truth.csv"
if dataset=='test':
ground_truth_path = "./ground_truth/tpch-1m/testnj.csv"
return ground_truth_path
if __name__ == "__main__":
prompt = AQPPrompt()
# datasets=['tpch','tpcds','movielen','sdr']
# datasets=['tpch','tpcds','genome','sdr']
# datasets=['tpcds','tpcds-2','sdr']
# datasets=['census','census-2','flights','flights']
datasets=['census-2']
average_running_times={t:[] for t in datasets}
average_query_errors={t:[] for t in datasets}
rounds=3
k=1
res=[]
query_type='join'
for t in datasets:
logger.info("dataset:{}".format(t))
for r in range(1,11):
running_times=[]
query_errors=[]
rate=r*0.001
### aqp option
option=''
if query_type=='join':
option=get_model_join_option(t,k,rate)
sql=get_join_group_sql(t)+option
ground_truth_path=get_join_group_ground_truth_path(t)
elif query_type=='group':
option=get_model_group_option(t,k,rate)
sql=get_group_sql(t)+option
ground_truth_path=get_group_ground_truth_path(t)
logger.info("query:{}".format(sql))
logger.info("ground_truth_path:{}".format(ground_truth_path))
for i in range(rounds):
logger.info("round:{}".format(i+1))
query_error=None
runing_time,query_error=prompt.execute_approximate_query(sql,ground_truth_path=ground_truth_path)
running_times.append(round(runing_time,3))
if query_error is not None:
query_errors.append(round(query_error,6))
average_time=round(np.mean(running_times[:]),3)
average_running_times[t].append(average_time)
res.append("dataset:{} rate:{} option:{} running times:{}".format(t,rate,option,running_times))
res.append("dataset:{} rate:{} option:{} average running time:{}".format(t,rate,option,average_time))
if len(query_errors)>0:
average_error=round(np.mean(query_errors),6)
average_query_errors[t].append(average_error)
res.append("dataset:{} rate:{} option:{} query errors:{}".format(t,rate,option,query_errors))
res.append("dataset:{} rate:{} option:{} average query error:{}".format(t,rate,option,average_error))
res.append("------------------------")
res.append("#####################################################")
# logger.info("---------final result---------")
# for t in res:
# logger.info(t)
for t in datasets:
logger.info("dataset:{}\naverage running times:{}\naverage query errors:{}".format(t,average_running_times[t],average_query_errors[t]))