#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# Defaults
export PYTHONPATH=${SCRIPT_DIR}/dlio_benchmark
CONFIG_PATH=${SCRIPT_DIR}/storage-conf
WORKLOAD_PATH=${CONFIG_PATH}/workload
# TODO add DLRM when supported
WORKLOADS=("unet3d" "cosmoflow" "resnet50")
get_config_file() {
local workload=$1; shift
local accelerator_type=$1; shift
echo "${workload}_${accelerator_type}"
}
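# Example for the function above: get_config_file unet3d a100 returns "unet3d_a100",
# which corresponds to the workload config ${WORKLOAD_PATH}/unet3d_a100.yaml.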
CATEGORIES=("closed" "open")
DEFAULT_CATEGORY="closed"
CLOSED_CATEGORY_PARAMS=(
# dataset params
"dataset.num_files_train" "dataset.num_subfolders_train" "dataset.data_folder"
# reader params
"reader.read_threads" "reader.computation_threads" "reader.transfer_size" "reader.prefetch_size"
# checkpoint params
"checkpoint.checkpoint_folder"
# storage params
"storage.storage_type" "storage.storage_root")
OPEN_CATEGORY_PARAMS=(
# all closed params
"${CLOSED_CATEGORY_PARAMS[@]}"
# framework params
"framework"
# dataset params
"dataset.format" "dataset.num_samples_per_file"
# reader params
"reader.data_loader"
)
HYDRA_OUTPUT_CONFIG_DIR="configs"
EXTRA_PARAMS=(
# the benchmark does not rely on any profilers
++workload.workflow.profiling=False
++workload.profiling.profiler=none
# config directory inside results used during runs
++hydra.output_subdir=$HYDRA_OUTPUT_CONFIG_DIR
)
ACCELERATOR_TYPES=("a100" "h100")
STEPS_PER_EPOCH=500
# host memory multiplier for dataset generation
HOST_MEMORY_MULTIPLIER=5
usage() {
echo -e "Usage: $0 [datasize/datagen/run/configview/reportgen] [options]"
echo -e "Script to launch the MLPerf Storage benchmark.\n"
}
datasize_usage() {
echo -e "Usage: $0 datasize [options]"
echo -e "Get minimum dataset size required for the benchmark run.\n"
echo -e "\nOptions:"
echo -e " -h, --help\t\t\tPrint this message"
echo -e " -w, --workload\t\tWorkload dataset to be generated. Possible options are 'unet3d', 'cosmoflow' 'resnet50' "
echo -e " -g, --accelerator-type\tSimulated accelerator type used for the benchmark. Possible options are 'a100' 'h100' "
echo -e " -n, --num-accelerators\tSimulated number of accelerators(of same accelerator type)"
echo -e " -c, --num-client-hosts\tNumber of participating client hosts"
echo -e " -m, --client-host-memory-in-gb\tMemory available in the client where benchmark is run"
}
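# Example invocation of the datasize mode (hypothetical values):
#   ./benchmark.sh datasize -w unet3d -g a100 -n 8 -c 2 -m 128
# estimates the minimum dataset size for 8 simulated A100 accelerators spread
# across 2 client hosts with 128 GB of memory each.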
datagen_usage() {
echo -e "Usage: $0 datagen [options]"
echo -e "Generate benchmark dataset based on the specified options.\n"
echo -e "\nOptions:"
echo -e " -h, --help\t\t\tPrint this message"
echo -e " -s, --hosts\t\t\tComma separated IP addresses of the participating hosts(without space). eg: '192.168.1.1,192.168.2.2'"
echo -e " -c, --category\t\tBenchmark category to be submitted. Possible options are 'closed'(default)"
echo -e " -w, --workload\t\tWorkload dataset to be generated. Possible options are 'unet3d', 'cosmoflow' 'resnet50' "
echo -e " -g, --accelerator-type\tSimulated accelerator type used for the benchmark. Possible options are 'a100' 'h100' "
echo -e " -n, --num-parallel\t\tNumber of parallel jobs used to generate the dataset"
echo -e " -r, --results-dir\t\tLocation to the results directory. Default is ./results/workload.model/DATE-TIME"
echo -e " -p, --param\t\t\tDLIO param when set, will override the config file value"
}
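# Example invocation of the datagen mode (hypothetical values):
#   ./benchmark.sh datagen -s 127.0.0.1 -w unet3d -g a100 -n 8 -p dataset.num_files_train=1200
# generates the unet3d dataset with 8 parallel jobs, overriding the number of
# training files from the config.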
run_usage() {
echo -e "Usage: $0 run [options]"
echo -e "Run benchmark on the generated dataset based on the specified options.\n"
echo -e "\nOptions:"
echo -e " -h, --help\t\t\tPrint this message"
echo -e " -s, --hosts\t\t\tComma separated IP addresses of the participating hosts(without space). eg: '192.168.1.1,192.168.2.2'"
echo -e " -c, --category\t\tBenchmark category to be submitted. Possible options are 'closed'(default)"
echo -e " -w, --workload\t\tWorkload to be run. Possible options are 'unet3d', 'cosmoflow' 'resnet50' "
echo -e " -g, --accelerator-type\tSimulated accelerator type used for the benchmark. Possible options are 'a100' 'h100' "
echo -e " -n, --num-accelerators\tSimulated number of accelerators(of same accelerator type)"
echo -e " -r, --results-dir\t\tLocation to the results directory."
echo -e " -p, --param\t\t\tDLIO param when set, will override the config file value"
}
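# Example invocation of the run mode (hypothetical values):
#   ./benchmark.sh run -s 127.0.0.1 -w unet3d -g a100 -n 8 -r results/unet3d -p dataset.num_files_train=1200
# runs the unet3d workload with 8 simulated A100 accelerators and writes the
# results to results/unet3d.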
configview_usage() {
echo -e "Usage: $0 configview [options]"
echo -e "View the final config based on the specified options.\n"
echo -e "\nOptions:"
echo -e " -h, --help\t\t\tPrint this message"
echo -e " -w, --workload\t\tWorkload to be viewed. Possible options are 'unet3d', 'cosmoflow' 'resnet50' "
echo -e " -g, --accelerator-type\tSimulated accelerator type used for the benchmark. Possible options are 'a100' 'h100' "
echo -e " -p, --param\t\t\tDLIO param when set, will override the config file value"
}
reportgen_usage() {
echo -e "Usage: $0 reportgen [options]"
echo -e "Generate a report from the benchmark results.\n"
echo -e "\nOptions:"
echo -e " -h, --help\t\t\tPrint this message"
echo -e " -r, --results-dir\t\tLocation to the results directory"
}
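# Example invocation of the reportgen mode (hypothetical path):
#   ./benchmark.sh reportgen -r results/unet3d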
validate_in_list() {
local typ=$1; shift
local element=$1; shift
list=("$@")
if [[ ! " ${list[@]} " =~ " ${element} " ]]; then
echo "argument ${element} for ${typ} is invalid. It has be one of (${list[*]})."
exit 1
fi
}
validate_category() {
validate_in_list "category" $1 "${CATEGORIES[@]}" ;
}
validate_workload() {
validate_in_list "workload" $1 "${WORKLOADS[@]}" ;
}
validate_accelerator_type() {
validate_in_list "accelerator_type" $1 "${ACCELERATOR_TYPES[@]}" ;
}
validate_params() {
local category=$1; shift
local params=("$@")
for param in "${params[@]}"
do
param_name=$(echo $param | cut -d '=' -f 1)
# keep everything after the first '=' so values containing '=' are preserved
param_value=$(echo $param | cut -d '=' -f 2-)
validate_non_empty $param_name $param_value
if [[ " ${category} " =~ " open " ]]; then
validate_in_list "params" $param_name "${OPEN_CATEGORY_PARAMS[@]}"
elif [[ " ${category} " =~ " closed " ]]; then
validate_in_list "params" $param_name "${CLOSED_CATEGORY_PARAMS[@]}"
if [[ "$param_name" == "reader.prefetch_size" && "$param_value" -gt 2 ]]; then
echo "reader.prefetch_size value should not exceed 2"
exit 1
fi
fi
done
}
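# Example (hypothetical values): in the closed category "-p reader.read_threads=8"
# is accepted, "-p framework=pytorch" is only accepted in the open category, and
# "-p reader.prefetch_size=4" is rejected because the value must not exceed 2.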
validate_non_empty() {
local name=$1
local value=$2
if [[ -z "$value" ]]; then
echo "${name} should not be empty. Pass -h option for help menu"
exit 1
fi
}
add_prefix_params() {
local input_array=( "$@" )
local prefixed_array=()
# prefix is fixed as per directory structure
prefix="++workload."
for element in "${input_array[@]}"; do
prefixed_array+=("$prefix$element")
done
echo "${prefixed_array[@]}"
}
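# Example: add_prefix_params "dataset.num_files_train=1200" (hypothetical override)
# expands to "++workload.dataset.num_files_train=1200", the Hydra override syntax
# used in the commands below.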
get_key_value_from_file() {
local config_name=$1;shift
local key=$1;shift
abs_config_name=$WORKLOAD_PATH/$config_name.yaml
value=$(grep "$key:" "$abs_config_name" | tr -d ' ' | cut -d':' -f2)
echo "$value"
}
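# Example: for a hypothetical line "batch_size: 4" in unet3d_a100.yaml,
# get_key_value_from_file "unet3d_a100" "batch_size" returns "4".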
datasize() {
local workload=$1;shift
local accelerator_type=$1;shift
local num_accelerators=$1;shift
local num_client_hosts=$1;shift
local client_host_memory_per_host_in_gb=$1; shift
config_name=$(get_config_file $workload $accelerator_type)
num_steps_per_epoch=$(get_key_value_from_file $config_name "total_training_steps")
batch_size=$(get_key_value_from_file $config_name "batch_size")
record_length=$(get_key_value_from_file $config_name "record_length")
num_samples_per_file=$(get_key_value_from_file $config_name "num_samples_per_file")
computation_time=$(get_key_value_from_file $config_name "computation_time")
epochs=$(get_key_value_from_file $config_name "epochs")
if [[ -z "$num_steps_per_epoch" ]]; then
# if total_training_steps is not set in config file, set num_steps_per_epoch to constant
num_steps_per_epoch=${STEPS_PER_EPOCH}
steps_per_epoch_from_config_file=0
else
# if total_training_steps is set in config file, use the value from the config file
steps_per_epoch_from_config_file=1
fi
if [[ -z "$batch_size" ]]; then
echo "Invalid config file. Batch size should not empty"
exit 1
fi
if [[ -z "$epochs" ]]; then
epochs=1
fi
# calculate required minimum samples given number of steps per epoch
min_samples_steps_per_epoch=$(echo "$num_steps_per_epoch * $batch_size * $num_accelerators" | bc)
# calculate required minimum samples given host memory to eliminate client-side caching effects
min_samples_host_memory=$(echo "$num_client_hosts * $client_host_memory_per_host_in_gb * $HOST_MEMORY_MULTIPLIER * 1024 * 1024 * 1024 / $record_length" | bc)
# ensure we meet both constraints: min_samples = max(min_samples_steps_per_epoch, min_samples_host_memory)
min_samples=$(( $min_samples_steps_per_epoch > $min_samples_host_memory ? $min_samples_steps_per_epoch : $min_samples_host_memory ))
# calculate minimum files to generate
min_total_files=$(echo "$min_samples / $num_samples_per_file" | bc)
min_files_size=$(echo "$min_samples * $record_length / 1024 / 1024 / 1024" | bc)
# approx total time of the benchmark
if [ $steps_per_epoch_from_config_file -eq 1 ]; then
# if total_training_steps is set in config file, estimate the time from the configured total_training_steps
min_samples=$min_samples_steps_per_epoch
fi
total_time_min=$(echo "$min_samples / $batch_size / $num_accelerators * $computation_time * $epochs / 60" | bc)
echo "The benchmark will run for approx ${total_time_min} minutes(best case)"
echo "Minimum ${min_total_files} files are required which will consume ${min_files_size} GB of storage"
echo "----------------------------------------------"
echo "Set --param dataset.num_files_train=${min_total_files} with ./benchmark.sh datagen/run commands"
}
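# Worked example of the sizing math above, with hypothetical numbers:
# batch_size=4, 8 accelerators, record_length=157286400 bytes (150 MiB),
# 1 client host with 128 GB of memory, and the default STEPS_PER_EPOCH=500:
#   min_samples_steps_per_epoch = 500 * 4 * 8                       = 16000
#   min_samples_host_memory     = 1 * 128 * 5 * 1024^3 / 157286400  = 4369
#   min_samples                 = max(16000, 4369)                  = 16000
# so the step-count constraint, not host memory, determines the dataset size here.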
datagen() {
local hosts=$1;shift
local category=$1; shift
local workload=$1;shift
local accelerator_type=$1;shift
local parallel=$1;shift
local results_dir=$1; shift
local params=("$@")
validate_category $category
validate_workload $workload
validate_accelerator_type $accelerator_type
if [[ ! -z "$params" ]]; then
validate_params $category "${params[@]}"
fi
if [[ ! -z "$results_dir" ]]; then
EXTRA_PARAMS=(
"${EXTRA_PARAMS[@]}" ++hydra.run.dir=$results_dir
)
fi
config_name=$(get_config_file $workload $accelerator_type)
prefixed_array=$(add_prefix_params ${params[@]})
mpirun -hosts $hosts -np $parallel python3 dlio_benchmark/dlio_benchmark/main.py --config-path=$CONFIG_PATH workload=$config_name ++workload.workflow.generate_data=True ++workload.workflow.train=False ${prefixed_array[@]} ${EXTRA_PARAMS[@]}
}
run() {
local hosts=$1;shift
local category=$1;shift
local workload=$1;shift
local accelerator_type=$1;shift
local num_accelerators=$1;shift
local results_dir=$1; shift
local params=("$@")
validate_category $category
validate_workload $workload
validate_accelerator_type $accelerator_type
if [[ ! -z "$params" ]]; then
validate_params $category "${params[@]}"
fi
if [[ ! -z "$results_dir" ]]; then
EXTRA_PARAMS=(
"${EXTRA_PARAMS[@]}" ++hydra.run.dir=$results_dir
)
fi
config_name=$(get_config_file $workload $accelerator_type)
prefixed_array=$(add_prefix_params ${params[@]})
mpirun -hosts $hosts -np $num_accelerators python3 dlio_benchmark/dlio_benchmark/main.py --config-path=$CONFIG_PATH workload=$config_name ++workload.workflow.generate_data=False ++workload.workflow.train=True ${prefixed_array[@]} ${EXTRA_PARAMS[@]}
#python report.py --result-dir $results_dir --config-path=$CONFIG_PATH
}
configview() {
local workload=$1;shift
local accelerator_type=$1;shift
local params=("$@")
validate_workload $workload
validate_accelerator_type $accelerator_type
config_name=$(get_config_file $workload $accelerator_type)
prefixed_array=$(add_prefix_params ${params[@]})
python3 dlio_benchmark/dlio_benchmark/main.py --config-path=$CONFIG_PATH workload=$config_name ${prefixed_array[@]} --cfg=job
}
main() {
local mode=$1; shift
if [ "$mode" = "datasize" ]
then
while [ $# -gt 0 ]; do
case "$1" in
-h | --help ) datasize_usage; exit 0 ;;
-w | --workload ) workload="$2"; shift 2 ;;
-g | --accelerator-type ) accelerator_type="$2"; shift 2 ;;
-n | --num-accelerators ) num_accelerators="$2"; shift 2 ;;
-c | --num-client-hosts ) num_client_hosts="$2"; shift 2 ;;
-m | --client-host-memory-in-gb ) client_host_memory_per_host_in_gb="$2"; shift 2 ;;
* ) echo "Invalid option $1"; datasize_usage; exit 1 ;;
esac
done
validate_non_empty "workload" $workload
validate_non_empty "accelerator-type" $accelerator_type
validate_non_empty "num-accelerators" $num_accelerators
validate_non_empty "num-client-hosts" $num_client_hosts
validate_non_empty "client_host_memory_per_host_in_gb" $client_host_memory_per_host_in_gb
datasize $workload $accelerator_type $num_accelerators $num_client_hosts $client_host_memory_per_host_in_gb
elif [ "$mode" = "datagen" ]
then
params=()
while [ $# -gt 0 ]; do
case "$1" in
-h | --help ) datagen_usage; exit 0 ;;
-s | --hosts ) hosts="$2"; shift 2 ;;
-c | --category ) category="$2"; shift 2 ;;
-w | --workload ) workload="$2"; shift 2 ;;
-g | --accelerator-type ) accelerator_type="$2"; shift 2 ;;
-n | --num-parallel ) parallel="$2"; shift 2 ;;
-r | --results-dir ) results_dir="$2"; shift 2 ;;
-p | --param ) params+=("$2"); shift 2 ;;
* ) echo "Invalid option $1"; datagen_usage; exit 1 ;;
esac
done
category=${category:-$DEFAULT_CATEGORY}
validate_non_empty "hosts" $hosts
validate_non_empty "workload" $workload
validate_non_empty "accelerator-type" $accelerator_type
parallel=${parallel:-1}
datagen $hosts $category $workload $accelerator_type $parallel "$results_dir" "${params[@]}"
elif [ "$mode" = "run" ]
then
params=()
while [ $# -gt 0 ]; do
case "$1" in
-h | --help ) run_usage; exit 0 ;;
-s | --hosts ) hosts="$2"; shift 2 ;;
-c | --category ) category="$2"; shift 2 ;;
-w | --workload ) workload="$2"; shift 2 ;;
-g | --accelerator-type ) accelerator_type="$2"; shift 2 ;;
-n | --num-accelerators ) num_accelerators="$2"; shift 2 ;;
-r | --results-dir ) results_dir="$2"; shift 2 ;;
-p | --param ) params+=("$2"); shift 2 ;;
* ) echo "Invalid option $1"; run_usage; exit 1 ;;
esac
done
category=${category:-$DEFAULT_CATEGORY}
validate_non_empty "hosts" $hosts
validate_non_empty "workload" $workload
validate_non_empty "accelerator-type" $accelerator_type
validate_non_empty "num-accelerators" $num_accelerators
validate_non_empty "results-dir" $results_dir
run $hosts $category $workload $accelerator_type $num_accelerators "$results_dir" "${params[@]}"
elif [ "$mode" = "configview" ]
then
params=()
while [ $# -gt 0 ]; do
case "$1" in
-h | --help ) configview_usage; exit 0 ;;
-w | --workload ) workload="$2"; shift 2 ;;
-g | --accelerator-type ) accelerator_type="$2"; shift 2 ;;
-p | --param ) params+=("$2"); shift 2 ;;
* ) echo "Invalid option $1"; configview_usage; exit 1 ;;
esac
done
validate_non_empty "workload" $workload
validate_non_empty "accelerator-type" $accelerator_type
configview $workload $accelerator_type "${params[@]}"
elif [ "$mode" = "reportgen" ]
then
while [ $# -gt 0 ]; do
case "$1" in
-h | --help ) reportgen_usage; exit 0 ;;
-r | --results-dir ) results_dir="$2"; shift 2 ;;
* ) echo "Invalid option $1"; reportgen_usage; exit 1 ;;
esac
done
validate_non_empty "results-dir" $results_dir
python3 ${SCRIPT_DIR}/report.py --result-dir $results_dir
else
usage; exit 1
fi
}
main "$@"