-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathvast-perf.sh
825 lines (711 loc) · 32.4 KB
/
vast-perf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
#!/usr/bin/bash
# This script can be run either on a Vast-CNode, or another linux host.
#New stuff
# now supports loopback mode when there are vlan tag interfaces.
# In order to simplify usage...we no longer require any args. However:
# * if you are _not_ running on a CNode, you must specify --vms=$vip
####Background/details##
##If you are running on a Vast-Cnode
# If you use 'modulo' for CN_DIST_MODE, it will require a larger VIP pool, specifically ($numCNodes + $JOBS) - 1.
# If running on a cluster with more than 8 CNodes, the script will not execute on the node holding VMS (this is to prevent OOM issues)
# If you notice issues on 8CN or smaller clusters where VMS crashes, set --usevms=false
# If you choose 'RDMA', the script will automatically change to TCP, because Vast-CNodes do not have RDMA-client packages.
# This script will temporarily adjust CNode <-> CNode routing tables to ensure that traffic does not cross the ISL's.
####
##When running on a non-vast client
## it requires that FIO 3.1 or higher is available on the host (by default it is on vast-cnodes)
# If FIO is in a different location than /usr/bin/fio : specify --binary=/path/to/fio
# you must make sure that the number of VIPs in the VIP-pool you use is at least equal to the number of jobs you are using.
# If you choose '--proto=rdma' : make sure that you have already setup and verified that you can mount NFS using RDMA.
#
###Behavior, 'best practices'
# You will want to run the write_bw test for at least 5 minutes (300 seconds) to blow through Optane buffers.
# The first time you run a read test, it may want to create some files as it reads, this will slow down the test, just re-run it until it stops trying to create files.
# When running on a Vast-cnode: start with JOBS=12 before testing with smaller values. That way the files will get pre-created for subsequent runs.
# Refrain from pressing 'crtl-c' when you are performing write tests, or if you are performing read tests and you see that FIO is 'laying out .. files'. Doing so may result in unkillable FIO processes
# If you press crtl-c during any test, then immediately re-run the script with the 'cleanup' job with --test=cleanup .This will ensure that mounts/etc are cleaned up.
# Minimal error checking is done. incorrect usage may yield unexpected results. Read through the 'Variables' section below and make sure you understand.
#####
###How to Run###
##Running on a CNode
# 1. put this script on Cnode-1
# 2. 'bash /home/vastdata/vast-perf.sh' <--only runs on one host it will run with config defaults (see below), and discover VMS ip.
# 3. Once you verify it works, copy to all cnodes: `clush -g cnodes -c /home/vastdata/vast-perf.sh`.
# 4. Run on all Cnodes like this `clush -g cnodes "bash /home/vastdata/vast-perf.sh"`
# 5. Specify different tests with the '--test=' flag (write_bw , read_iops, write_iops)
# 6. Run like this to clean everything up: `clush -g cnodes "bash /home/vastdata/vast-perf.sh --test=cleanup --delete=1"`
####
## Running on an external client ##
# you don't _need_ clustershell/clush , but it makes it easier to run this on multiple hosts.
# 1. put this script on client-1 (whatever that is)
# 2. 'bash vast-perf.sh --vms=x.x.x.x --pool=2 --proto=rdma' <--only runs on one host.
# 3. Once you verify it works, copy to all hosts (using whatever mechanism you have)
# 4. run on all hosts..here's a clush example, but pdsh/etc can work: clush -g clients "bash /home/vastdata/vast-perf.sh --vms=x.x.x.x"
# 5. Specify different tests with the '--test=' flag (write_bw , read_iops, write_iops)
# 6. Run like this to clean everything up: clush -g clients "bash /home/vastdata/vast-perf.sh --vms=x.x.x.x --test=cleanup --delete=1"
#=============================================================================
# Variables
# Below are defaults (which will get overridden by user specified ARGS).
# Don't set these variables directly, use the '--' args.
mVIP="empty" # the VMS-VIP of the vast cluster you are testing against. If you run on CNodes, don't worry about setting. If you run on an external client, you must specify the VMS-VIP with --vms=ip
NFSEXPORT="/" # the NFS export to use. On a brand new cluster use '/' (no quotes)
TEST="read_bw" # one of 'write_bw' , 'read_bw', 'write_iops' , 'read_iops' , 'cleanup'
RUNTIME=120 # runtime in seconds of the test.
JOBS=8 # how many threads per host. This will also result in N mountpoints per host.
JOB_NAME="randrw" # default fio job name (used in the filename)
SIZE="20g" # size of each file, one per thread.
BLOCKSIZE="1mb" #leave alone for max b/w. Note that this only applies to the b/w tests, for iops tests it will be 4kb (hardcoded)
MIX=100 # only applicable if TEST="mix_bw" or "mix_iops"
POOL=2 # what pool to run on, typically this will be '2', but check!
PROTO="tcp" #rdma or tcp. When in doubt, use tcp
REMOTE_PATH="fio" # change this to whatever you want it to be. This is the subdir underneath the export which will be created.
FIO_BIN=/usr/bin/fio #location where fio binary exists.
MOUNT=/mnt/fiodemo #where the mountpoints will get created on the host/cnode you are running this on.
DELETE_ALL=0 #if you use --delete=1 , it will clean up all files & mountpoints.
ioengine=libaio #use libaio most of the time. other options: posixaio
iodepth=8 #For b/w tests, lower values can result in slightly better latency. For IOPS tests, higher values can yield higher IOps
USE_VMS="true" # should the VMS cnodes also be a client? Note that in clusters larger than USABLE_CNODES , vms won't be used even if this is set to 1.
CN_DIST_MODE=random #or 'modulo' ( experimental ) . Only applies to running on a vast-cnode.
PROXY="empty" #use IP:port if you need a proxy to get to VMS.
EXTRA_FIO_ARGS=" --numa_mem_policy=local --gtod_reduce=1 --clocksource=cpu --refill_buffers --randrepeat=0 --create_serialize=0 --random_generator=lfsr --fallocate=none" #don't change these unless you know...
DIRECT=1 # o_direct or not..
ADMINUSER="admin"
ADMINPASSWORD=123456
LOOPBACK=1 # only applies when running on cnodes. default is on now. BUT: this requires a lot of vips..
VERBOSE=1 # verbosely output details, may be too noisy on large clusters
###experimental flags ###
VIPFILE="empty" #use at own risk.
CN_AVOID_ISL=0 # only set this to 1 if you are in the lab or know what you are doing. if there are bugs, it can screw up routing.
ALT_POOL="empty" # experimental. don't set this or use --alt-pool "2 3 4 5" option.
VLAN_ID="empty" # only useful if we are attempting to modify routing (CN_AVOID_ISL=1)
VLAN_IFACES="empty" # a hack for now. we need to know what the vlan ifaces are. used in conjuction with VLAN_ID & CN_AVOID_ISL
###Following are hardcoded and not change-able via args/flags.
USABLE_CNODES=15 #this isn't changable via OPTS. its experimental. Use the --usevms=1/0 flag instead.
NOT_CNODE=0 # this only applies in the lab. leave at 0 normally
CLIENT_ISL_AVOID=0 # experimental. only for use in the lab. it can change routes.
#=============================================================================
# Functions
avoid_isl_func () {
# experimental. this only applies if you are NOT running on cnodes, since we already have a hack for that.
# this is only really useful in the lab, where we have clients directly attached to same switches as clusters.
if [ $IS_VAST == 0 ] && [ "$CLIENT_ISL_AVOID" == 1 ]; then
#basically, we need to find out if the CNode iface matches the client.
# for that..ssh key must work to the VMS-ip.
# what we do is ssh to the VMS ip, do a 'clush' looking for the various ip's, and make a list.
# Then, we have to look at the client side IPs, and determine if the iface which is on the same 'subnet' will
# be able to talk directly to the iface on the CNode. This assumes that the clients have ib0 on the same switch
# that the cnode has ib0 on.
CLIENT_IFACES="ib0 ib1"
for iface in $CLIENT_IFACES; do
export IPS=`ssh vastdata@${mVIP} clush -g cnodes "/sbin/ip a s ${iface} | egrep ':vip[0-9]+|:v[0-9]+'|egrep -o '[0-9]{1,3}*\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'"|awk {'print $1'}`
#now check what the route looks like on this node for each ip.
for IP in ${IPS}; do
current_route=`/sbin/ip route get ${IP}|egrep -o 'dev \w+'|awk {'print $2'}`
if [[ ${current_route} == $iface ]]; then
echo "route is OK for $IP -> $iface , skipping"
else
echo "temporarily changing route for $IP -> $iface"
sudo /sbin/ip route add ${IP}/32 dev ${iface}
fi
done
done
fi
}
remove_isl_func() {
# experimental. this only applies if you are NOT running on cnodes, since we already have a hack for that.
# this is only really useful in the lab, where we have clients directly attached to same switches as clusters.
if [ $IS_VAST == 0 ] && [ "$CLIENT_ISL_AVOID" == 1 ]; then
CLIENT_IFACES="ib0 ib1"
for iface in $CLIENT_IFACES; do
export IPS=`ssh vastdata@${mVIP} clush -g cnodes "/sbin/ip a s ${iface} | egrep ':vip[0-9]+|:v[0-9]+'|egrep -o '[0-9]{1,3}*\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'"|awk {'print $1'}`
#now check what the route looks like on this node for each ip.
for IP in ${IPS}; do
#this just deletes all /32 routes that we might have..
sudo /sbin/ip route del ${IP}/32 dev ${iface} >/dev/null 2>1
done
done
fi
}
mount_func () {
export node=`hostname`
DIRS=()
MD_DIRS=()
# Do the mounts. Stash away the command and the verbose output format
# but only output them on mount failure to reduce noise.
for i in ${needed_vips[@]}; do
sudo mkdir -p ${MOUNT}/${i}
if [[ ${PROTO} == "rdma" ]]; then
mount_cmd="sudo mount -v -t nfs -o retry=0,proto=rdma,soft,port=20049,vers=3 ${i}:${NFSEXPORT} ${MOUNT}/${i}"
mount_output=$(eval $mount_cmd 2>&1)
if [ $? -eq 0 ]; then
if [ ${VERBOSE} -eq 1 ]; then
echo "mounted ${MOUNT}/${i} ok"
fi
else
echo "mount of ${MOUNT}/${i} failed : $? , going to unmount everything and exit"
echo "Command: $mount_cmd"
echo "Output:"
echo "$mount_output"
cleanup
exit
fi
else
mount_cmd="sudo mount -v -t nfs -o retry=0,tcp,soft,rw,vers=3 ${i}:${NFSEXPORT} ${MOUNT}/${i}"
mount_output=$(eval $mount_cmd 2>&1)
if [ $? -eq 0 ]; then
if [ ${VERBOSE} -eq 1 ]; then
echo "mounted ${MOUNT}/${i} ok"
fi
else
echo "mount of ${MOUNT}/${i} failed : $? , going to unmount everything and exit"
echo "Command: $mount_cmd"
echo "Output:"
echo "$mount_output"
cleanup
exit
fi
fi
export fio_dir=${MOUNT}/$i/${REMOTE_PATH}/${node}
DIRS+=${fio_dir}:
sudo mkdir -p ${fio_dir}
sudo chmod 777 ${fio_dir}
done
}
multipath_func () {
#basically we want to build a mount command
# which uses all VIPs. I believe the limit with current multipath driver
# is 32, so we'll set a limit of that.
#
# first, shuffle
all_vips=( $(shuf -e "${all_vips[@]}") )
needed_vips=()
for ((idx=0; idx<${JOBS} && idx<${#all_vips[@]}; ++idx)); do
needed_vips+=(${all_vips[$idx]})
done
}
write_bw_test () {
if [ -z ${RUNTIMEWRITE} ]; then
#didn't specify runtime, so just create files of size specified
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=randrw --bs=1mb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --rwmixread=0 --group_reporting --directory=${DIRS}
else
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=randrw --bs=1mb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --rwmixread=0 --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME}
fi
}
seq_write_test () {
if [ -z ${RUNTIMEWRITE} ]; then
#didn't specify runtime, so just create files of size specified
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=write --bs=1mb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --group_reporting --directory=${DIRS}
else
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=write --bs=1mb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME}
fi
}
read_bw_test () {
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --iodepth=${iodepth} --rw=randread --bs=${BLOCKSIZE} --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME} $EXTRA_FIO_ARGS
}
write_iops_test () {
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=randrw --bs=4kb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --rwmixread=0 --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME}
}
read_iops_test () {
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --fallocate=none --iodepth=${iodepth} --rw=randread --bs=4kb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME}
}
mix_bw_test () {
if [ -z ${RUNTIMEWRITE} ]; then
#didn't specify runtime, so just create files of size specified
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=randrw --bs=1mb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --rwmixread=${MIX} --group_reporting --directory=${DIRS}
else
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=randrw --bs=1mb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --rwmixread=${MIX} --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME}
fi
}
mix_iops_test () {
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --refill_buffers --create_serialize=0 --randrepeat=0 --create_on_open=1 --fallocate=none --iodepth=${iodepth} --rw=randrw --bs=4kb --direct=${DIRECT} --size=${SIZE} --numjobs=${JOBS} --rwmixread=${MIX} --group_reporting --directory=${DIRS} --time_based=1 --runtime=${RUNTIME}
}
#only use if you know what you are doing.
read_bw_reuse_test () {
rando_dir=${MOUNT}/${all_vips[0]}/${REMOTE_PATH}
echo ${rando_dir}
${FIO_BIN} --name=${JOB_NAME} --ioengine=${ioengine} --iodepth=${iodepth} --rw=randrw --bs=${BLOCKSIZE} --direct=${DIRECT} --numjobs=${JOBS} --rwmixread=100 --group_reporting --opendir=${rando_dir} --time_based=1 --runtime=${RUNTIME}
}
cleanup() {
#basically, just skip doing any actual testing, and make sure that routes and mounts are cleaned up.
echo "I'm only cleaning up..."
pkill fio
# this will clean up anything in dirs
if [[ $DELETE_ALL -eq 1 ]]; then
echo "deleting all created files"
sudo rm -rvf ${MOUNT}/${needed_vips[0]}/${REMOTE_PATH}/${node}
sudo umount -lf ${MOUNT}/* >/dev/null 2>/dev/null
sudo rm -rf ${MOUNT}
else
echo "leaving files in place. you may want to clean up before you leave.."
echo "unmounting all dirs"
sudo umount -lf ${MOUNT}/* >/dev/null 2>/dev/null
fi
if [ $IS_VAST == 1 ] && [ $CN_AVOID_ISL == 1 ]; then
if [[ $iface_count -eq 2 ]]; then
if [ $VLAN_ID != "empty" ] && [ $VLAN_IFACES != "empty" ]; then #super experimental
EXT_IFACES=$VLAN_IFACES.$VLAN_ID
for iface in $EXT_IFACES; do
export IPS_TO_ROUTE=$(clush -g cnodes "/sbin/ip a s ${iface} | egrep ':vip[0-9]+|:v[0-9]+'|egrep -o '[0-9]{1,3}*\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'"|awk {'print $2'})
for IP in ${IPS_TO_ROUTE}; do
echo "not doing route deletion..beware."
#sudo /sbin/ip route del ${IP}/32 dev ${iface} >/dev/null 2>1
done
done
else #the normal path
for iface in $EXT_IFACES; do
export IPS_TO_ROUTE=$(clush -g cnodes "/sbin/ip a s ${iface} | egrep ':vip[0-9]+|:v[0-9]+'|egrep -o '[0-9]{1,3}*\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'"|awk {'print $2'})
for IP in ${IPS_TO_ROUTE}; do
# a little heavy handed, but its OK.
sudo /sbin/ip route del ${IP}/32 dev ${iface} >/dev/null 2>1
done
done
fi
else
echo "only one external iface, skipping route destruction"
fi
fi
remove_isl_func
}
showusage() {
# the alignment of the arguments below is weird because they are meant
# to align in the output after variable substitution
cat << EOF
Usage: vast-perf.sh --test=TEST [optional args]
Available tests ---------------------------------------------------------------
--test=TEST test to run, one of:
read_bw, write_bw, seq_write_bw, write_iops, read_iops,
mix_bw, mix_ops, reuse_bw_reuse, cleanup, mount_only
Optional arguments with their defaults ----------------------------------------
misc options
--delete=$DELETE_ALL delete files upon completion?
--verbose=$VERBOSE include verbose output?
fio options
--binary=$FIO_BIN path to fio
--runtime=$RUNTIME runtime of the test in seconds
--job-name=$JOB_NAME fio job name, used in filenames
--jobs=$JOBS number of fio jobs to run
--size=$SIZE size of the files to read/write
--block-size=$BLOCKSIZE read/write blocksize
--direct=$DIRECT use O_DIRECT?
--ioengine=$ioengine ioengine to use with fio
--iodepth=$iodepth iodepth to use with fio
--mix=$MIX read/write mix, only used for "mix_bw" or "mix_iops"
mount-related options
--proto=$PROTO mount protocol (auto-detected)
--mountpoint=$MOUNT
mount point on cnode
--export=$NFSEXPORT NFS export from the cluster
--path=$REMOTE_PATH directory under the mountpoint to place files
--loopback=$LOOPBACK use loopback mounts
cluster options
--usevms=$USE_VMS use VMS node in the test load? (auto-detected)
--pool=$POOL vippool to run against
advanced options -- you shouldn't need any of these when running on cnodes
--vms=$mVIP VMS VIP (auto-detected when run on cnode)
--proxy=$PROXY proxy in the form of ip:port
--user=$ADMINUSER admin username
--password=$ADMINPASSWORD admin password
Note: experimental options are not included.
EOF
}
#=============================================================================
# Main program
#--------------------------------------
# Argument Parsing
### try this sometime...
# USAGE="$0 -n <qty> -d <path> -h <list,of,hosts> "
# while getopts n:d:h: c
# do
# case $c in
# n) shift; QTY=$1 ;;
# d) shift; THEPATH="$1" ;;
# h) shift; THEHOSTS="$1" ;;
# \?) echo $USAGE
# exit 2;;
# esac
# done
while [ $# -gt 0 ]; do
case "$1" in
--vms=*)
mVIP="${1#*=}"
;;
--proxy=*)
PROXY="${1#*=}"
;;
--export=*)
NFSEXPORT="${1#*=}"
;;
--test=*)
TEST="${1#*=}"
;;
--runtime=*)
RUNTIME="${1#*=}"
RUNTIMEWRITE=$RUNTIME
;;
--proto=*)
PROTO="${1#*=}"
;;
--job-name=*)
JOB_NAME="${1#*=}"
;;
--jobs=*)
JOBS="${1#*=}"
;;
--size=*)
SIZE="${1#*=}"
;;
--mix=*)
MIX="${1#*=}"
;;
--block-size=*)
BLOCKSIZE="${1#*=}"
;;
--pool=*)
POOL="${1#*=}"
;;
--alt-pool=*)
ALT_POOL="${1#*=}"
;;
--path=*)
REMOTE_PATH="${1#*=}"
;;
--binary=*)
FIO_BIN="${1#*=}"
;;
--mountpoint=*)
MOUNT="${1#*=}"
;;
--delete=*)
DELETE_ALL="${1#*=}"
;;
--ioengine=*)
ioengine="${1#*=}"
;;
--iodepth=*)
iodepth="${1#*=}"
;;
--direct=*)
DIRECT="${1#*=}"
;;
--usevms=*)
USE_VMS="${1#*=}"
;;
--distmode=*)
CN_DIST_MODE="${1#*=}"
;;
--avoid-isl=*)
CN_AVOID_ISL="${1#*=}"
;;
--loopback=*)
LOOPBACK="${1#*=}"
;;
--user=*)
ADMINUSER="${1#*=}"
;;
--password=*)
ADMINPASSWORD="${1#*=}"
;;
--verbose=*)
VERBOSE="${1#*=}"
;;
--vipfile=*)
VIPFILE="${1#*=}"
;;
--help=*)
HELPME="true"
;;
*)
showusage
exit 1
esac
shift
done
#--------------------------------------
# Pre-flight checks
#
#check if we are running on a vast CNode
#
IS_VAST=0
if [ -f "/vast/vman/mgmt-vip" ] && [ $NOT_CNODE == 0 ]; then
IS_VAST=1
mVIP=`cat /vast/vman/mgmt-vip`
echo "running on a Vast node. setting VMSIP to ${mVIP}"
if [ ${PROTO} == "multipath" ]; then
echo "can't use multipath on cnode, falling back to regular rdma"
PROTO=rdma
fi
# Nowadays, vast-OS supports RDMA so default to it when running on loopback
# mode, but first check two exceptions: Broadcom NICs & Rocky
# Check for the Broadcom driver
LSMOD=`sudo lsmod|grep bnxt_en|wc -l`
if [ "$LSMOD" -gt "0" ]; then
PROTO=tcp
echo "setting PROTO to ${PROTO} because broadcom"
# Check for Rocky
elif grep -q "Rocky" /etc/redhat-release; then
PROTO=tcp
echo "setting PROTO to ${PROTO} because rocky"
# by default, use RDMA
else
PROTO=rdma
echo "setting PROTO to ${PROTO} because cnode"
fi
else # if we are running on an external client.
if [[ ${mVIP} == "empty" ]]; then
echo "You must specify a VMS ip via --vms=x.x.x.x"
exit 20
fi
# loopback isn't valid on non-cnodes
LOOPBACK=0
fi
# Build our curl options
CURL_OPTS="-s -u ${ADMINUSER}:${ADMINPASSWORD} -H 'accept: application/json' --insecure"
if [ "$PROXY" != "empty" ]; then
export http_proxy=${PROXY}
CURL_OPTS="${CURL_OPTS} -x ${PROXY}"
fi
# Try to use the updated cipher
/usr/bin/curl ${CURL_OPTS} --ciphers ECDHE-RSA-AES128-GCM-SHA256 -X GET "https://$mVIP/api/basesettings/" > /dev/null
if [ "$?" != "59" ]; then
CURL_OPTS="${CURL_OPTS} --ciphers ECDHE-RSA-AES128-GCM-SHA256"
else
echo "WARNING: Your vast-os is old as cipher ECDHE-RSA-AES128-GCM-SHA256 is not supported!"
fi
# Confirm we can connect to mVIP
response_code=$(/usr/bin/curl ${CURL_OPTS} -X GET "https://$mVIP/api/vippools/" --write-out "\n%{http_code}\\n" | tail -1)
if [ "$response_code" != "200" ]; then
echo "Unable to connect to VMS, confirm username & password are correct."
exit 20
fi
if [[ ${TEST} == "read_bw_reuse" ]]; then
#this test is really only valid if you are using RDMA, otherwise you will bottleneck on a single mount per client.
if [[ ${IS_VAST} == 1 ]]; then
echo "don't use read_bw_reuse on cnodes. only use this option on external clients if you are using RDMA, OR you have a lot of clients."
exit 20
fi
fi
#--------------------------------------
# Determine some info about our env
pools=()
pools+=(${POOL})
if [ "$ALT_POOL" != "empty" ]; then
pools+=" ${ALT_POOL}"
fi
client_VIPs=""
all_vips=()
for pool in $pools; do
if [ $LOOPBACK == 1 ]; then
# if [ VIPFILE != "empty" ]; then
# echo "you can't specify --vipfile=${VIPFILE} && looback=1. If you want to use a vipfile, use loopback=0"
# exit 20
# fi
# only mount local vips on the CNode. Note that if there are less vips per CNode than jobs, then some jobs
# will re-use the same VIPs, which will not necessarily give the b/w you desire..ideally you have at least 5 mounts per CNode.
# in 4.2, we changed how we name nodes. So, instead of using node-names in the filter...we will use the internal IP.
# also, the way this works is different on an IB backend cluster vs an ETH cluster.
#first..figure out if we have an IB nic for internal
export INT_IFACES=$(cat /etc/vast-configure_network.py-params.ini|grep internal_interfaces|awk -F "=" {'print $2'}| sed -E 's/,/ /')
export DATA_VLAN=$(cat /etc/vast-configure_network.py-params.ini|grep data_vlan|awk -F "=" {'print $2'}| sed -E 's/,/ /')
#
if [[ "$INT_IFACES" =~ .*"en".* ]]; then
echo "ETH backend"
BOND_IFACE="bond0.${DATA_VLAN}"
elif [[ "$INT_IFACES" =~ .*"p2p".* ]]; then
echo "ETH backend"
BOND_IFACE="bond0.${DATA_VLAN}"
elif [[ "$INT_IFACES" =~ .*"ib".* ]]; then
echo "IB backend"
BOND_IFACE="bond0"
else
echo "int ifaces: ${INT_IFACES} : not recognized as 'en', 'p2p', or 'ib' , exiting"
exit 20
fi
###we need to know what the "inner vip" is.
export INNER_VIP=$(cat /etc/vast-configure_network.py-params.ini|grep mgmt_inner_vip=|awk -F "=" {'print $2'})
export INT_IP=`/sbin/ip a s ${BOND_IFACE}|grep inet|grep -v inet6|grep -v ${INNER_VIP} | awk {'print $2'}|sed -E 's/\/[0-9]+//' | awk '{print $1}'`
# query VMS
if [ $VIPFILE != "empty" ]; then #super experimental
# grab the vips from a file..which must be formatted perfectly..or you die.
local_vips="$(cat ${VIPFILE})"
else
export local_vips=$(/usr/bin/curl ${CURL_OPTS} -X GET "https://$mVIP/api/vips/?vippool__id=${pool}&cnode__ip=${INT_IP}" | jq '.[] | .ip')
fi
if [ "x$local_vips" == 'x' ]; then
echo "Failed to retrieve cluster virtual IPs for client access using VIP pool ID ${pool}, check VMSip or pool-id. Also: make sure that this CNode is a member of the pool you want to test with."
echo "Available VIP pools:"
/usr/bin/curl ${CURL_OPTS} -X GET "https://$mVIP/api/vippools/" | jq -c 'sort_by(.id) | .[] | {id, name, ranges_summary}'
exit 20
fi
for local_vip in ${local_vips}; do
local_vip=${local_vip//\"/}
all_vips+=(${local_vip})
done
else
#not loopback..get all the vips in the pool to use.
if [ $VIPFILE != "empty" ]; then #super experimental
# grab the vips from a file..which must be formatted perfectly..or you die.
client_VIPs="$(cat ${VIPFILE})"
else
# use curl to grab the VIPs
client_VIPs+="$(/usr/bin/curl ${CURL_OPTS} -X GET "https://$mVIP/api/vips/?vippool__id=${pool}" | jq -r '.[] | .ip' | sort -t'.' -k4 -n | tr '\n' ' ')"
fi
echo $client_VIPs
if [ "x$client_VIPs" == 'x' ]; then
echo "Failed to retrieve cluster virtual IPs for client access using VIP pool ID ${pool}, check VMSip or pool-id"
echo "Available VIP pools:"
/usr/bin/curl ${CURL_OPTS} -X GET "https://$mVIP/api/vippools/" | jq -c 'sort_by(.id) | .[] | {id, name, ranges_summary}'
exit 20
fi
fi
done
if [ $LOOPBACK == 0 ]; then
numVIPS=`echo $client_VIPs | wc -w`
if [ "$numVIPS" -lt "$JOBS" ]; then
echo "$numVIPS vips is < $JOBS jobs , re-run with $numVIPS or less jobs."
exit 20
fi
#put the vips into an array.
for i in $client_VIPs; do
all_vips+=(${i})
done
fi
####If we are running on a CNode.
#If the cluster is more than XX CNodes, then don't run on VMS node.
#
if [ $IS_VAST == 1 ]; then
#dima don't like this, but for now its fine.
numCNodes=`grep 'cnodes:' /etc/clustershell/groups.d/local.cfg |awk -F ":" {'print $2'}|wc -w`
if [ "$numCNodes" -gt $USABLE_CNODES ] || [ "$USE_VMS" == "false" ]; then
#check if we're on the VMS Cnode
if [ $(docker ps -q --filter label=role=vms |wc -l) -eq 1 ]; then
echo "not going to run on VMS node"
exit
fi
fi
# next chunk of items don't apply to loopback.
if [ $LOOPBACK == 0 ]; then
if [ "$CN_DIST_MODE" == "modulo" ]; then
#Basic methodology is that we want to shift the starting IP for each CNode, and have it mount $Numjobs (up to 12) VIPs.
# eg: CN1 mounts .1 -> 12 , CN2 mounts .2 -> .13 , CN3 mounts .3 -> .14 , and so on.
# check we have enough VIPs for this.
#
vipTEST=$(($numVIPS + 1))
if [[ $vipTEST -lt $(($numCNodes + $JOBS)) ]]; then
echo "you need at least $((($numCNodes + $JOBS) - 1)) vips"
exit 20
fi
fi
#next, avoid crossing ISLs since we are on cnodes.
#temporarily setup the routing table to ensure that reads/writes go over both ifaces. assumes at least one VIP on each iface on a given cnode.
#
#first, figure out what ifaces we need to use.
export EXT_IFACES=$(cat /etc/vast-configure_network.py-params.ini|grep external_interfaces|awk -F "=" {'print $2'}| sed -E 's/,/ /')
# we only care if there are more than one iface.
export iface_count=`echo $EXT_IFACES | wc -w`
echo "interface count is $iface_count"
if [ $iface_count -eq 2 ] && [ $CN_AVOID_ISL == 1 ]; then
# sometimes the route is OK, sometimes its not, check them all and only change if they are 'wrong'
if [ $VLAN_ID != "empty" ] && [ $VLAN_IFACES != "empty" ]; then #super experimental
EXT_IFACES=$VLAN_IFACES.$VLAN_ID
for iface in $EXT_IFACES; do
export IPS_TO_ROUTE=$(clush -g cnodes "/sbin/ip a s ${iface} | egrep ':vip[0-9]+|:v[0-9]+'|egrep -o '[0-9]{1,3}*\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'"|awk {'print $2'})
for IP in ${IPS_TO_ROUTE}; do #
current_route=`/sbin/ip route get ${IP}|egrep -o 'dev \w+\.[0-9]+'|awk {'print $2'}`
if [[ ${current_route} == $iface ]]; then
echo "route is OK for $IP -> $iface , skipping"
else
echo "temporarily changing route for $IP -> $iface"
sudo /sbin/ip route add ${IP}/32 dev ${iface}
fi
done
done
else # this is the 'normal' path
for iface in $EXT_IFACES; do #this won't work if there is a vlan tag on the pool.
export IPS_TO_ROUTE=$(clush -g cnodes "/sbin/ip a s ${iface} | egrep ':vip[0-9]+|:v[0-9]+'|egrep -o '[0-9]{1,3}*\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}'"|awk {'print $2'})
#now check what the route looks like on this node for each ip.
for IP in ${IPS_TO_ROUTE}; do
current_route=`/sbin/ip route get ${IP}|egrep -o 'dev \w+'|awk {'print $2'}`
if [[ ${current_route} == $iface ]]; then
echo "route is OK for $IP -> $iface , skipping"
else
echo "temporarily changing route for $IP -> $iface"
sudo /sbin/ip route add ${IP}/32 dev ${iface}
fi
done
done
fi
else
echo "skipping route setup"
fi
fi
fi
####End CNode specific stuff.###
#build the list of vips to actually mount.
#
if [ $IS_VAST == 1 ] && [ "$CN_DIST_MODE" == "modulo" ]; then
#figure out what node we are on.
export NODENUM=`grep node /etc/vast-configure_network.py-params.ini |egrep -o 'node=[0-9]+'|awk -F '=' {'print $2'}`
echo "$NODENUM here"
idx_start=$(( $NODENUM - 1))
idx_end=$((($idx_start + $JOBS) -1 ))
needed_vips=()
for ((idx=$idx_start; idx<=$idx_end; idx++)); do
needed_vips+=(${all_vips[$idx]})
done
else
#either we're not on a vast cnode, or you chose random distribution.
#Better logic could be had here, but for now just randomize the vip list, and iterate through them until numJobs is satisfied.
# regardless of how we got here, shuffle the vips and only use as many as we need to satisfy the job count, if we have enough.
all_vips=( $(shuf -e "${all_vips[@]}") )
needed_vips=()
for ((idx=0; idx<${JOBS} && idx<${#all_vips[@]}; ++idx)); do
needed_vips+=(${all_vips[$idx]})
# echo "${all_vips[$idx]} is in the mix"
done
fi
#--------------------------------------
# Unmount old mounts & run the test
#force unmount anything that might have been mounted last time this was run.
sudo umount -lf ${MOUNT}/* >/dev/null 2>/dev/null
avoid_isl_func
if [[ ${TEST} == "read_bw" ]]; then
mount_func
read_bw_test
cleanup
elif [[ ${TEST} == "write_bw" ]]; then
mount_func
write_bw_test
cleanup
elif [[ ${TEST} == "seq_write_bw" ]]; then
mount_func
seq_write_test
cleanup
elif [[ ${TEST} == "write_iops" ]]; then
mount_func
write_iops_test
cleanup
elif [[ ${TEST} == "read_iops" ]]; then
mount_func
read_iops_test
cleanup
elif [[ ${TEST} == "mix_bw" ]]; then
mount_func
mix_bw_test
cleanup
elif [[ ${TEST} == "mix_iops" ]]; then
mount_func
mix_iops_test
cleanup
elif [[ ${TEST} == "read_bw_reuse" ]]; then
mount_func
read_bw_reuse_test
cleanup
elif [[ ${TEST} == "cleanup" ]]; then
mount_func
cleanup
elif [[ ${TEST} == "mount_only" ]]; then
mount_func
echo "done mounting, existing and leaving mounts. if you want to unmount, run with --test=cleanup"
else
echo "you didn't specify a valid test. unmounting and exiting"
fi