From fee56e3165d47b1b45853bfecf112c851b3bbbef Mon Sep 17 00:00:00 2001 From: David Valin Date: Tue, 15 Aug 2023 11:01:05 -0400 Subject: [PATCH 1/2] Updates for AWS spot --- bin/burden | 60 +++++++++++++++++++++++++++++++++++++----- bin/kick_off.sh | 70 ++++++++++++++----------------------------------- 2 files changed, 72 insertions(+), 58 deletions(-) diff --git a/bin/burden b/bin/burden index 4001100..4b399de 100755 --- a/bin/burden +++ b/bin/burden @@ -25,6 +25,9 @@ # # Burdens version number. + +cli=${0} +arguments=("$@") version="3.2" export ANSIBLE_ROLES_PATH=$HOME/.ansible/collections/ansible_collections/pbench/agent/roles:$ANSIBLE_ROLES_PATH # @@ -1264,19 +1267,15 @@ retrieve_spot_from_config() # # For AWS obtain the possible starting price. # - worker=`curl -L ec2.shop?filter=$1 -H 'accept: json'` + worker=`aws ec2 describe-spot-price-history --start-time=$(date +%s) --product-descriptions="Linux/UNIX" --query 'SpotPriceHistory[*].{az:AvailabilityZone, price:SpotPrice}' --instance-types $host_or_cloud_inst | tail -1 | awk '{ print $2}'` if [[ $? -ne 0 ]]; then cleanup_and_exit "Error: unable to retrieve spot info for $1" 1 fi - if [[ $worker == *":[]"* ]]; then - cleanup_and_exit "Error: unable to retrieve spot info for $1" 1 - fi + gl_spot_price=$worker + gl_spot_cap=$gl_base_cost # # We will default to 5 intervals # - gl_spot_cap=`echo $worker | cut -d':' -f 8 | cut -d',' -f 1` - gl_base_cost=$gl_spot_cap; - gl_spot_price=`echo $worker | cut -d':' -f 10 | cut -d: -f 10 | cut -d'"' -f 2` gl_spot_increment=`echo "scale=2;((($gl_spot_cap - $gl_spot_price)*1.5)/5.00)" | bc` else spot_string=`grep $1 ${gl_test_def_dir}/spot_price.cfg` @@ -1506,6 +1505,10 @@ create_ansible_options() # if [[ $gl_system_type != "local" ]] && [[ $gl_spot_cap -eq 0 ]]; then echo No spot price designated, retrieving + worker=`curl -L ec2.shop?filter=$1 -H 'accept: json'` + if [[ $? 
-ne 0 ]]; then + cleanup_and_exit "Unable to retrieve base cost $1" 1 + fi if [[ $gl_system_type == "aws" ]]; then worker=`curl -L ec2.shop?filter=$host_or_cloud_inst -H 'accept: json'` gl_base_cost=`echo $worker | cut -d':' -f 8 | cut -d',' -f 1` @@ -1809,6 +1812,7 @@ create_ansible_options() if [[ "$gl_ssh_key_file" != "" ]]; then base_string="${base_string} -s $gl_ssh_key_file" fi + echo $cli "${arguments[@]}" | sed "s/bin/./g" > ${run_dir}/exec_command kick_off.sh $base_string | tee ${run_dir}/ansible_log & pids[${pindex}]=$! let "pindex=$pindex+1" @@ -4009,4 +4013,46 @@ remove_hostnames_added if [[ $gl_warning_string != "" ]]; then echo -e $gl_warning_string fi + +# +# Ok, go back through things and see if we have to perform any runs because of spot failures. +# +pushd ${gl_run_prefix}/${gl_os_vendor}/${gl_system_type} > /dev/null +for i in `ls -d *`; do + if [ -f $i/spot_failure ]; then + rm $i/spot_failure + echo $i has spot failure running again with out spot + next_one=0 + for data in `cat $i/exec_command`; do + if [ $next_one -eq 1 ]; then + file_to_use=$data + break + fi + if [[ $data == "--run_file" ]]; then + next_one=1 + fi + done + test_separ="" + tests="" + pushd $i > /dev/null + for i in `cat test_times | cut -d' ' -f2`; do + tests=${tests}${test_separ}${i} + test_separ="," + done + popd > /dev/null + echo tests >/tmp/tests + tests=${tests}"," + tests_left=`sed "s/${tests}//g" $gl_top_dir/$file_to_use` + if [[ -f /home/zathras_log/spot_fails ]]; then + date=`date` + echo "$data: $date tests completed: ${tests}, tests left ${tests_left}" >> /home/zathras_log/spot_fails + fi + sed "s/--use_spot/--no_spot_recover/g" $gl_top_dir/$file_to_use | sed "s/${tests}//g" > $gl_top_dir/rerun + pushd $gl_top_dir > /dev/null + chmod 755 rerun + ./rerun + popd > /dev/null + fi +done +popd > /dev/null cleanup_and_exit "" 0 diff --git a/bin/kick_off.sh b/bin/kick_off.sh index 9c02698..35d49af 100755 --- a/bin/kick_off.sh +++ b/bin/kick_off.sh @@ -147,7 +147,7 @@ if 
[[ $ssh_key_file != "" ]]; then cp $ssh_key_file config/user.pem_test if [ ! -s $ssh_key_file ]; then echo "${ssh_key_file} is zero length, please fix. Test is exiting" - exit + exit 1 fi chmod 500 config/user.pem_test fi @@ -191,7 +191,6 @@ do do mkdir tf echo ===== attempt $attempts of $create_attempts ============== - echo ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3" ten_of_us.yml ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3 delete_tf=none" ten_of_us.yml # if [ $spot_recover -eq 1 ] && [[ ! -f "test_returned" ]] && [[ ! -f "cpu_type_failure" ]]; then @@ -199,58 +198,26 @@ do # Check to see if we used spot, and we are to recover if the system goes away. # if [[ -f test_started ]]; then - sp_check=`grep spot_range: ansible_vars_main.yml` - if [[ $sp_check == *","* ]]; then - echo Need to update the test list, do not execute the tests we already executed. - tests_list=`grep ^test: test_times | awk '{ print $2 }'` - test_rm="" - seper="" - for i in $tests_list; do - test_rm=$test_rm${seper}$i - seper="," - done - test_rm=${test_rm}${seper} - cp ansible_vars_main.yml ansible_vars_main.yml_back - sed "s/${test_rm}//g" < ansible_vars_main.yml > update - mv update ansible_vars_main.yml - cp ansible_vars.yml ansible_vars.yml_back - sed "s/${test_rm}//g" < ansible_vars.yml > update - mv update ansible_vars.yml - if [[ -f test_times ]]; then - report_usage - fi - mv test_times test_times_spot - mv if_spot_fail instance_cost - rm -rf tf - rm tf.rtc - # - # Next attempt it without spot pricing. 
- # - mv ansible_run_vars.yml ansible_run_vars.yml_spot_died - grep -v "spot_range:" ansible_vars_main.yml | grep -v spot_start_price > spot_repair - echo " spot_start_price: 0" >> spot_repair - echo " spot_range: 0" >> spot_repair - mv spot_repair ansible_vars_main.yml - grep -v "spot_range:" ansible_vars.yml | grep -v spot_start_price > spot_repair - echo " spot_start_price: 0" >> spot_repair - echo " spot_range: 0" >> spot_repair - mv spot_repair ansible_main.yml - mkdir tf - echo ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3" ten_of_us.yml - ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3 delete_tf=none" ten_of_us.yml - else - # - # Not spot, test started, system died. - # - echo Error: test started, system died. - exit + sp_check=`grep spot_range: ansible_vars_main.yml | cut -d: -f 2 | sed "s/ //g"` + if [[ $sp_check != "0" ]]; then + touch spot_failure + spot_recover=0 + exit 1 fi else # - # Rarely will happen. + # Not spot, test started, system died. # - echo Error: did not start the test. >> test_start_failure - exit + echo Error: test started, system died. + exit 1 + fi + else + # + # Rarely will happen. + # + if [[ ! -f test_returned ]]; then + echo Error: test started, system died. + exit 1 fi fi if [[ ! 
-f "cpu_type_failure" ]]; then @@ -273,7 +240,7 @@ do fi let "attempts=${attempts}+1" rm -rf ansible_install_group ansible_test_group boot_info cloud_timings copy_git_file_status cr_status cpu_type_failure - rm -rf dev_env_status install_status meta_data.yml tar_status terraform_data.yml test_defs.yml test_info test_times tf_results + rm -rf dev_env_status install_status meta_data.yml tar_status terraform_data.yml test_times tf_results mv tf tf_delete_${attempts} done $top_dir/bin/remove_wrong_cpus $top_dir/$direct @@ -296,3 +263,4 @@ for i in `ls results*tar`; do echo $i >> tuned_run_info cat $check_file >> tuned_run_info done +exit 0 From 31fcfbd3c05132fbf9b7139862e804796e00dd3d Mon Sep 17 00:00:00 2001 From: David Valin Date: Thu, 17 Aug 2023 05:52:58 -0400 Subject: [PATCH 2/2] Review updates --- bin/burden | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/burden b/bin/burden index 4b399de..933bd68 100755 --- a/bin/burden +++ b/bin/burden @@ -1267,7 +1267,7 @@ retrieve_spot_from_config() # # For AWS obtain the possible starting price. # - worker=`aws ec2 describe-spot-price-history --start-time=$(date +%s) --product-descriptions="Linux/UNIX" --query 'SpotPriceHistory[*].{az:AvailabilityZone, price:SpotPrice}' --instance-types $host_or_cloud_inst | tail -1 | awk '{ print $2}'` + worker=`aws ec2 describe-spot-price-history --start-time=$(date +%s) --product-descriptions="Linux/UNIX" --query 'SpotPriceHistory[*].{az:AvailabilityZone, price:SpotPrice}' --instance-types $host_or_cloud_inst | tail -1 | awk '{ print $2 }'` if [[ $? 
-ne 0 ]]; then cleanup_and_exit "Error: unable to retrieve spot info for $1" 1 fi @@ -4021,7 +4021,7 @@ pushd ${gl_run_prefix}/${gl_os_vendor}/${gl_system_type} > /dev/null for i in `ls -d *`; do if [ -f $i/spot_failure ]; then rm $i/spot_failure - echo $i has spot failure running again with out spot + echo $i has spot failure running again without spot next_one=0 for data in `cat $i/exec_command`; do if [ $next_one -eq 1 ]; then