Skip to content

Commit

Permalink
Merge pull request #48 from redhat-performance/fix_spot
Browse files Browse the repository at this point in the history
Updates for AWS spot
  • Loading branch information
dvalinrh authored Aug 23, 2023
2 parents ec06045 + 31fcfbd commit 56acc03
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 58 deletions.
60 changes: 53 additions & 7 deletions bin/burden
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#

# Burden's version number.

# Capture how burden was invoked: the script path and the untouched argument
# list. Both are echoed (after a sed pass) into ${run_dir}/exec_command just
# before kick_off.sh is started.
cli=${0}
arguments=("$@")
version="3.2"
# Put the pbench agent roles directory ahead of any existing Ansible roles path.
export ANSIBLE_ROLES_PATH=$HOME/.ansible/collections/ansible_collections/pbench/agent/roles:$ANSIBLE_ROLES_PATH
#
Expand Down Expand Up @@ -1264,19 +1267,15 @@ retrieve_spot_from_config()
#
# For AWS obtain the possible starting price.
#
worker=`curl -L ec2.shop?filter=$1 -H 'accept: json'`
worker=`aws ec2 describe-spot-price-history --start-time=$(date +%s) --product-descriptions="Linux/UNIX" --query 'SpotPriceHistory[*].{az:AvailabilityZone, price:SpotPrice}' --instance-types $host_or_cloud_inst | tail -1 | awk '{ print $2 }'`
if [[ $? -ne 0 ]]; then
cleanup_and_exit "Error: unable to retrieve spot info for $1" 1
fi
if [[ $worker == *":[]"* ]]; then
cleanup_and_exit "Error: unable to retrieve spot info for $1" 1
fi
gl_spot_price=$worker
gl_spot_cap=$gl_base_cost
#
# We will default to 5 intervals
#
gl_spot_cap=`echo $worker | cut -d':' -f 8 | cut -d',' -f 1`
gl_base_cost=$gl_spot_cap;
gl_spot_price=`echo $worker | cut -d':' -f 10 | cut -d: -f 10 | cut -d'"' -f 2`
gl_spot_increment=`echo "scale=2;((($gl_spot_cap - $gl_spot_price)*1.5)/5.00)" | bc`
else
spot_string=`grep $1 ${gl_test_def_dir}/spot_price.cfg`
Expand Down Expand Up @@ -1506,6 +1505,10 @@ create_ansible_options()
#
if [[ $gl_system_type != "local" ]] && [[ $gl_spot_cap -eq 0 ]]; then
echo No spot price designated, retrieving
worker=`curl -L ec2.shop?filter=$1 -H 'accept: json'`
if [[ $? -ne 0 ]]; then
cleanup_and_exit "Unable to retrieve base cost $1" 1
fi
if [[ $gl_system_type == "aws" ]]; then
worker=`curl -L ec2.shop?filter=$host_or_cloud_inst -H 'accept: json'`
gl_base_cost=`echo $worker | cut -d':' -f 8 | cut -d',' -f 1`
Expand Down Expand Up @@ -1809,6 +1812,7 @@ create_ansible_options()
if [[ "$gl_ssh_key_file" != "" ]]; then
base_string="${base_string} -s $gl_ssh_key_file"
fi
echo $cli "${arguments[@]}" | sed "s/bin/./g" > ${run_dir}/exec_command
kick_off.sh $base_string | tee ${run_dir}/ansible_log &
pids[${pindex}]=$!
let "pindex=$pindex+1"
Expand Down Expand Up @@ -4009,4 +4013,46 @@ remove_hostnames_added
if [[ $gl_warning_string != "" ]]; then
echo -e $gl_warning_string
fi

#
# Go back through the run directories and see if any run has to be performed
# again because of a spot failure.
#
# For every directory that contains a spot_failure marker:
#   - the marker is removed,
#   - the word following --run_file in the directory's exec_command names the
#     run file (relative to $gl_top_dir),
#   - the second field of each test_times line is collected into a comma
#     separated list of tests,
#   - the run file, with that list removed and --use_spot replaced by
#     --no_spot_recover, is written to $gl_top_dir/rerun and executed.
#
if pushd "${gl_run_prefix}/${gl_os_vendor}/${gl_system_type}" > /dev/null; then
	# Glob instead of parsing ls output so odd directory names survive intact.
	for spot_dir in *; do
		if [[ ! -f "$spot_dir/spot_failure" ]]; then
			continue
		fi
		rm -- "$spot_dir/spot_failure"
		echo $spot_dir has spot failure running again without spot

		#
		# Locate the run file. Reset file_to_use on every pass so a value
		# left over from a previous directory is never reused.
		#
		file_to_use=""
		next_one=0
		for data in $(cat "$spot_dir/exec_command"); do
			if [ $next_one -eq 1 ]; then
				file_to_use=$data
				break
			fi
			if [[ $data == "--run_file" ]]; then
				next_one=1
			fi
		done
		if [[ -z "$file_to_use" ]]; then
			echo "$spot_dir: no --run_file found in exec_command, unable to rerun"
			continue
		fi

		#
		# Build the comma separated list of tests recorded in test_times.
		# A dedicated loop variable is used so the outer one is not clobbered.
		#
		test_separ=""
		tests=""
		if [[ -f "$spot_dir/test_times" ]]; then
			for test_name in $(cut -d' ' -f2 "$spot_dir/test_times"); do
				tests=${tests}${test_separ}${test_name}
				test_separ=","
			done
		fi
		echo "$tests" > /tmp/tests

		if [[ -n "$tests" ]]; then
			tests="${tests},"
			strip_expr="s/${tests}//g"
		else
			# No tests recorded: a bare "," pattern would delete every comma
			# in the run file, so substitute a no-op expression instead.
			strip_expr="s/^//"
		fi
		tests_left=$(sed "$strip_expr" "$gl_top_dir/$file_to_use")
		if [[ -f /home/zathras_log/spot_fails ]]; then
			date=$(date)
			echo "$data: $date tests completed: ${tests}, tests left ${tests_left}" >> /home/zathras_log/spot_fails
		fi

		sed -e "s/--use_spot/--no_spot_recover/g" -e "$strip_expr" \
			"$gl_top_dir/$file_to_use" > "$gl_top_dir/rerun"
		if pushd "$gl_top_dir" > /dev/null; then
			chmod 755 rerun
			./rerun
			popd > /dev/null
		fi
	done
	popd > /dev/null
fi
cleanup_and_exit "" 0
70 changes: 19 additions & 51 deletions bin/kick_off.sh
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ if [[ $ssh_key_file != "" ]]; then
cp $ssh_key_file config/user.pem_test
if [ ! -s $ssh_key_file ]; then
echo "${ssh_key_file} is zero length, please fix. Test is exiting"
exit
exit 1
fi
chmod 500 config/user.pem_test
fi
Expand Down Expand Up @@ -191,66 +191,33 @@ do
do
mkdir tf
echo ===== attempt $attempts of $create_attempts ==============
echo ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3" ten_of_us.yml
ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3 delete_tf=none" ten_of_us.yml
#
if [ $spot_recover -eq 1 ] && [[ ! -f "test_returned" ]] && [[ ! -f "cpu_type_failure" ]]; then
#
# Check to see if we used spot, and we are to recover if the system goes away.
#
if [[ -f test_started ]]; then
sp_check=`grep spot_range: ansible_vars_main.yml`
if [[ $sp_check == *","* ]]; then
echo Need to update the test list, do not execute the tests we already executed.
tests_list=`grep ^test: test_times | awk '{ print $2 }'`
test_rm=""
seper=""
for i in $tests_list; do
test_rm=$test_rm${seper}$i
seper=","
done
test_rm=${test_rm}${seper}
cp ansible_vars_main.yml ansible_vars_main.yml_back
sed "s/${test_rm}//g" < ansible_vars_main.yml > update
mv update ansible_vars_main.yml
cp ansible_vars.yml ansible_vars.yml_back
sed "s/${test_rm}//g" < ansible_vars.yml > update
mv update ansible_vars.yml
if [[ -f test_times ]]; then
report_usage
fi
mv test_times test_times_spot
mv if_spot_fail instance_cost
rm -rf tf
rm tf.rtc
#
# Next attempt it without spot pricing.
#
mv ansible_run_vars.yml ansible_run_vars.yml_spot_died
grep -v "spot_range:" ansible_vars_main.yml | grep -v spot_start_price > spot_repair
echo " spot_start_price: 0" >> spot_repair
echo " spot_range: 0" >> spot_repair
mv spot_repair ansible_vars_main.yml
grep -v "spot_range:" ansible_vars.yml | grep -v spot_start_price > spot_repair
echo " spot_start_price: 0" >> spot_repair
echo " spot_range: 0" >> spot_repair
mv spot_repair ansible_main.yml
mkdir tf
echo ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3" ten_of_us.yml
ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3 delete_tf=none" ten_of_us.yml
else
#
# Not spot, test started, system died.
#
echo Error: test started, system died.
exit
sp_check=`grep spot_range: ansible_vars_main.yml | cut -d: -f 2 | sed "s/ //g"`
if [[ $sp_check != "0" ]]; then
touch spot_failure
spot_recover=0
exit 1
fi
else
#
# Rarely will happen.
# Not spot, test started, system died.
#
echo Error: did not start the test. >> test_start_failure
exit
echo Error: test started, system died.
exit 1
fi
else
#
# Rarely will happen.
#
if [[ ! -f test_returned ]]; then
echo Error: test started, system died.
exit 1
fi
fi
if [[ ! -f "cpu_type_failure" ]]; then
Expand All @@ -273,7 +240,7 @@ do
fi
let "attempts=${attempts}+1"
rm -rf ansible_install_group ansible_test_group boot_info cloud_timings copy_git_file_status cr_status cpu_type_failure
rm -rf dev_env_status install_status meta_data.yml tar_status terraform_data.yml test_defs.yml test_info test_times tf_results
rm -rf dev_env_status install_status meta_data.yml tar_status terraform_data.yml test_times tf_results
mv tf tf_delete_${attempts}
done
$top_dir/bin/remove_wrong_cpus $top_dir/$direct
Expand All @@ -296,3 +263,4 @@ for i in `ls results*tar`; do
echo $i >> tuned_run_info
cat $check_file >> tuned_run_info
done
exit 0

0 comments on commit 56acc03

Please sign in to comment.