Updates for AWS spot

redhat-performance · Aug 15, 2023 · fee56e3 · fee56e3
1 parent 2bec2ca
commit fee56e3
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 58 deletions.
diff --git a/bin/burden b/bin/burden
@@ -25,6 +25,9 @@
 #
 
 # Burdens version number.
+
+cli=${0}
+arguments=("$@")
 version="3.2"
 export ANSIBLE_ROLES_PATH=$HOME/.ansible/collections/ansible_collections/pbench/agent/roles:$ANSIBLE_ROLES_PATH
 #
@@ -1264,19 +1267,15 @@ retrieve_spot_from_config()
 			#
 			# For AWS obtain the possible starting price.
 			#
-			worker=`curl -L ec2.shop?filter=$1 -H 'accept: json'`
+			worker=`aws ec2 describe-spot-price-history --start-time=$(date +%s) --product-descriptions="Linux/UNIX" --query 'SpotPriceHistory[*].{az:AvailabilityZone, price:SpotPrice}' --instance-types $host_or_cloud_inst | tail -1 | awk '{ print $2}'`
 			if [[ $? -ne 0 ]]; then
 				cleanup_and_exit "Error: unable to retrieve spot info for $1" 1
 			fi
-			if [[ $worker == *":[]"* ]]; then
-				cleanup_and_exit "Error: unable to retrieve spot info for $1" 1
-			fi
+			gl_spot_price=$worker
+			gl_spot_cap=$gl_base_cost
 			#
 			# We will default to 5 intervals
 			# 
-			gl_spot_cap=`echo $worker | cut -d':' -f 8 | cut -d',' -f 1`
-			gl_base_cost=$gl_spot_cap;
-			gl_spot_price=`echo $worker | cut -d':' -f 10 | cut -d: -f 10 | cut -d'"' -f 2`
 			gl_spot_increment=`echo "scale=2;((($gl_spot_cap - $gl_spot_price)*1.5)/5.00)" | bc`
 		else
 			spot_string=`grep $1 ${gl_test_def_dir}/spot_price.cfg`
@@ -1506,6 +1505,10 @@ create_ansible_options()
 		#
 		if [[ $gl_system_type != "local" ]] && [[ $gl_spot_cap -eq 0 ]]; then
 			echo No spot price designated, retrieving
+			worker=`curl -L ec2.shop?filter=$1 -H 'accept: json'`
+			if [[ $? -ne 0 ]]; then
+				cleanup_and_exit "Unable to retrieve base cost $1" 1
+			fi
 			if [[ $gl_system_type == "aws" ]]; then
 				worker=`curl -L ec2.shop?filter=$host_or_cloud_inst -H 'accept: json'`
 				gl_base_cost=`echo $worker | cut -d':' -f 8 | cut -d',' -f 1`
@@ -1809,6 +1812,7 @@ create_ansible_options()
 			if [[ "$gl_ssh_key_file" != "" ]]; then
 				base_string="${base_string} -s $gl_ssh_key_file"
 			fi
+			echo $cli "${arguments[@]}" | sed "s/bin/./g" > ${run_dir}/exec_command
 			kick_off.sh $base_string | tee ${run_dir}/ansible_log &
 			pids[${pindex}]=$!
 			let "pindex=$pindex+1"
@@ -4009,4 +4013,46 @@ remove_hostnames_added
 if [[ $gl_warning_string != "" ]]; then
 	echo -e $gl_warning_string
 fi
+
+#
+# Walk each run directory and, for any marked with a spot_failure file, re-run the tests that have not yet completed.
+#
+pushd ${gl_run_prefix}/${gl_os_vendor}/${gl_system_type} > /dev/null
+for i in `ls -d *`; do
+	if [ -f $i/spot_failure ]; then
+		rm $i/spot_failure
+		echo $i has spot failure running again with out spot
+		next_one=0
+		for data in `cat $i/exec_command`; do
+			if [ $next_one -eq 1 ]; then
+				file_to_use=$data
+				break
+			fi
+			if [[ $data == "--run_file" ]]; then
+				next_one=1
+			fi
+		done
+		test_separ=""
+		tests=""
+		pushd $i > /dev/null
+		for i in `cat test_times | cut -d' ' -f2`; do
+			tests=${tests}${test_separ}${i}
+			test_separ=","
+		done
+		popd > /dev/null
+		echo tests >/tmp/tests
+		tests=${tests}","
+		tests_left=`sed "s/${tests}//g" $gl_top_dir/$file_to_use`
+		if [[ -f /home/zathras_log/spot_fails ]]; then
+			date=`date`
+			echo "$data: $date tests completed: ${tests}, tests left ${tests_left}" >> /home/zathras_log/spot_fails
+		fi
+		sed "s/--use_spot/--no_spot_recover/g" $gl_top_dir/$file_to_use | sed "s/${tests}//g" > $gl_top_dir/rerun
+		pushd $gl_top_dir > /dev/null
+		chmod 755 rerun
+		./rerun
+		popd > /dev/null
+	fi
+done
+popd > /dev/null
 cleanup_and_exit "" 0
diff --git a/bin/kick_off.sh b/bin/kick_off.sh
@@ -147,7 +147,7 @@ if [[ $ssh_key_file != "" ]]; then
 	cp $ssh_key_file config/user.pem_test
 	if [ ! -s $ssh_key_file ]; then
 		echo "${ssh_key_file} is zero length, please fix.  Test is exiting"
-		exit
+		exit 1
 	fi
 	chmod 500 config/user.pem_test
 fi
@@ -191,66 +191,33 @@ do
 	do
 		mkdir tf
 		echo ===== attempt $attempts of $create_attempts ==============
-		echo ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3" ten_of_us.yml
 		ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3 delete_tf=none" ten_of_us.yml
 		#
 		if [ $spot_recover -eq 1 ] && [[ ! -f "test_returned" ]] && [[ ! -f "cpu_type_failure" ]]; then
 			#
 			# Check to see if we used spot, and we are to recover if the system goes away.
 			#
 			if [[ -f test_started ]]; then
-				sp_check=`grep spot_range: ansible_vars_main.yml`
-				if [[ $sp_check == *","* ]]; then
-					echo Need to update the test list, do not execute the tests we already executed.
-					tests_list=`grep ^test: test_times | awk '{ print $2 }'`
-					test_rm=""
-					seper=""
-					for i in $tests_list; do
-						test_rm=$test_rm${seper}$i
-						seper=","
-					done
-					test_rm=${test_rm}${seper}
-					cp ansible_vars_main.yml ansible_vars_main.yml_back
-					sed "s/${test_rm}//g" < ansible_vars_main.yml > update
-					mv update ansible_vars_main.yml
-					cp ansible_vars.yml ansible_vars.yml_back
-					sed "s/${test_rm}//g" < ansible_vars.yml > update
-					mv update ansible_vars.yml
-					if [[ -f test_times ]]; then
-        					report_usage
-					fi
-					mv test_times test_times_spot
-					mv if_spot_fail instance_cost
-					rm -rf tf
-					rm tf.rtc
-					#
-					# Next attempt it without spot pricing.
-					#
-					mv ansible_run_vars.yml ansible_run_vars.yml_spot_died
-					grep -v "spot_range:" ansible_vars_main.yml | grep -v spot_start_price > spot_repair
-					echo "  spot_start_price: 0" >> spot_repair
-					echo "  spot_range: 0" >> spot_repair
-					mv  spot_repair ansible_vars_main.yml
-					grep -v "spot_range:" ansible_vars.yml  | grep -v  spot_start_price > spot_repair
-					echo "  spot_start_price: 0" >> spot_repair
-					echo "  spot_range: 0" >> spot_repair
-					mv spot_repair ansible_main.yml
-					mkdir tf
-					echo ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3" ten_of_us.yml
-					ansible-playbook -i ./inventory --extra-vars "working_dir=${curdir} ansible_python_interpreter=/usr/bin/python3 delete_tf=none" ten_of_us.yml
-				else
-					#
-					# Not spot, test started, system died.
-					#
-					echo Error: test started, system died.
-					exit
+				sp_check=`grep spot_range: ansible_vars_main.yml | cut -d: -f 2 | sed "s/ //g"`
+				if [[ $sp_check != "0" ]]; then
+					touch spot_failure
+					spot_recover=0
+					exit 1
 				fi
 			else
 				#
-				# Rarely will happen.
+				# No test_started file was found. NOTE(review): the message below still says "test started" - confirm intent.
 				#
-				echo Error: did not start the test. >>  test_start_failure
-				exit
+				echo Error: test started, system died.
+				exit 1
+			fi
+		else
+			#
+			# Rarely will happen.
+			#
+			if [[ ! -f test_returned ]]; then
+				echo Error: test started, system died.
+				exit 1
 			fi
 		fi
 		if [[ ! -f "cpu_type_failure" ]]; then
@@ -273,7 +240,7 @@ do
 		fi
 		let "attempts=${attempts}+1"
 		rm -rf ansible_install_group ansible_test_group boot_info cloud_timings copy_git_file_status cr_status cpu_type_failure
-		rm -rf dev_env_status install_status meta_data.yml tar_status terraform_data.yml test_defs.yml test_info test_times tf_results
+		rm -rf dev_env_status install_status meta_data.yml tar_status terraform_data.yml test_times tf_results
 		mv tf tf_delete_${attempts}
 	done
 	$top_dir/bin/remove_wrong_cpus $top_dir/$direct
@@ -296,3 +263,4 @@ for i in `ls results*tar`; do
 	echo $i >> tuned_run_info
 	cat $check_file >> tuned_run_info
 done
+exit 0