Skip to content

Commit

Permalink
Add web APIs to the variable client workload runners, add startup scr…
Browse files Browse the repository at this point in the history
…ipts (#482)

* Restore removed (?) old variable clients

* Add web apis to the runners

* Add workload startup scripts:

* Fix lint, add starting clients to ana runner too

* Start up RA runner too

* Minor API improvements
  • Loading branch information
geoffxy authored Mar 29, 2024
1 parent f4e6f7b commit 7666acc
Show file tree
Hide file tree
Showing 10 changed files with 1,095 additions and 706 deletions.
1 change: 1 addition & 0 deletions experiments/16-demo/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
out
194 changes: 194 additions & 0 deletions experiments/16-demo/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
function start_brad_w_ui() {
  # Launches the BRAD daemon (with its web UI enabled) in the background from
  # the repository root.
  #
  # Arguments:
  #   $1 - system config file path, relative to the calling directory
  #   $2 - physical config file path (resolvable from the repository root)
  # Globals:
  #   schema_name (read)  - expected to be set (e.g., by extract_named_arguments)
  #   brad_pid    (write) - PID of the backgrounded daemon process
  system_config_file=$1
  physical_config_file=$2
  curr_dir=$(pwd)

  # The daemon must be started from the repository root; remember the caller's
  # directory so the system config path stays valid.
  pushd ../../ || return 1
  brad daemon \
    --physical-config-file "$physical_config_file" \
    --system-config-file "$curr_dir/$system_config_file" \
    --schema-name "$schema_name" \
    --ui \
    &
  brad_pid=$!
  popd || return 1
}

function cancel_experiment() {
  # Aborts a running experiment: sends SIGINT to every PID passed as an
  # argument, then to the BRAD daemon. Does not wait for the processes to exit
  # (see graceful_shutdown for the waiting variant).
  #
  # Arguments:
  #   $@ - PIDs of workload runner processes to interrupt
  # Globals:
  #   brad_pid (read) - PID of the BRAD daemon started by start_brad_w_ui
  local pid_var
  for pid_var in "$@"; do
    kill -INT "$pid_var"
  done
  kill -INT "$brad_pid"
}

function graceful_shutdown() {
  # Cleanly ends an experiment: interrupts all workload runner PIDs, waits for
  # each to exit, then interrupts the BRAD daemon and waits for it too.
  #
  # Arguments:
  #   $@ - PIDs of workload runner processes to stop
  # Globals:
  #   brad_pid (read) - PID of the BRAD daemon started by start_brad_w_ui
  local pid_var
  # Signal everything first so the runners shut down in parallel, then wait.
  for pid_var in "$@"; do
    kill -INT "$pid_var"
  done
  for pid_var in "$@"; do
    wait "$pid_var"
  done

  kill -INT "$brad_pid"
  wait "$brad_pid"
}

function terminate_process_group() {
  # Waits `initial_wait_s` seconds for a process to exit on its own; if it is
  # still alive, force-kills its children and then the process itself.
  #
  # Arguments:
  #   $1 - PID of the process to check
  #   $2 - grace period in seconds before force-killing
  local pid=$1
  local initial_wait_s=$2
  # Bug fix: previously slept on the raw `$2`; use the named local.
  sleep "$initial_wait_s"
  if kill -0 "$pid" >/dev/null 2>&1; then
    # Kill the children first, then the process itself. Note: the original used
    # `pkill -KILL $pid`, which pattern-matches process *names* against the PID
    # string; `kill -KILL` targets the PID directly.
    pkill -KILL -P "$pid"
    kill -KILL "$pid"
    echo "NOTE: Forced process $pid to stop."
  else
    echo "Process $pid stopped gracefully."
  fi
}

function log_workload_point() {
  # Appends a UTC-timestamped marker line ("<timestamp>,<msg>") to the
  # experiment's points log, used later to align workload phases with metrics.
  #
  # Arguments:
  #   $1 - marker message to record
  # Globals:
  #   COND_OUT (read) - output directory for the current Cond experiment run
  local msg=$1
  local now
  now=$(date --utc "+%Y-%m-%d %H:%M:%S")
  echo "$now,$msg" >> "$COND_OUT/points.log"
}

function pause_for_s_past_timepoint() {
  # Ensures at least `wait_s` seconds have passed since `timepoint` (a Unix
  # timestamp); sleeps for the remaining time if they have not.
  #
  # Arguments:
  #   $1 - reference Unix timestamp (seconds)
  #   $2 - minimum number of seconds that must elapse past the timepoint
  local timepoint="$1"
  local wait_s="$2"

  local now_ts
  now_ts="$(date -u +%s)"
  elapsed_s="$(($now_ts - $timepoint))"

  # Guard clause: nothing to do once enough time has already gone by.
  if (( $elapsed_s >= $wait_s )); then
    return
  fi

  leftover_s=$(($wait_s - $elapsed_s))
  >&2 echo "Waiting $leftover_s seconds before continuing..."
  sleep $leftover_s
}

function poll_file_for_event() {
  # Polls `file` every 30 seconds until its size differs from the size observed
  # at call time AND its last line contains `event_name`. Gives up and logs a
  # timeout marker after `timeout_minutes` minutes.
  #
  # Arguments:
  #   $1 - path of the log file to watch
  #   $2 - substring identifying the event in the file's last line
  #   $3 - timeout, in minutes
  local file="$1"
  local event_name="$2"
  local timeout_minutes="$3"

  local baseline_size
  baseline_size=$(stat -c %s "$file")

  local start_ts
  start_ts=$(date +%s)

  local size_now elapsed tail_line
  while true; do
    size_now=$(stat -c %s "$file")

    # NOTE: the baseline is intentionally never refreshed — any growth past the
    # initial size triggers a last-line check on every subsequent poll.
    if [[ $size_now -ne $baseline_size ]]; then
      tail_line=$(tail -n 1 "$file")
      if [[ $tail_line == *"$event_name"* ]]; then
        >&2 echo "Detected new $event_name!"
        return
      fi
    fi

    elapsed=$(( $(date +%s) - $start_ts ))
    if [[ $elapsed -ge $((timeout_minutes * 60)) ]]; then
      >&2 echo "Timeout reached. Did not detect $event_name within $timeout_minutes minutes."
      log_workload_point "timeout_poll_${event_name}"
      return
    fi

    sleep 30
  done
}

function extract_named_arguments() {
  # Parses `--key=value` style arguments into global shell variables, one
  # variable per recognized flag (e.g., --ra-clients=4 sets ra_clients=4).
  # Each argument is passed through `eval` so environment variables embedded in
  # the values are expanded — this function should only be run on trusted input.
  #
  # Arguments:
  #   $@ - the caller's raw command-line arguments
  # Globals (written, when the matching flag is present):
  #   ra_clients, t_clients_lo, t_clients_hi, ra_query_indexes,
  #   ra_query_bank_file, other_query_bank_file, ra_gap_s, ra_gap_std_s,
  #   num_front_ends, run_for_s, physical_config_file, system_config_file,
  #   skip_replan, schema_name, dataset_type, query_sequence_file,
  #   snowset_query_frequency_path, snowset_client_dist_path,
  #   snowset_gap_dist_path, txn_scale_factor, is_daylong_hd
  local val phys_arg
  for val in "$@"; do
    phys_arg=$(eval "echo $val")

    # Strip the flag prefix with ${var#pattern} instead of hard-coded substring
    # offsets (the old ${phys_arg:N} form silently broke if a flag was renamed).
    case "$phys_arg" in
      --ra-clients=?*) ra_clients=${phys_arg#--ra-clients=} ;;
      --t-clients-lo=?*) t_clients_lo=${phys_arg#--t-clients-lo=} ;;
      --t-clients-hi=?*) t_clients_hi=${phys_arg#--t-clients-hi=} ;;
      --ra-query-indexes=?*) ra_query_indexes=${phys_arg#--ra-query-indexes=} ;;
      --ra-query-bank-file=?*) ra_query_bank_file=${phys_arg#--ra-query-bank-file=} ;;
      --other-query-bank-file=?*) other_query_bank_file=${phys_arg#--other-query-bank-file=} ;;
      --ra-gap-s=?*) ra_gap_s=${phys_arg#--ra-gap-s=} ;;
      --ra-gap-std-s=?*) ra_gap_std_s=${phys_arg#--ra-gap-std-s=} ;;
      --num-front-ends=?*) num_front_ends=${phys_arg#--num-front-ends=} ;;
      --run-for-s=?*) run_for_s=${phys_arg#--run-for-s=} ;;
      --physical-config-file=?*) physical_config_file=${phys_arg#--physical-config-file=} ;;
      --system-config-file=?*) system_config_file=${phys_arg#--system-config-file=} ;;
      --skip-replan=?*) skip_replan=${phys_arg#--skip-replan=} ;;
      --schema-name=?*) schema_name=${phys_arg#--schema-name=} ;;
      --dataset-type=?*) dataset_type=${phys_arg#--dataset-type=} ;;
      --query-sequence-file=?*) query_sequence_file=${phys_arg#--query-sequence-file=} ;;
      --snowset-query-frequency-path=?*) snowset_query_frequency_path=${phys_arg#--snowset-query-frequency-path=} ;;
      --snowset-client-dist-path=?*) snowset_client_dist_path=${phys_arg#--snowset-client-dist-path=} ;;
      --snowset-gap-dist-path=?*) snowset_gap_dist_path=${phys_arg#--snowset-gap-dist-path=} ;;
      --txn-scale-factor=?*) txn_scale_factor=${phys_arg#--txn-scale-factor=} ;;
      # Boolean flag: any non-empty value enables it (value itself is ignored).
      --is-daylong-hd=?*) is_daylong_hd=1 ;;
    esac
  done
}
Empty file.
10 changes: 10 additions & 0 deletions experiments/16-demo/run_scale_down_workload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#! /bin/bash

# Launches the scale-down demo workload.
#
# Usage: ./run_scale_down_workload.sh <physical-config-file>

# Resolve this script's directory and run from there so relative paths
# (common.sh, the config file, the impl script) work from any caller cwd.
script_loc=$(cd "$(dirname "$0")" && pwd -P)
cd "$script_loc" || exit 1
source common.sh

./scale_down_workload_impl.sh \
  --physical-config-file="$1" \
  --system-config-file=scale_down_config.yml \
  --schema-name=imdb_extended_100g
163 changes: 163 additions & 0 deletions experiments/16-demo/scale_down_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# This file contains configurations that are used by BRAD. These are default
# values and should be customized for specific situations.

# BRAD's front end servers will listen for client connections on this interface
# and port. If `num_front_ends` is greater than one, subsequent front ends will
# listen on successive ports (e.g., 6584, 6585, etc.).
front_end_interface: "0.0.0.0"
front_end_port: 6583
num_front_ends: 12

# Logging paths. If the value is in ALL_CAPS (with underscores), it is
# interpreted as an environment variable (BRAD will log to the path stored in
# the environment variable).

# Where BRAD's daemon process will write its logs.
daemon_log_file: /tmp

# Where BRAD's front end processes will write their logs.
front_end_log_path: /tmp

# Where BRAD's blueprint planner will write debug logs.
planner_log_path: /tmp

# Where BRAD's metrics loggers will write their logs.
metrics_log_path: /tmp

# Probability that each transactional query will be logged.
txn_log_prob: 0.01

# Set to a non-zero value to enable automatic data syncing. When this is set
# to 0, automatic syncing is disabled.
data_sync_period_seconds: 0

# BRAD's front end servers will report their metrics at regular intervals.
front_end_metrics_reporting_period_seconds: 30
front_end_query_latency_buffer_size: 100

# `default` means to use the policy encoded in the blueprint. Other values will
# override the blueprint.
routing_policy: default

# Whether to disable table movement for benchmark purposes (i.e., keep all
# tables on all engines.)
disable_table_movement: true

# Epoch length for metrics and forecasting. This is the granularity at which
# metrics/forecasting will be performed.
epoch_length:
weeks: 0
days: 0
hours: 0
minutes: 1

# Blueprint planning strategy.
strategy: fp_query_based_beam

# Used to specify the period of time over which to use data for planning.
# Currently, this is a "look behind" window for the workload.
planning_window:
weeks: 0
days: 0
hours: 1
minutes: 0

# Used to aggregate metrics collected in the planning window.
metrics_agg:
method: ewm # 'mean' is another option
alpha: 0.86466472 # 1 - 1 / e^2

# Used during planning.
reinterpret_second_as: 1

# The query distribution must change by at least this much for a new blueprint
# to be accepted.
query_dist_change_frac: 0.1

# The search bound for the provisioning.
max_provisioning_multiplier: 2.5

# Flag options for blueprint planning.
use_io_optimized_aurora: true
use_recorded_routing_if_available: true
ensure_tables_together_on_one_engine: true

# Loads used to prime the system when no information is available.
aurora_initialize_load_fraction: 0.25
redshift_initialize_load_fraction: 0.25

# BRAD will not reduce predicted load lower than these values. Raise these
# values to be more conservative against mispredictions.
aurora_min_load_removal_fraction: 0.8
redshift_min_load_removal_fraction: 0.8

# Blueprint planning performance ceilings.
query_latency_p90_ceiling_s: 30.0
txn_latency_p90_ceiling_s: 0.030

aurora_provisioning_search_distance: 900.0
redshift_provisioning_search_distance: 900.0

# Used for ordering blueprints during planning.
comparator:
type: benefit_perf_ceiling # or `perf_ceiling`

benefit_horizon: # Only used by the `benefit_perf_ceiling` comparator
weeks: 0
days: 0
hours: 24
minutes: 0

penalty_threshold: 0.8 # Only used by the `benefit_perf_ceiling` comparator
penalty_power: 2 # Only used by the `benefit_perf_ceiling` comparator

aurora_max_query_factor: 4.0
aurora_max_query_factor_replace: 10000.0
redshift_peak_load_threshold: 99.0
redshift_peak_load_multiplier: 1.5

planner_max_workers: 16

# Used for precomputed predictions.
std_datasets:
- name: regular
path: workloads/IMDB_100GB/regular_test/
- name: adhoc
path: workloads/IMDB_100GB/adhoc_test/

# Blueprint planning trigger configs.

triggers:
enabled: false # Change to true when running.
check_period_s: 90 # Triggers are checked every X seconds.
check_period_offset_s: 360 # Wait 6 mins before starting.
observe_new_blueprint_mins: 3

elapsed_time:
disabled: true
multiplier: 60 # Multiplier over `planning_window`.

redshift_cpu:
lo: 15
hi: 85
sustained_epochs: 3

aurora_cpu:
lo: 15
hi: 85
sustained_epochs: 3

variable_costs:
disabled: true
threshold: 1.0

query_latency_ceiling:
ceiling_s: 30.0
sustained_epochs: 3

txn_latency_ceiling:
ceiling_s: 0.030
sustained_epochs: 3

recent_change:
delay_epochs: 5
Loading

0 comments on commit 7666acc

Please sign in to comment.