From 313b1339c469e74a2cb5630ff6505cec0b866118 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20Mac=C3=ADk?=
Date: Wed, 10 Jan 2024 16:12:10 +0100
Subject: [PATCH] feat(RHIDP-893): Collect DB scalability metrics and results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Pavel Macík
---
 Makefile                                      | 10 ++--
 ci-scripts/collect-results.sh                 | 51 +++++++++++++++----
 ci-scripts/scalability/collect-results.sh     | 51 +++++++++++++++++++
 config/cluster_read_config.populate.yaml      | 14 +++++
 ...fig.yaml => cluster_read_config.test.yaml} | 13 +++++
 5 files changed, 125 insertions(+), 14 deletions(-)
 create mode 100644 config/cluster_read_config.populate.yaml
 rename config/{cluster_read_config.yaml => cluster_read_config.test.yaml} (93%)

diff --git a/Makefile b/Makefile
index 1104a0f..59a1e62 100644
--- a/Makefile
+++ b/Makefile
@@ -72,12 +72,16 @@ namespace:
 ## Deploy RHDH
 .PHONY: deploy-rhdh
 deploy-rhdh:
+	date --utc -Ins>$(TMP_DIR)/deploy-before
 	cd ./ci-scripts/rhdh-setup/; ./deploy.sh -i
+	date --utc -Ins>$(TMP_DIR)/deploy-after
 
 ## Create users, groups and objects such as components and APIs in RHDH
 .PHONY: populate-rhdh
 populate-rhdh:
+	date --utc -Ins>$(TMP_DIR)/populate-before
 	cd ./ci-scripts/rhdh-setup/; ./deploy.sh -c
+	date --utc -Ins>$(TMP_DIR)/populate-after
 
 ## Undeploy RHDH
 .PHONY: undeploy-rhdh
@@ -120,15 +124,15 @@ clean:
 .PHONY: test
 test:
 	mkdir -p $(ARTIFACT_DIR)
-	echo $(SCENARIO)>$(ARTIFACT_DIR)/benchmark-scenario
+	echo $(SCENARIO)>$(TMP_DIR)/benchmark-scenario
 	cat locust-test-template.yaml | envsubst | kubectl apply --namespace $(LOCUST_NAMESPACE) -f -
 	kubectl create --namespace $(LOCUST_NAMESPACE) configmap locust.$(SCENARIO) --from-file scenarios/$(SCENARIO).py --dry-run=client -o yaml | kubectl apply --namespace $(LOCUST_NAMESPACE) -f -
-	date --utc -Ins>$(ARTIFACT_DIR)/benchmark-before
+	date --utc -Ins>$(TMP_DIR)/benchmark-before
 	timeout=$$(date -d "30 seconds" "+%s"); while [ -z "$$(kubectl get --namespace $(LOCUST_NAMESPACE) pod -l performance-test-pod-name=$(SCENARIO)-test-master -o name)" ]; do if [ "$$(date "+%s")" -gt "$$timeout" ]; then echo "ERROR: Timeout waiting for locust master pod to start"; exit 1; else echo "Waiting for locust master pod to start..."; sleep 5s; fi; done
 	kubectl wait --namespace $(LOCUST_NAMESPACE) --for=condition=Ready=true $$(kubectl get --namespace $(LOCUST_NAMESPACE) pod -l performance-test-pod-name=$(SCENARIO)-test-master -o name)
 	@echo "Getting locust master log:"
 	kubectl logs --namespace $(LOCUST_NAMESPACE) -f -l performance-test-pod-name=$(SCENARIO)-test-master | tee load-test.log
-	date --utc -Ins>$(ARTIFACT_DIR)/benchmark-after
+	date --utc -Ins>$(TMP_DIR)/benchmark-after
 	@echo "All done!!!"
 
 ## Run the scalability test
diff --git a/ci-scripts/collect-results.sh b/ci-scripts/collect-results.sh
index 313ffe7..026f693 100755
--- a/ci-scripts/collect-results.sh
+++ b/ci-scripts/collect-results.sh
@@ -9,6 +9,11 @@ echo -e "\n === Collecting test results and metrics ===\n"
 ARTIFACT_DIR=$(readlink -m "${ARTIFACT_DIR:-.artifacts}")
 mkdir -p "${ARTIFACT_DIR}"
 
+export TMP_DIR
+
+TMP_DIR=$(readlink -m "${TMP_DIR:-.tmp}")
+mkdir -p "${TMP_DIR}"
+
 RHDH_NAMESPACE=${RHDH_NAMESPACE:-rhdh-performance}
 
 cli="oc"
@@ -47,9 +52,16 @@ try_gather_dir() {
     fi
 }
 
-try_gather_file ./.tmp/backstage.url
-try_gather_file ./.tmp/keycloak.url
-try_gather_file ./.tmp/chart-values.yaml
+try_gather_file "${TMP_DIR}/backstage.url"
+try_gather_file "${TMP_DIR}/keycloak.url"
+try_gather_file "${TMP_DIR}/chart-values.yaml"
+try_gather_file "${TMP_DIR}/deploy-before"
+try_gather_file "${TMP_DIR}/deploy-after"
+try_gather_file "${TMP_DIR}/populate-before"
+try_gather_file "${TMP_DIR}/populate-after"
+try_gather_file "${TMP_DIR}/benchmark-before"
+try_gather_file "${TMP_DIR}/benchmark-after"
+try_gather_file "${TMP_DIR}/benchmark-scenario"
 try_gather_file load-test.log
 
 PYTHON_VENV_DIR=.venv
@@ -71,22 +83,39 @@ set +u
 # shellcheck disable=SC1090,SC1091
 source $PYTHON_VENV_DIR/bin/activate
 set -u
-mstart=$(date --utc --date "$(cat "${ARTIFACT_DIR}/benchmark-before")" --iso-8601=seconds)
-mend=$(date --utc --date "$(cat "${ARTIFACT_DIR}/benchmark-after")" --iso-8601=seconds)
+# populate phase
+if [ "$PRE_LOAD_DB" == "true" ]; then
+    mstart=$(date --utc --date "$(cat "${TMP_DIR}/populate-before")" --iso-8601=seconds)
+    mend=$(date --utc --date "$(cat "${TMP_DIR}/populate-after")" --iso-8601=seconds)
+    mhost=$(kubectl -n openshift-monitoring get route -l app.kubernetes.io/name=thanos-query -o json | jq --raw-output '.items[0].spec.host')
+    status_data.py \
+        --status-data-file "$monitoring_collection_data" \
+        --additional config/cluster_read_config.populate.yaml \
+        --monitoring-start "$mstart" \
+        --monitoring-end "$mend" \
+        --monitoring-raw-data-dir "$monitoring_collection_dir" \
+        --prometheus-host "https://$mhost" \
+        --prometheus-port 443 \
+        --prometheus-token "$($cli whoami -t)" \
+        -d &>>"$monitoring_collection_log"
+fi
+# test phase
+mstart=$(date --utc --date "$(cat "${TMP_DIR}/benchmark-before")" --iso-8601=seconds)
+mend=$(date --utc --date "$(cat "${TMP_DIR}/benchmark-after")" --iso-8601=seconds)
 mhost=$(kubectl -n openshift-monitoring get route -l app.kubernetes.io/name=thanos-query -o json | jq --raw-output '.items[0].spec.host')
-mversion=$(sed -n 's/^__version__ = "\(.*\)"/\1/p' "scenarios/$(cat "${ARTIFACT_DIR}/benchmark-scenario").py")
+mversion=$(sed -n 's/^__version__ = "\(.*\)"/\1/p' "scenarios/$(cat "${TMP_DIR}/benchmark-scenario").py")
 status_data.py \
     --status-data-file "$monitoring_collection_data" \
     --set \
-    results.started="$(cat "${ARTIFACT_DIR}/benchmark-before")" \
-    results.ended="$(cat "${ARTIFACT_DIR}/benchmark-after")" \
-    name="RHDH load test $(cat "${ARTIFACT_DIR}/benchmark-scenario")" \
-    metadata.scenario.name="$(cat "${ARTIFACT_DIR}/benchmark-scenario")" \
+    results.started="$(cat "${TMP_DIR}/benchmark-before")" \
+    results.ended="$(cat "${TMP_DIR}/benchmark-after")" \
+    name="RHDH load test $(cat "${TMP_DIR}/benchmark-scenario")" \
+    metadata.scenario.name="$(cat "${TMP_DIR}/benchmark-scenario")" \
     metadata.scenario.version="$mversion" \
     -d &>"$monitoring_collection_log"
 status_data.py \
     --status-data-file "$monitoring_collection_data" \
-    --additional config/cluster_read_config.yaml \
+    --additional config/cluster_read_config.test.yaml \
     --monitoring-start "$mstart" \
     --monitoring-end "$mend" \
     --monitoring-raw-data-dir "$monitoring_collection_dir" \
diff --git a/ci-scripts/scalability/collect-results.sh b/ci-scripts/scalability/collect-results.sh
index 13d3e1d..67dc938 100755
--- a/ci-scripts/scalability/collect-results.sh
+++ b/ci-scripts/scalability/collect-results.sh
@@ -8,3 +8,54 @@ echo -e "\n === Collecting test results and metrics for RHDH scalability test ===\n"
 
 ARTIFACT_DIR=$(readlink -m "${ARTIFACT_DIR:-.artifacts}")
 mkdir -p "$ARTIFACT_DIR"
+
+read -ra workers <<<"${SCALE_WORKERS:-5}"
+
+read -ra active_users_spawn_rate <<<"${SCALE_ACTIVE_USERS_SPAWN_RATES:-1:1 200:40}"
+
+read -ra bs_users_groups <<<"${SCALE_BS_USERS_GROUPS:-1:1 15000:5000}"
+
+read -ra catalog_sizes <<<"${SCALE_CATALOG_SIZES:-1 10000}"
+
+read -ra replicas <<<"${SCALE_REPLICAS:-5}"
+
+read -ra db_storages <<<"${SCALE_DB_STORAGES:-1Gi 2Gi}"
+
+csv_delim=";"
+csv_delim_quoted="\"$csv_delim\""
+
+for w in "${workers[@]}"; do
+    for r in "${replicas[@]}"; do
+        for bu_bg in "${bs_users_groups[@]}"; do
+            IFS=":" read -ra tokens <<<"${bu_bg}"
+            bu="${tokens[0]}"
+            bg="${tokens[1]}"
+            for s in "${db_storages[@]}"; do
+                for au_sr in "${active_users_spawn_rate[@]}"; do
+                    IFS=":" read -ra tokens <<<"${au_sr}"
+                    active_users=${tokens[0]}
+                    output="$ARTIFACT_DIR/scalability_c-${r}r-db_${s}-${bu}bu-${bg}bg-${w}w-${active_users}u.csv"
+                    echo "CatalogSize${csv_delim}AverageRPS${csv_delim}MaxRPS${csv_delim}AverageRT${csv_delim}MaxRT${csv_delim}FailRate${csv_delim}DBStorageUsed${csv_delim}DBStorageAvailable${csv_delim}DBStorageCapacity" >"$output"
+                    for c in "${catalog_sizes[@]}"; do
+                        index="${r}r-db_${s}-${bu}bu-${bg}bg-${w}w-${c}c"
+                        benchmark_json="$(find . -name benchmark.json | grep "$index" || true)"
+                        echo -n "$c${csv_delim}" >>"$output"
+                        if [ -n "$benchmark_json" ]; then
+                            jq_cmd="(.results.\"locust-operator\".locust_requests_current_rps_Aggregated.mean | tostring) \
+                                + $csv_delim_quoted + (.results.\"locust-operator\".locust_requests_current_rps_Aggregated.max | tostring) \
+                                + $csv_delim_quoted + (.results.\"locust-operator\".locust_requests_avg_response_time_Aggregated.mean | tostring) \
+                                + $csv_delim_quoted + (.results.\"locust-operator\".locust_requests_avg_response_time_Aggregated.max | tostring) \
+                                + $csv_delim_quoted + (.results.\"locust-operator\".locust_requests_fail_ratio_Aggregated.mean | tostring) \
+                                + $csv_delim_quoted + (.measurements.cluster.pv_stats.populate.\"data-rhdh-postgresql-primary-0\".used_bytes.max | tostring) \
+                                + $csv_delim_quoted + (.measurements.cluster.pv_stats.populate.\"data-rhdh-postgresql-primary-0\".available_bytes.min | tostring) \
+                                + $csv_delim_quoted + (.measurements.cluster.pv_stats.populate.\"data-rhdh-postgresql-primary-0\".capacity_bytes.max | tostring)"
+                            sed -Ee 's/: ([0-9]+\.[0-9]*[X]+[0-9e\+-]*|[0-9]*X+[0-9]*\.[0-9e\+-]*|[0-9]*X+[0-9]*\.[0-9]*X+[0-9e\+-]+),/: "\1",/g' "$benchmark_json" | jq -rc "$jq_cmd" >>"$output"
+                        else
+                            echo "${csv_delim}${csv_delim}${csv_delim}${csv_delim}${csv_delim}${csv_delim}${csv_delim}" >>"$output"
+                        fi
+                    done
+                done
+            done
+        done
+    done
+done
diff --git a/config/cluster_read_config.populate.yaml b/config/cluster_read_config.populate.yaml
new file mode 100644
index 0000000..e72a163
--- /dev/null
+++ b/config/cluster_read_config.populate.yaml
@@ -0,0 +1,14 @@
+{% macro pv_stats(pvc) -%}
+# Collect data for PV stats
+- name: measurements.cluster.pv_stats.populate.{{pvc}}.capacity_bytes
+  monitoring_query: kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="{{ pvc }}"}
+  monitoring_step: 15
+- name: measurements.cluster.pv_stats.populate.{{pvc}}.used_bytes
+  monitoring_query: kubelet_volume_stats_used_bytes{persistentvolumeclaim="{{ pvc }}"}
+  monitoring_step: 15
+- name: measurements.cluster.pv_stats.populate.{{pvc}}.available_bytes
+  monitoring_query: kubelet_volume_stats_available_bytes{persistentvolumeclaim="{{ pvc }}"}
+  monitoring_step: 15
+{%- endmacro %}
+
+{{ pv_stats('data-rhdh-postgresql-primary-0') }}
diff --git a/config/cluster_read_config.yaml b/config/cluster_read_config.test.yaml
similarity index 93%
rename from config/cluster_read_config.yaml
rename to config/cluster_read_config.test.yaml
index ddfbc69..3ea8af0 100644
--- a/config/cluster_read_config.yaml
+++ b/config/cluster_read_config.test.yaml
@@ -174,7 +174,20 @@
 {{ monitor_pod('openshift-apiserver', 'apiserver', 15) }}
 {{ monitor_pod('openshift-kube-apiserver', 'kube-apiserver', 15, pod_suffix_regex='-ip-.+') }}
 
+{% macro pv_stats(pvc) -%}
+# Collect data for PV stats
+- name: measurements.cluster.pv_stats.test.{{pvc}}.capacity_bytes
+  monitoring_query: kubelet_volume_stats_capacity_bytes{persistentvolumeclaim="{{ pvc }}"}
+  monitoring_step: 15
+- name: measurements.cluster.pv_stats.test.{{pvc}}.used_bytes
+  monitoring_query: kubelet_volume_stats_used_bytes{persistentvolumeclaim="{{ pvc }}"}
+  monitoring_step: 15
+- name: measurements.cluster.pv_stats.test.{{pvc}}.available_bytes
+  monitoring_query: kubelet_volume_stats_available_bytes{persistentvolumeclaim="{{ pvc }}"}
+  monitoring_step: 15
+{%- endmacro %}
+{{ pv_stats('data-rhdh-postgresql-primary-0') }}
 # Results
 - name: results.locust-operator.locust_requests_avg_response_time_Aggregated
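
The scalability matrix in ci-scripts/scalability/collect-results.sh is driven
entirely by the SCALE_* environment variables. Below is a minimal sketch of a
local run, assuming the defaults from the patch and the default .artifacts
output directory; the file name and all data values are illustrative, not
taken from a real run:

    export SCALE_WORKERS="5"
    export SCALE_ACTIVE_USERS_SPAWN_RATES="1:1 200:40"
    export SCALE_BS_USERS_GROUPS="1:1 15000:5000"
    export SCALE_CATALOG_SIZES="1 10000"
    export SCALE_REPLICAS="5"
    export SCALE_DB_STORAGES="1Gi 2Gi"
    ./ci-scripts/scalability/collect-results.sh

    # One CSV per (workers, replicas, users:groups, storage, active users)
    # combination, e.g.:
    #   .artifacts/scalability_c-5r-db_1Gi-1bu-1bg-5w-1u.csv
    # with a header row plus one data row per catalog size (values made up):
    #   CatalogSize;AverageRPS;MaxRPS;AverageRT;MaxRT;FailRate;DBStorageUsed;DBStorageAvailable;DBStorageCapacity
    #   1;12.3;20.1;150.2;310.0;0.01;123456789;876543210;1073741824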
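
The sed expression placed in front of jq appears to be there to quote
malformed numeric literals (digits polluted with X characters) so that a
benchmark.json containing them still parses. A stand-alone sketch of that
behavior, using a made-up path and made-up values:

    echo '{"results": {"rps": 42.5, "rt": 12.3X4, "ok": true}}' >/tmp/benchmark.json
    sed -Ee 's/: ([0-9]+\.[0-9]*[X]+[0-9e\+-]*|[0-9]*X+[0-9]*\.[0-9e\+-]*|[0-9]*X+[0-9]*\.[0-9]*X+[0-9e\+-]+),/: "\1",/g' \
        /tmp/benchmark.json | jq -r '.results.rt'
    # prints 12.3X4 -- the malformed value has been turned into a JSON string,
    # so jq can parse the file instead of failing on it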