Add support to prometheus-scale to collect conprof data
mohit-sheth authored and chaitanyaenr committed Jul 9, 2020
1 parent 6ac11d1 commit 238eba7
Showing 4 changed files with 163 additions and 3 deletions.
6 changes: 6 additions & 0 deletions docs/prometheus-scale.md
@@ -98,3 +98,9 @@ Sleep interval for each block iteration in seconds.
### PROMETHEUS_SCALE_TEST_PREFIX
Default: `prometheus-scale`
Sets the pbench result test prefix.

### PPROF_COLLECT
Default: `false`
Set to `true` to enable pprof profile data collection from the kube-apiserver and Prometheus through [conprof](https://github.com/conprof/conprof).
Enabling this creates a few services that collect profiles from the apiserver pods, and the resulting conprof tarball is included in the pbench tarball.
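
For example, a collection-enabled run might look like the sketch below (the inventory and playbook paths are illustrative):

```sh
# Enable conprof/pprof collection for a prometheus-scale run.
# Inventory and playbook paths are illustrative.
export PPROF_COLLECT=true
ansible-playbook -vv -i hosts workloads/prometheus.yml
```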
13 changes: 11 additions & 2 deletions workloads/prometheus.yml
@@ -24,8 +24,6 @@
with_items:
- src: scale-ci-tooling-ns.yml
dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml"
- src: workload-prometheus-script-cm.yml
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-script-cm.yml"

- name: Slurp kubeconfig file
slurp:
@@ -42,6 +40,15 @@
src: "{{pbench_ssh_public_key_file}}"
register: pbench_ssh_public_key_file_slurp

- name: Set cluster details
include_role:
name: cluster_details

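# Wire up conprof/pprof collection only when PPROF_COLLECT is set.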
- name: Collect pprof
include_role:
name: pprof-collection
when: pprof_collect and pprof_collect != ""

- name: Template workload templates
template:
src: "{{item.src}}"
@@ -58,6 +65,8 @@
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml"
- src: workload-env.yml.j2
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-env.yml"
- src: workload-prometheus-script-cm.yml.j2
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-script-cm.yml"

- name: Check if scale-ci-tooling namespace exists
shell: |
144 changes: 143 additions & 1 deletion workloads/templates/workload-prometheus-script-cm.yml.j2
@@ -38,6 +38,14 @@ data:
fi
workload_log "Done configuring pbench for Prometheus scale"

if [ "${PPROF_COLLECT}" = "true" ]; then
workload_log "Configuring conprof"
envsubst < /root/workload/conprof.yaml.template > /tmp/conprof.yaml
envsubst < /root/workload/conprof_start.sh > /tmp/conprof_start.sh
envsubst < /root/workload/conprof_stop.sh > /tmp/conprof_stop.sh
workload_log "Done configuring conprof"
fi

workload_log "Running Prometheus scale workload"
if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then
pbench-user-benchmark --pbench-post='sh /root/workload/post-run.sh' -- sh /root/workload/workload.sh
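
The envsubst calls above render the conprof config and helper scripts by substituting exported environment variables into the templates. A minimal illustration of that step (the variable name is hypothetical):

```sh
# envsubst replaces ${VAR} references on stdin with exported values.
export PROFILE_TARGET="apiserver0.example.com"
echo 'targets: ["${PROFILE_TARGET}"]' | envsubst
# prints: targets: ["apiserver0.example.com"]
```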
Expand All @@ -53,10 +61,25 @@ data:
RESULT_DIR=/tmp
fi
workload_log "Completed Prometheus scale workload run"

conprof_start.sh: |
#!/bin/sh
set -o pipefail
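# Run conprof detached in the background; logs go to /tmp/conprof.log and profile data to /tmp/data.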
nohup /usr/bin/conprof all --config.file /tmp/conprof.yaml --log.level=debug --storage.tsdb.path=/tmp/data &>/tmp/conprof.log &
conprof_stop.sh: |
#!/bin/sh
set -o pipefail
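# Stop the background conprof process started by conprof_start.sh.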
pkill conprof

workload.sh: |
#!/bin/sh
set -ox pipefail

if [ "${PPROF_COLLECT}" = "true" ]; then
workload_log "Starting conprof"
bash /tmp/conprof_start.sh
fi

db_aging() {
while true; do
echo "$(date +'%m-%d-%y-%H:%M:%S') $(oc exec prometheus-k8s-0 -n openshift-monitoring -c prometheus -- df |grep /prometheus$)" >> /tmp/pvc_monitor_0.log
@@ -80,13 +103,22 @@
# stop the prometheus load
kill -9 ${loader_pid} ${db_aging_pid}

if [ "${PPROF_COLLECT}" = "true" ]; then
workload_log "Stopping conprof"
bash /tmp/conprof_stop.sh
cp /tmp/conprof.log ${benchmark_results_dir}/conprof.log
cp /tmp/conprof.yaml ${benchmark_results_dir}/conprof.yaml
tar -czvf ${benchmark_results_dir}/conprof.tar.gz /tmp/data/
workload_log "copied conprof tarballs and log"
fi

# test idle
sleep 300
post-run.sh: |
#!/bin/sh
set -ox pipefail

RESULT_DIR="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1/sample1
RESULT_DIR="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1-default/sample1
echo "Using RESULT_DIR of: \"${RESULT_DIR}\""
oc logs -n openshift-monitoring prometheus-k8s-0 -c prometheus --since=${PROMETHEUS_DURATION}s > ${RESULT_DIR}/oc_logs_1.log
oc logs -n openshift-monitoring prometheus-k8s-1 -c prometheus --since=${PROMETHEUS_DURATION}s > ${RESULT_DIR}/oc_logs_2.log
@@ -463,3 +495,113 @@ data:

def get_dashboards(self):
return self.dashboards
conprof.yaml.template: |
scrape_configs:
- job_name: 'apiserver0'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['apiserver0-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: true
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
- job_name: 'apiserver1'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['apiserver1-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: true
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
- job_name: 'apiserver2'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['apiserver2-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: true
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
- job_name: 'prometheus'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['prometheus-k8s-openshift-monitoring.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: false
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
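
Each profile enabled above corresponds to a standard Go pprof endpoint on the scrape target. As a sanity check, a single heap profile can be fetched by hand with something like this sketch (the hostname is illustrative and assumes the apiserver0 route exists):

```sh
# Pull one heap profile from the same endpoint conprof scrapes.
TOKEN=$(oc whoami -t)
curl -sk -H "Authorization: Bearer ${TOKEN}" \
  "https://apiserver0-openshift-kube-apiserver.apps.mycluster.example.com/debug/pprof/heap" \
  -o /tmp/heap.pprof
go tool pprof -top /tmp/heap.pprof
```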
3 changes: 3 additions & 0 deletions workloads/vars/prometheus.yml
@@ -29,6 +29,9 @@ pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
scale_ci_results_token: "{{ lookup('env', 'SCALE_CI_RESULTS_TOKEN')|default('', true) }}"
job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(360, true)|int }}"

# pprof variables
pprof_collect: "{{ lookup('env', 'PPROF_COLLECT')|default(false, true)|bool|lower }}"

# Prometheus scale workload specific parameters:
prometheus_scale_test_prefix: "{{ lookup('env', 'PROMETHEUS_SCALE_TEST_PREFIX')|default('prometheus-scale', true) }}"
prometheus_concurrency: "{{ lookup('env', 'PROMETHEUS_CONCURRENCY')|default(10, true)|int }}"