Add support to prometheus-scale to collect conprof data
mohit-sheth authored and chaitanyaenr committed Jul 9, 2020
1 parent 6ac11d1 commit 238eba7
Showing 4 changed files with 163 additions and 3 deletions.
6 changes: 6 additions & 0 deletions docs/prometheus-scale.md
@@ -98,3 +98,9 @@ Sleep interval for each block iteration in seconds.
### PROMETHEUS_SCALE_TEST_PREFIX
Default: `prometheus-scale`
Sets the pbench result test prefix.

### PPROF_COLLECT
Default: `false`
Set to `true` to enable pprof profile data collection from the kube-apiserver and Prometheus through [conprof](https://github.com/conprof/conprof).
Enabling this creates a few services that collect profiles from the apiserver pods, and the resulting conprof tarball is included in the pbench tarball.
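
For example, a collection-enabled run might look like the sketch below (the inventory and playbook paths are illustrative):

```sh
# Enable conprof/pprof collection for a prometheus-scale run.
# Inventory and playbook paths are illustrative.
export PPROF_COLLECT=true
ansible-playbook -vv -i hosts workloads/prometheus.yml
```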
13 changes: 11 additions & 2 deletions workloads/prometheus.yml
@@ -24,8 +24,6 @@
with_items:
- src: scale-ci-tooling-ns.yml
dest: "{{ansible_user_dir}}/scale-ci-tooling/scale-ci-tooling-ns.yml"
- src: workload-prometheus-script-cm.yml
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-script-cm.yml"

- name: Slurp kubeconfig file
slurp:
@@ -42,6 +40,15 @@
src: "{{pbench_ssh_public_key_file}}"
register: pbench_ssh_public_key_file_slurp

- name: Set cluster details
include_role:
name: cluster_details

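# Wire up conprof/pprof collection only when PPROF_COLLECT is set.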
- name: Collect pprof
include_role:
name: pprof-collection
when: pprof_collect and pprof_collect != ""

- name: Template workload templates
template:
src: "{{item.src}}"
@@ -58,6 +65,8 @@
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-job.yml"
- src: workload-env.yml.j2
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-env.yml"
- src: workload-prometheus-script-cm.yml.j2
dest: "{{ansible_user_dir}}/scale-ci-tooling/workload-prometheus-script-cm.yml"

- name: Check if scale-ci-tooling namespace exists
shell: |
144 changes: 143 additions & 1 deletion workloads/templates/workload-prometheus-script-cm.yml.j2
@@ -38,6 +38,14 @@ data:
fi
workload_log "Done configuring pbench for Prometheus scale"

if [ "${PPROF_COLLECT}" = "true" ]; then
workload_log "Configuring conprof"
envsubst < /root/workload/conprof.yaml.template > /tmp/conprof.yaml
envsubst < /root/workload/conprof_start.sh > /tmp/conprof_start.sh
envsubst < /root/workload/conprof_stop.sh > /tmp/conprof_stop.sh
workload_log "Done configuring conprof"
fi

workload_log "Running Prometheus scale workload"
if [ "${PBENCH_INSTRUMENTATION}" = "true" ]; then
pbench-user-benchmark --pbench-post='sh /root/workload/post-run.sh' -- sh /root/workload/workload.sh
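
The envsubst calls above render the conprof config and helper scripts by substituting exported environment variables into the templates. A minimal illustration of that step (the variable name is hypothetical):

```sh
# envsubst replaces ${VAR} references on stdin with exported values.
export PROFILE_TARGET="apiserver0.example.com"
echo 'targets: ["${PROFILE_TARGET}"]' | envsubst
# prints: targets: ["apiserver0.example.com"]
```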
Expand All @@ -53,10 +61,25 @@ data:
RESULT_DIR=/tmp
fi
workload_log "Completed Prometheus scale workload run"

conprof_start.sh: |
#!/bin/sh
set -o pipefail
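# Run conprof detached in the background; logs go to /tmp/conprof.log and profile data to /tmp/data.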
nohup /usr/bin/conprof all --config.file /tmp/conprof.yaml --log.level=debug --storage.tsdb.path=/tmp/data &>/tmp/conprof.log &
conprof_stop.sh: |
#!/bin/sh
set -o pipefail
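# Stop the background conprof process started by conprof_start.sh.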
pkill conprof

workload.sh: |
#!/bin/sh
set -ox pipefail

if [ "${PPROF_COLLECT}" = "true" ]; then
workload_log "Starting conprof"
bash /tmp/conprof_start.sh
fi

db_aging() {
while true; do
echo "$(date +'%m-%d-%y-%H:%M:%S') $(oc exec prometheus-k8s-0 -n openshift-monitoring -c prometheus -- df |grep /prometheus$)" >> /tmp/pvc_monitor_0.log
@@ -80,13 +103,22 @@
# stop the prometheus load
kill -9 ${loader_pid} ${db_aging_pid}

if [ "${PPROF_COLLECT}" = "true" ]; then
workload_log "Stopping conprof"
bash /tmp/conprof_stop.sh
cp /tmp/conprof.log ${benchmark_results_dir}/conprof.log
cp /tmp/conprof.yaml ${benchmark_results_dir}/conprof.yaml
tar -czvf ${benchmark_results_dir}/conprof.tar.gz /tmp/data/
workload_log "copied conprof tarballs and log"
fi

# test idle
sleep 300
post-run.sh: |
#!/bin/sh
set -ox pipefail

RESULT_DIR="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1/sample1
RESULT_DIR="/var/lib/pbench-agent/$(ls -t /var/lib/pbench-agent/ | grep "pbench-user" | head -1)"/1-default/sample1
echo "Using RESULT_DIR of: \"${RESULT_DIR}\""
oc logs -n openshift-monitoring prometheus-k8s-0 -c prometheus --since=${PROMETHEUS_DURATION}s > ${RESULT_DIR}/oc_logs_1.log
oc logs -n openshift-monitoring prometheus-k8s-1 -c prometheus --since=${PROMETHEUS_DURATION}s > ${RESULT_DIR}/oc_logs_2.log
@@ -463,3 +495,113 @@ data:

def get_dashboards(self):
return self.dashboards
conprof.yaml.template: |
scrape_configs:
- job_name: 'apiserver0'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['apiserver0-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: true
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
- job_name: 'apiserver1'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['apiserver1-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: true
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
- job_name: 'apiserver2'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['apiserver2-openshift-kube-apiserver.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: true
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
- job_name: 'prometheus'
scrape_interval: 30s
scrape_timeout: 10m
scheme: https
tls_config:
insecure_skip_verify: true
static_configs:
- targets: ['prometheus-k8s-openshift-monitoring.apps.{{clustername}}.{{base_domain}}']
bearer_token: {{bearer_token.stdout}}
profiling_config:
pprof_config:
heap:
enabled: true
profile:
enabled: false
goroutine:
enabled: false
threadcreate:
enabled: false
allocs:
enabled: false
block:
enabled: false
mutex:
enabled: false
trace:
enabled: false
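
Each profile enabled above corresponds to a standard Go pprof endpoint on the scrape target. As a sanity check, a single heap profile can be fetched by hand with something like this sketch (the hostname is illustrative and assumes the apiserver0 route exists):

```sh
# Pull one heap profile from the same endpoint conprof scrapes.
TOKEN=$(oc whoami -t)
curl -sk -H "Authorization: Bearer ${TOKEN}" \
  "https://apiserver0-openshift-kube-apiserver.apps.mycluster.example.com/debug/pprof/heap" \
  -o /tmp/heap.pprof
go tool pprof -top /tmp/heap.pprof
```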
3 changes: 3 additions & 0 deletions workloads/vars/prometheus.yml
@@ -29,6 +29,9 @@ pbench_server: "{{ lookup('env', 'PBENCH_SERVER')|default('', true) }}"
scale_ci_results_token: "{{ lookup('env', 'SCALE_CI_RESULTS_TOKEN')|default('', true) }}"
job_completion_poll_attempts: "{{ lookup('env', 'JOB_COMPLETION_POLL_ATTEMPTS')|default(360, true)|int }}"

# pprof variables
pprof_collect: "{{ lookup('env', 'PPROF_COLLECT')|default(false, true)|bool|lower }}"

# Prometheus scale workload specific parameters:
prometheus_scale_test_prefix: "{{ lookup('env', 'PROMETHEUS_SCALE_TEST_PREFIX')|default('prometheus-scale', true) }}"
prometheus_concurrency: "{{ lookup('env', 'PROMETHEUS_CONCURRENCY')|default(10, true)|int }}"