Skip to content

Commit

Permalink
Merge branch 'master' into logging
Browse files Browse the repository at this point in the history
  • Loading branch information
myadla authored Dec 16, 2024
2 parents 5c4bce7 + 468a383 commit a47470b
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 10 deletions.
2 changes: 1 addition & 1 deletion callback_plugins/custom_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def log_summary_results(self, host):
print("The host %s does not have any results" % host)
return

with open(file_path, 'w') as f:
with open(file_path, 'a') as f:
f.write(f"Host: {host}\n")
f.write(f"Tasks Succeeded: {self.results[host]['passed']}\n")
f.write(f"Tasks Failed: {self.results[host]['failed']}\n")
Expand Down
22 changes: 19 additions & 3 deletions roles/test_alerts/tasks/test_create_an_alert.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
labels:
prometheus: default
role: alert-rules
name: prometheus-alarm-rules
name: fvt-testing-prometheus-alarm-rules
namespace: service-telemetry
spec:
groups:
- name: ./openstack.rules
rules:
- alert: Collectd metrics receive rate is zero
- alert: FVT_TESTING Collectd metrics receive rate is zero
expr: rate(sg_total_collectd_msg_received_count[1m]) == 0
EOF
changed_when: false
Expand All @@ -34,5 +34,21 @@
cmd: |
curl -k {{ prom_auth_string }} https://{{ prom_url }}/api/v1/rules
register: cmd_output
changed_when: true

always:
# Cleanup step listed under `always:`: removes the PrometheusRule named
# fvt-testing-prometheus-alarm-rules via `oc delete`.
# The task is reported as changed only when `oc delete` exits with 0.
- name: "Delete the PrometheusRule"
ansible.builtin.command:
cmd: |
oc delete prometheusrule.{{ observability_api }} fvt-testing-prometheus-alarm-rules
register: delete_prom
changed_when: delete_prom.rc == 0

# Poll the Prometheus rules API (12 attempts, 10 s apart) until the test
# alert name no longer appears in the response.
- name: Wait up to two minutes until the rule is deleted
  ansible.builtin.command:
    cmd: |
      curl -k {{ prom_auth_string }} https://{{ prom_url }}/api/v1/rules
  # The result must be registered by THIS task: `until` and `failed_when`
  # previously read `cmd_output`, which was registered by an earlier task,
  # so the loop never looked at the output of its own curl call.
  # A dedicated name avoids clobbering `cmd_output` for any later reader.
  register: rules_after_delete
  retries: 12
  delay: 10
  until: '"FVT_TESTING Collectd metrics receive rate is zero" not in rules_after_delete.stdout'
  changed_when: false
  failed_when: rules_after_delete.rc != 0
Original file line number Diff line number Diff line change
Expand Up @@ -36,31 +36,71 @@
ansible.builtin.debug:
var: alertmanager_secret

- name: "RHELOSP-148697 Interrupt metrics flow by preventing the QDR from running"
- name: "RHELOSP-144965 Create the alert"
ansible.builtin.shell:
cmd: |
for i in {1..15}; do oc delete po -l application=default-interconnect; sleep 1; done
oc apply -f - <<EOF
apiVersion: {{ observability_api }}/v1
kind: PrometheusRule
metadata:
creationTimestamp: null
labels:
prometheus: default
role: alert-rules
name: fvt-testing-prometheus-alarm-rules-alertmanager
namespace: service-telemetry
spec:
groups:
- name: ./openstack.rules
rules:
- alert: FVT_TESTING Collectd metrics receive rate is zero
expr: rate(sg_total_collectd_msg_received_count[1m]) == 0
EOF
changed_when: false
register: cmd_output
failed_when: cmd_output.rc != 0

- name: "RHELOSP-148697 Interrupt metrics flow by preventing the QDR from running"
ansible.builtin.shell:
cmd: |
for i in {1..30}; do oc delete po -l application=default-interconnect; sleep 1; done
changed_when: false

- name: "RHELOSP-148698 Verify that the alert is active in Alertmanager"
ansible.builtin.shell:
cmd: >-
oc exec -it prometheus-default-0 -c prometheus -- /bin/sh -c 'curl -k -H \
"Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
https://default-alertmanager-proxy:9095/api/v1/alerts' | grep 'active' | grep 'Collectd metrics receive rate is zero'
https://default-alertmanager-proxy:9095/api/v1/alerts' | grep 'active' | grep 'FVT_TESTING Collectd metrics receive rate is zero'
register: cmd_output
changed_when: false
failed_when: cmd_output.stdout_lines | length == 0

- name: "RHELOSP-148699 Verify that the alert is firing in Prometheus"
ansible.builtin.shell:
cmd: >-
/usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts | grep 'firing' | grep 'Collectd metrics receive rate is zero'
/usr/bin/curl -k {{ prom_auth_string }} -g https://{{ prom_url }}/api/v1/alerts | grep 'firing' | grep 'FVT_TESTING Collectd metrics receive rate is zero'
register: cmd_output
changed_when: false
failed_when: cmd_output.stdout_lines | length == 0

always:
# Cleanup step listed under `always:`: removes the PrometheusRule named
# fvt-testing-prometheus-alarm-rules-alertmanager via `oc delete`.
# The task is reported as changed only when `oc delete` exits with 0.
- name: "Delete the PrometheusRule"
ansible.builtin.command:
cmd: |
oc delete prometheusrule.{{ observability_api }} fvt-testing-prometheus-alarm-rules-alertmanager
register: delete_prom
changed_when: delete_prom.rc == 0

# Poll the Prometheus rules API (12 attempts, 10 s apart) until the test
# alert name no longer appears in the response.
- name: "Wait up to two minutes until the rule is deleted"
  ansible.builtin.command:
    cmd: |
      curl -k {{ prom_auth_string }} https://{{ prom_url }}/api/v1/rules
  # The result must be registered by THIS task: `until` previously read
  # `cmd_output`, which was registered by an earlier task, so the condition
  # never reflected the output of this curl call.
  # A dedicated name avoids clobbering `cmd_output` for any later reader.
  register: rules_after_delete
  retries: 12
  delay: 10
  until: '"FVT_TESTING Collectd metrics receive rate is zero" not in rules_after_delete.stdout'
  changed_when: false

- name: "Wait 2 minutes to make sure all SG pods are back to normal"
ansible.builtin.pause:
minutes: 2
Expand Down
30 changes: 28 additions & 2 deletions roles/test_verify_email/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
---
# tasks file for roles/test_verify_email

# Include get_prom_info.yml from the client_side_tests role (token auth)
# when the Prometheus connection variables are not already available.
- name: "Set the prom auth"
  ansible.builtin.include_role:
    name: client_side_tests
    tasks_from: get_prom_info.yml
  vars:
    prom_auth_method: token
  # Later tasks in this file use BOTH prom_auth_string and prom_url, so the
  # lookup must run if EITHER is missing. A `when` list is ANDed, which
  # skipped the include when only one of the two was undefined.
  # NOTE(review): assumes get_prom_info.yml sets both variables - confirm.
  when: prom_auth_string is not defined or prom_url is not defined

- name: "Get the observability strategy and set observability_api"
ansible.builtin.include_role:
name: test_alerts
Expand All @@ -18,13 +29,13 @@
labels:
prometheus: default
role: alert-rules
name: prometheus-alarm-rules
name: fvt-testing-prometheus-alarm-rules-email
namespace: service-telemetry
spec:
groups:
- name: ./openstack.rules
rules:
- alert: Collectd metrics receive rate is zero
- alert: FVT_TESTING Collectd metrics receive rate is zero
expr: rate(sg_total_collectd_msg_received_count[1m]) == 0
EOF
changed_when: false
Expand Down Expand Up @@ -54,6 +65,21 @@
changed_when: false

always:
# Cleanup step listed under `always:`: removes the PrometheusRule named
# fvt-testing-prometheus-alarm-rules-email via `oc delete`.
# Uses the explicit `cmd:` argument, as the other command tasks shown in
# this change do, instead of the free-form module string.
- name: "Delete the PrometheusRule"
  ansible.builtin.command:
    cmd: |
      oc delete prometheusrule.{{ observability_api }} fvt-testing-prometheus-alarm-rules-email
  register: delete_prom
  # Reported as changed only when `oc delete` exits with 0.
  changed_when: delete_prom.rc == 0

# Poll the Prometheus rules API (12 attempts, 10 s apart) until the test
# alert name no longer appears in the response.
- name: "Wait up to two minutes until the rule is deleted"
  ansible.builtin.command:
    cmd: |
      curl -k {{ prom_auth_string }} https://{{ prom_url }}/api/v1/rules
  # The result must be registered by THIS task: `until` previously read
  # `cmd_output`, which this task never registered, so the condition was
  # evaluated against whatever an earlier task had stored there.
  # A dedicated name avoids clobbering `cmd_output` for any later reader.
  register: rules_after_delete
  retries: 12
  delay: 10
  until: '"FVT_TESTING Collectd metrics receive rate is zero" not in rules_after_delete.stdout'
  changed_when: false

- name: "RHELOSP-176046 Remove alertmanagerConfigManifest from the ServiceTelemetry object"
ansible.builtin.shell:
cmd: |
Expand Down

0 comments on commit a47470b

Please sign in to comment.