Skip to content

Commit

Permalink
Create rules.yml
Browse files Browse the repository at this point in the history
  • Loading branch information
EarthlingDavey committed Oct 31, 2024
1 parent 688458f commit a301cbc
Showing 1 changed file with 203 additions and 0 deletions.
203 changes: 203 additions & 0 deletions deploy/production/rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
namespace: justice-gov-uk-production
labels:
prometheus: prometheus-operator
role: alert-rules
release: prometheus-operator
name: temp-monitoring-rules-justice-gov-uk-production
spec:
groups:
- name: kubernetes-apps
rules:
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
for: 15m
labels:
severity: justice-gov-uk-alerts
- alert: KubeQuota-Exceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
expr: |-
100 * kube_resourcequota{job="kube-state-metrics", type="used", namespace="justice-gov-uk-production"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard", namespace="justice-gov-uk-production"} > 0)
> 90
for: 15m
labels:
severity: justice-gov-uk-alerts
- alert: KubePodCrashLooping
# pint file/disable alerts/template
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
expr: |-
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace="justice-gov-uk-production"}[15m]) * 60 * 5 > 0
for: 1h
labels:
severity: justice-gov-uk-alerts
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than an hour.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: |-
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown", namespace="justice-gov-uk-production"})
> 0
for: 1h
labels:
severity: justice-gov-uk-alerts
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has
not been rolled back.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
expr: |-
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace="justice-gov-uk-production"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics", namespace="justice-gov-uk-production"}
for: 15m
labels:
severity: justice-gov-uk-alerts

- name: application-rules
rules:

- alert: SlowResponses
annotations:
message: For 3 minutes, more than 1% of requests were slower than 5 seconds.
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#SlowResponse
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s-nginx-ingress-prometheus-ng2/5178fc76-29af-51b9-83a8-cbab85db37a6
?orgId=1&refresh=1m&var-controller_class=All&var-pod=All&var-datasource=default&from=now-1h&to=now
&var-namespace=All&var-ingress=justice-gov-uk-production-ingress-modsec
expr: |-
histogram_quantile(
0.99,
sum by (le, ingress) (
rate(
nginx_ingress_controller_request_duration_seconds_bucket{exported_namespace="justice-gov-uk-production",status!="404",status!="500"}[1m]
)
)
) > 5
for: 3m
labels:
severity: justice-gov-uk-alerts

- alert: SlownessOutage
annotations:
message: For 3 minutes, more than 10% of requests were slower than 7.5 seconds.
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#SlownessOutage
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s-nginx-ingress-prometheus-ng2/5178fc76-29af-51b9-83a8-cbab85db37a6
?orgId=1&refresh=1m&var-controller_class=All&var-pod=All&var-datasource=default&from=now-1h&to=now
&var-namespace=All&var-ingress=justice-gov-uk-production-ingress-modsec
expr: |-
histogram_quantile(
0.9,
sum by (le, ingress) (
rate(
nginx_ingress_controller_request_duration_seconds_bucket{exported_namespace="justice-gov-uk-production",status!="404",status!="500"}[1m]
)
)
) > 7.5
for: 3m
labels:
severity: justice-gov-uk-alerts

- alert: High404Rate
annotations:
message: More than 5% of responses were 404 errors.
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#High404Rate
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s-nginx-ingress-prometheus-ng2/5178fc76-29af-51b9-83a8-cbab85db37a6
?orgId=1&refresh=1m&var-controller_class=All&var-pod=All&var-datasource=default&from=now-1h&to=now
&var-namespace=All&var-ingress=justice-gov-uk-production-ingress-modsec
expr: |-
sum by (ingress, cluster) (rate(nginx_ingress_controller_requests{exported_namespace="justice-gov-uk-production",status="404"}[1m]))
/
sum by (ingress) (rate(nginx_ingress_controller_requests{exported_namespace="justice-gov-uk-production"}[1m]))
> 0.05
for: 3m
labels:
severity: justice-gov-uk-alerts

- alert: High5xxRate
annotations:
message: More than 5% of responses were 5xx errors.
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#High5xxRate
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s-nginx-ingress-prometheus-ng2/5178fc76-29af-51b9-83a8-cbab85db37a6
?orgId=1&refresh=1m&var-controller_class=All&var-pod=All&var-datasource=default&from=now-1h&to=now
&var-namespace=All&var-ingress=justice-gov-uk-production-ingress-modsec
expr: |-
sum by (ingress, cluster) (rate(nginx_ingress_controller_requests{exported_namespace="justice-gov-uk-production",status=~"5.."}[1m]))
/
sum by (ingress) (rate(nginx_ingress_controller_requests{exported_namespace="justice-gov-uk-production"}[1m]))
> 0.05
for: 3m
labels:
severity: justice-gov-uk-alerts

- alert: TrafficSpike
annotations:
message: Unusual levels of traffic - over 2000 request per minute (33rps).
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#TrafficSpike
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s-nginx-ingress-prometheus-ng2/5178fc76-29af-51b9-83a8-cbab85db37a6
?orgId=1&refresh=1m&var-controller_class=All&var-pod=All&var-datasource=default&from=now-1h&to=now
&var-namespace=All&var-ingress=justice-gov-uk-production-ingress-modsec
expr: |-
sum(rate(nginx_ingress_controller_requests{exported_namespace="justice-gov-uk-production"}[5m])) by (ingress)
> 33
for: 3m
labels:
severity: justice-gov-uk-alerts

- alert: MemorySpike
annotations:
message: A container is using > 500 MiB or ram.
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#MemorySpike
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s_views_ns/kubernetes-views-namespaces?orgId=1&var-datasource=prometheus
&var-namespace=intranet-production&var-resolution=30s&from=now-1h&to=now&refresh=30s
expr: |-
sum(container_memory_working_set_bytes{namespace=~"intranet-production", image!=""}) by (pod)
> 500000000
for: 1m
labels:
severity: justice-gov-uk-alerts

- alert: CpuSpike
annotations:
message: A container is using > 0.5 CPU cores.
runbook_url: https://dsdmoj.atlassian.net/wiki/spaces/CDPT/pages/edit-v2/5124292758#CpuSpike
dashboard_url: >-
https://grafana.live.cloud-platform.service.justice.gov.uk
/d/k8s_views_ns/kubernetes-views-namespaces?orgId=1&var-datasource=prometheus
&var-namespace=intranet-production&var-resolution=30s&from=now-1h&to=now&refresh=30s
expr: |-
sum(rate(container_cpu_usage_seconds_total{namespace=~"intranet-production", image!=""}[1m])) by (pod)
> 0.5
for: 1m
labels:
severity: justice-gov-uk-alerts

0 comments on commit a301cbc

Please sign in to comment.