From fd2392477b87fc75ae24c04b74c8fcc709289b48 Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 09:32:37 +0100 Subject: [PATCH 1/9] feat: remove old ruby script --- runbooks/bin/mean-time-to-repair.rb | 135 ---------------------------- 1 file changed, 135 deletions(-) delete mode 100755 runbooks/bin/mean-time-to-repair.rb diff --git a/runbooks/bin/mean-time-to-repair.rb b/runbooks/bin/mean-time-to-repair.rb deleted file mode 100755 index 2da1c15e..00000000 --- a/runbooks/bin/mean-time-to-repair.rb +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env ruby - -# Consume the incident log file and output incident performance metrics -# -# Usage (from inside the `runbooks/source` directory): -# -# ../bin/mean-time-to-repair.rb -# - -require "date" - -INCIDENT_LOG = "incident-log.html.md.erb" - -QUARTER_REGEX = %r[^## Q\d \d\d\d\d ] -INCIDENT_REGEX = %r[^### Incident on ] -TIME_TO_REPAIR_REGEX = %r[- ..Time to repair..: (\d+h \d+m)] -TIME_TO_RESOLVE_REGEX = %r[- ..Time to resolve..: (\d+h \d+m)] - -def main - puts parse_incident_log - .map { |quarter, incidents| Quarter.new(quarter, incidents) } - .sort { |a, b| a.title <=> b.title } - .map(&:report) -end - -# Turn the incident log into a hash of: -# -# { [quarter label] => [ list of incidents in quarter ], ... } -# -# ...where each 'incident' is a hash: { time_to_repair: "Xh Ym", time_to_resolve: "Xh Ym" } -# -def parse_incident_log - data = {} - current_quarter = nil - current_incident = nil - - IO.foreach(INCIDENT_LOG) do |l| - line = l.chomp - - case line - when QUARTER_REGEX - if current_quarter # i.e. this isn't the first quarter marker in the file - # We just reached the next quarter, so add the current incident to the current quarter - data[current_quarter].push(current_incident) unless current_incident.nil? - end - - # initialise a new 'quarter' hash - current_quarter = line - data[current_quarter] = [] - when INCIDENT_REGEX - # We reached an incident marker. Finish off this incident and start a new hash. - data[current_quarter].push(current_incident) unless current_incident.nil? - current_incident = {} - when TIME_TO_REPAIR_REGEX - # We've found the time_to_repair line for this incident - m = TIME_TO_REPAIR_REGEX.match(line) - current_incident[:time_to_repair] = m[1] - when TIME_TO_RESOLVE_REGEX - # We've found the time_to_resolve line for this incident - m = TIME_TO_RESOLVE_REGEX.match(line) - current_incident[:time_to_resolve] = m[1] - end - end - - # Ensure we handle the last incident in the file - data[current_quarter].push(current_incident) unless current_incident == {} - - data -end - -class Quarter - attr_reader :title, :incidents - - def initialize(title, incidents) - @title = title - @incidents = incidents - .reject { |i| i == {} } - .map { |i| Incident.new(i) } - end - - def report - <<~EOF - #{title} - Incidents: #{incidents.length} - Mean time to repair: #{mean_time_to_repair} - Mean time to resolve: #{mean_time_to_resolve} - EOF - end - - private - - def mean_time_to_repair - sum = incidents.map(&:time_to_repair).sum - hours_and_minutes(sum / incidents.length) - end - - def mean_time_to_resolve - sum = incidents.map(&:time_to_resolve).sum - hours_and_minutes(sum / incidents.length) - end - - def hours_and_minutes(seconds) - hours = seconds / 3600 - seconds = seconds % 3600 - minutes = seconds / 60 - "#{hours}h #{minutes}m" - end -end - - -class Incident - def initialize(params) - @time_to_repair = params[:time_to_repair] - @time_to_resolve = params[:time_to_resolve] - end - - def time_to_repair - to_seconds(@time_to_repair || @time_to_resolve) # In case one is missing - end - - def time_to_resolve - to_seconds(@time_to_resolve || @time_to_repair) # In case one is missing - end - - private - - # "1h 5m" => 3900 (seconds) - def to_seconds(str) - hours, minutes = str.split(" ").map(&:to_i) - (hours * 3600) + (minutes * 60) - end -end - -main - From 57c88dec5566e14c9009d1c353d78525369aaa77 Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 09:32:55 +0100 Subject: [PATCH 2/9] feat: add new go mean time to repair script --- cmd/mean-time-to-repair/go.mod | 3 + cmd/mean-time-to-repair/main.go | 134 ++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 cmd/mean-time-to-repair/go.mod create mode 100644 cmd/mean-time-to-repair/main.go diff --git a/cmd/mean-time-to-repair/go.mod b/cmd/mean-time-to-repair/go.mod new file mode 100644 index 00000000..564bcf05 --- /dev/null +++ b/cmd/mean-time-to-repair/go.mod @@ -0,0 +1,3 @@ +module ministryofjustice/cloud-platform/cmd/mean-time-to-repair + +go 1.23.1 diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go new file mode 100644 index 00000000..78c36a04 --- /dev/null +++ b/cmd/mean-time-to-repair/main.go @@ -0,0 +1,134 @@ +package main + +import ( + "bufio" + "fmt" + "log" + "os" + "regexp" + "strconv" + "strings" + "time" +) + +func readLines(path string) ([]string, error) { + file, err := os.Open(path) + if err != nil { + return nil, err + } + defer file.Close() + + var lines []string + scanner := bufio.NewScanner(file) + for scanner.Scan() { + lines = append(lines, scanner.Text()) + } + return lines, scanner.Err() +} + +func convertToRaw(data string) int { + s := strings.Split(data, " ") + total := 0 + + hours := 0 + minutes := 0 + + for _, time := range s { + if strings.Contains(time, "h") { + time = strings.Replace(time, "h", "", -1) + i, err := strconv.Atoi(time) + if err != nil { + panic(err) + } + hours = convertHoursToMinutes(i) + } + if strings.Contains(time, "m") { + time = strings.Replace(time, "m", "", -1) + i, err := strconv.Atoi(time) + if err != nil { + panic(err) + } + minutes = i + } + } + total = hours + minutes + + return total +} + +func convertHoursToMinutes(i int) int { + i = i * 60 + return i +} + +func main() { + lines, err := readLines("../../runbooks/source/incident-log.html.md.erb") + if err != nil { + log.Fatalf("readLines: %s", err) + } + + var data []string + + var str strings.Builder + + for _, line := range lines { + str.WriteString(line) + + re := regexp.MustCompile(`---`) + match := re.FindString(line) + + if match != "" { + data = append(data, str.String()) + str.Reset() + } + } + + for _, newline := range data { + reTitle := regexp.MustCompile(`(?U)## .\d \d* \(.*\)`) + + title := reTitle.FindString((newline)) + + if title != "" { + fmt.Printf("%s\n", title) + } + + re := regexp.MustCompile(`\*\*Time to repair\*\*: (\d*. \d*.|\d*.)`) + timeToRepair := 0 + count := 0 + for _, regmatch := range re.FindAllString(newline, -1) { + t := strings.Replace(regmatch, "**Time to repair**: ", "", -1) + timeToRepairTemp := convertToRaw(t) + timeToRepair = timeToRepair + timeToRepairTemp + count = count + 1 + } + + re2 := regexp.MustCompile(`\*\*Time to resolve\*\*: (\d*. \d*.|\d*.)`) + timeToResolve := 0 + resolveCount := 0 + for _, resolveMatch := range re2.FindAllString(newline, -1) { + t := strings.Replace(resolveMatch, "**Time to resolve**: ", "", -1) + timeToResolveTemp := convertToRaw(t) + timeToResolve = timeToResolve + timeToResolveTemp + resolveCount = resolveCount + 1 + } + + if count != 0 { + meanTimeToRepair := timeToRepair / count + d := time.Duration(meanTimeToRepair) * time.Minute + hours := int(d.Hours()) + minutes := int(d.Minutes()) % 60 + fmt.Printf("Incidents:%2d\n", count) + fmt.Printf("Mean time to repair: %2dh %02dm\n", hours, minutes) + } + + if resolveCount != 0 { + meantTimeToResolve := timeToResolve / resolveCount + d := time.Duration(meantTimeToResolve) * time.Minute + hours := int(d.Hours()) + minutes := int(d.Minutes()) % 60 + fmt.Printf("Mean time to resolve: %2dh %02dm", hours, minutes) + fmt.Println("\n") + } + + } +} From 8db70c812f94d20c325c1c52945b518d468cf38b Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 09:33:08 +0100 Subject: [PATCH 3/9] feat: add formatting and update mean times --- runbooks/source/incident-log.html.md.erb | 39 +++++++++++++++++------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/runbooks/source/incident-log.html.md.erb b/runbooks/source/incident-log.html.md.erb index c4aab251..8abbb67f 100644 --- a/runbooks/source/incident-log.html.md.erb +++ b/runbooks/source/incident-log.html.md.erb @@ -5,12 +5,13 @@ weight: 45 # Incident Log -> Use the [mean-time-to-repair.rb] script to view performance metrics +> Use the [mean-time-to-repair] go script to view performance metrics +--- ## Q3 2024 (July-September) -- **Mean Time to Repair**: 3h 8m -- **Mean Time to Resolve**: 4h 9m +- **Mean Time to Repair**: 1h 39m +- **Mean Time to Resolve**: 2h 14m ### Incident on 2024-09-20 - EKS Subnet Route Table Associations destroyed @@ -92,6 +93,7 @@ weight: 45 - [Re-introduce Opensearch in to Live logging](https://github.com/ministryofjustice/cloud-platform/issues/5929) - [Investigate fluent-bit "failed to flush chunk"](https://github.com/ministryofjustice/cloud-platform/issues/5930) +--- ## Q1 2024 (January-April) - **Mean Time to Repair**: 3h 21m @@ -138,6 +140,7 @@ weight: 45 - Look into creating a blue-green prometheus to have live like setup to test changes before applying to live - Spike into Amazon Managed Prometheus +--- ## Q4 2023 (October-December) - **Mean Time to Repair**: 35h 36m @@ -181,6 +184,7 @@ weight: 45 - Any other services which is probing prometheus that triggers the restart - Is taking regular velero backups distrub the ebs read/write and cause the restart +--- ## Q3 2023 (July-September) - **Mean Time to Repair**: 10h 55m @@ -340,6 +344,7 @@ weight: 45 - **Review actions**: - Add a test/check to ensure the IP address allocation is working as expected [#4669](https://github.com/ministryofjustice/cloud-platform/issues/4669) +--- ## Q2 2023 (April-June) - **Mean Time to Repair**: 0h 55m @@ -380,9 +385,10 @@ weight: 45 - **Review actions**: - Add a runbook for the steps to perform when changing the node instance type +--- ## Q1 2023 (January-March) -- **Mean Time to Repair**: 225h 11m +- **Mean Time to Repair**: 225h 10m - **Mean Time to Resolve**: 225h 28m @@ -534,6 +540,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Look into a Terraform resource for CircleCI - Use IRSA instead of AWS Keys +--- ## Q4 2022 (October-December) - **Mean Time to Repair**: 27m @@ -582,6 +589,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - AWS advise received - ticket raised to investigate potential solutions: [Implementation of notification of Scheduled Instance Retirements - to Slack. Investigate 2 potential AWS solutions#4264](https://app.zenhub.com/workspaces/cloud-platform-team-5ccb0b8a81f66118c983c189/issues/ministryofjustice/cloud-platform/4264). +--- ## Q3 2022 (July-September) - **Mean Time to Repair**: 6h 27m @@ -619,11 +627,12 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - **Review actions**: - Mitigaton tickets raised following a post-incident review: https://github.com/ministryofjustice/cloud-platform/issues?q=is%3Aissue+is%3Aopen+post-aws-incident +--- ## Q1 2022 (January to March) -- **Mean Time to Repair**: 1h 39m +- **Mean Time to Repair**: 1h 05m -- **Mean Time to Resolve**: 1h 59m +- **Mean Time to Resolve**: 1h 24m ### Incident on 2022-03-10 11:48 - All ingress resources using *.apps.live.cloud-platform urls showing certificate issue @@ -694,6 +703,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Create a runbook to handle ErrorsInExternalDNS alarm [#3501](https://github.com/ministryofjustice/cloud-platform/issues/3501) - Assign someone to be the 'hammer' on Fridays +--- ## Q4 2021 (October to December) - **Mean Time to Repair**: 1h 17m @@ -726,6 +736,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - **Review actions**: - N/A +--- ## Q3 2021 (July-September) - **Mean Time to Repair**: 3h 28m @@ -833,6 +844,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Created a [ticket](https://github.com/ministryofjustice/cloud-platform/issues/3083) to set terraform kubernetes credentials dynamically (at executing time) - Fix the pipeline: Before the creation of Terraform resources, add a function in the cli to perform a `kubectl context` switch to the correct cluster. PR exists +--- ## Q2 2021 (April-June) - **Mean Time to Repair**: 2h 32m @@ -902,6 +914,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Disband the cloud-platform-concourse repository. This includes Service accounts, and pipelines. We should split this repository up and move it to the infra/terraform-concourse repos. Ticket [#3017](https://github.com/ministryofjustice/cloud-platform/issues/3017) - Manager needs to use our PSPs instead of eks-privilege - this has already been done. +--- ## Q1 2021 (January - March) - **Mean Time to Repair**: N/A @@ -910,6 +923,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ ### No incidents declared +--- ## Q4 2020 (October - December) - **Mean Time to Repair**: 2h 8m @@ -942,11 +956,12 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - **Resolution**: - Migrate all ingresses back to the default ingress controller +--- ## Q3 2020 (July - September) -- **Mean Time To Repair**: 1h 9m +- **Mean Time To Repair**: 59m -- **Mean Time To Resolve**: 7h 26m +- **Mean Time To Resolve**: 7h 13m ### Incident on 2020-09-28 13:10 - Termination of nodes updating kops Instance Group. @@ -1123,11 +1138,12 @@ ttps://docs.google.com/document/d/1kxKwC1B_pnlPbysS0zotbXMKyZcUDmDtnGbEyIHGvgQ/e - We replaced all our master nodes with c5.4xlarge instances, which (currently) have better availability - We and AWS are still investigating longer-term and more reliable fixes +--- ## Q2 2020 (April - June) -- **Mean Time To Repair**: 2h 5m +- **Mean Time To Repair**: 2h 49m -- **Mean Time To Resolve**: 15h 53m +- **Mean Time To Resolve**: 7h 12m ### Incident on 2020-08-04 17:13 @@ -1190,6 +1206,7 @@ ttps://docs.google.com/document/d/1kxKwC1B_pnlPbysS0zotbXMKyZcUDmDtnGbEyIHGvgQ/e - **Resolution**: The Nginx configuration was modified to enable TLSv1, TLSv1.1 and TLSv1.2 +--- ## Q1 2020 (January - March) - **Mean Time To Repair**: 1h 22m @@ -1318,4 +1335,4 @@ Datestamps: please use `YYYY-MM-DD HH:MM` (almost ISO 8601, but more readable), - **Review actions**: - - [mean-time-to-repair.rb]: https://github.com/ministryofjustice/cloud-platform/blob/main/runbooks/bin/mean-time-to-repair.rb + [mean-time-to-repair.rb]: https://github.com/ministryofjustice/cloud-platform/blob/main/cmd/mean-time-to-repair From 702bda4f502ab97ec54dca3eec3fea00f7d05c31 Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 10:08:34 +0100 Subject: [PATCH 4/9] refactor: move variable to top of func --- cmd/mean-time-to-repair/main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go index 78c36a04..72832450 100644 --- a/cmd/mean-time-to-repair/main.go +++ b/cmd/mean-time-to-repair/main.go @@ -12,13 +12,14 @@ import ( ) func readLines(path string) ([]string, error) { + var lines []string + file, err := os.Open(path) if err != nil { return nil, err } defer file.Close() - var lines []string scanner := bufio.NewScanner(file) for scanner.Scan() { lines = append(lines, scanner.Text()) From 8097f61c4a02c31b466c366413b411f3088c7b75 Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 10:10:06 +0100 Subject: [PATCH 5/9] refactor: return log fatal with message --- cmd/mean-time-to-repair/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go index 72832450..d9d4d9ed 100644 --- a/cmd/mean-time-to-repair/main.go +++ b/cmd/mean-time-to-repair/main.go @@ -39,7 +39,7 @@ func convertToRaw(data string) int { time = strings.Replace(time, "h", "", -1) i, err := strconv.Atoi(time) if err != nil { - panic(err) + log.Fatalf("contains h: %s", err) } hours = convertHoursToMinutes(i) } @@ -47,7 +47,7 @@ func convertToRaw(data string) int { time = strings.Replace(time, "m", "", -1) i, err := strconv.Atoi(time) if err != nil { - panic(err) + log.Fatalf("contains m: %s", err) } minutes = i } From 588e830849e1e9d8c50835275514dd41e1b54958 Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 10:10:59 +0100 Subject: [PATCH 6/9] refactor: remove unneeded variable --- cmd/mean-time-to-repair/main.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go index d9d4d9ed..a1735f04 100644 --- a/cmd/mean-time-to-repair/main.go +++ b/cmd/mean-time-to-repair/main.go @@ -29,8 +29,6 @@ func readLines(path string) ([]string, error) { func convertToRaw(data string) int { s := strings.Split(data, " ") - total := 0 - hours := 0 minutes := 0 @@ -52,9 +50,8 @@ func convertToRaw(data string) int { minutes = i } } - total = hours + minutes - return total + return hours + minutes } func convertHoursToMinutes(i int) int { From c91870616eddb2c44f63d6a8d7f9f68f3d3dc77f Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 10:11:33 +0100 Subject: [PATCH 7/9] refactor: remove unneeded variable --- cmd/mean-time-to-repair/main.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go index a1735f04..97ae4642 100644 --- a/cmd/mean-time-to-repair/main.go +++ b/cmd/mean-time-to-repair/main.go @@ -55,8 +55,7 @@ func convertToRaw(data string) int { } func convertHoursToMinutes(i int) int { - i = i * 60 - return i + return i * 60 } func main() { From 9e5f52d829a556cd7156c22ae4df5066721e90ee Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 10:13:22 +0100 Subject: [PATCH 8/9] refactor: add vars to top --- cmd/mean-time-to-repair/main.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go index 97ae4642..5db94167 100644 --- a/cmd/mean-time-to-repair/main.go +++ b/cmd/mean-time-to-repair/main.go @@ -59,15 +59,14 @@ func convertHoursToMinutes(i int) int { } func main() { + var data []string + var str strings.Builder + lines, err := readLines("../../runbooks/source/incident-log.html.md.erb") if err != nil { log.Fatalf("readLines: %s", err) } - var data []string - - var str strings.Builder - for _, line := range lines { str.WriteString(line) From 047496b29ef7d76187c8499ddf004d16831c1b69 Mon Sep 17 00:00:00 2001 From: Mike Bell Date: Wed, 9 Oct 2024 10:14:31 +0100 Subject: [PATCH 9/9] refactor: shorten count statements --- cmd/mean-time-to-repair/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go index 5db94167..b7b2f12d 100644 --- a/cmd/mean-time-to-repair/main.go +++ b/cmd/mean-time-to-repair/main.go @@ -95,7 +95,7 @@ func main() { t := strings.Replace(regmatch, "**Time to repair**: ", "", -1) timeToRepairTemp := convertToRaw(t) timeToRepair = timeToRepair + timeToRepairTemp - count = count + 1 + count += 1 } re2 := regexp.MustCompile(`\*\*Time to resolve\*\*: (\d*. \d*.|\d*.)`) @@ -105,7 +105,7 @@ func main() { t := strings.Replace(resolveMatch, "**Time to resolve**: ", "", -1) timeToResolveTemp := convertToRaw(t) timeToResolve = timeToResolve + timeToResolveTemp - resolveCount = resolveCount + 1 + resolveCount += 1 } if count != 0 {