diff --git a/cmd/mean-time-to-repair/go.mod b/cmd/mean-time-to-repair/go.mod
new file mode 100644
index 00000000..564bcf05
--- /dev/null
+++ b/cmd/mean-time-to-repair/go.mod
@@ -0,0 +1,3 @@
+module ministryofjustice/cloud-platform/cmd/mean-time-to-repair
+
+go 1.23.1
diff --git a/cmd/mean-time-to-repair/main.go b/cmd/mean-time-to-repair/main.go
new file mode 100644
index 00000000..b7b2f12d
--- /dev/null
+++ b/cmd/mean-time-to-repair/main.go
@@ -0,0 +1,183 @@
+// Command mean-time-to-repair reads the Cloud Platform incident log and
+// prints, for every quarter, the number of incidents plus the mean time to
+// repair and the mean time to resolve.
+//
+// Usage (from inside the cmd/mean-time-to-repair directory):
+//
+//	go run . [path/to/incident-log.html.md.erb]
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+const (
+	// defaultIncidentLog is where the incident log lives relative to this
+	// command's directory; it is used when no path is given as an argument.
+	defaultIncidentLog = "../../runbooks/source/incident-log.html.md.erb"
+
+	// sectionSeparator is the line that closes one section of the log (front
+	// matter, preamble or quarter) and opens the next.
+	sectionSeparator = "---"
+)
+
+// The patterns are compiled once at start-up instead of once per section.
+var (
+	// titleRe matches a quarter heading such as "## Q3 2024 (July-September)".
+	titleRe = regexp.MustCompile(`(?m)^## .\d \d* \(.*?\)`)
+
+	// repairRe and resolveRe capture the rest of the line after the
+	// per-incident label.
+	repairRe  = regexp.MustCompile(`\*\*Time to repair\*\*:[ \t]*(.*)`)
+	resolveRe = regexp.MustCompile(`\*\*Time to resolve\*\*:[ \t]*(.*)`)
+
+	// durationRe matches "Xh Ym", "Xh" or "Ym" at the start of a value.
+	durationRe = regexp.MustCompile(`^(?:(\d+)h)?[ \t]*(?:(\d+)m)?`)
+)
+
+// readLines returns the lines of the file at path without line terminators.
+func readLines(path string) ([]string, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer file.Close()
+
+	var lines []string
+	scanner := bufio.NewScanner(file)
+	// Incident write-ups can contain very long lines; allow up to 1 MiB
+	// instead of bufio.Scanner's 64 KiB default.
+	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
+	for scanner.Scan() {
+		lines = append(lines, scanner.Text())
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return lines, nil
+}
+
+// parseDuration converts a value such as "1h 39m", "3h" or "27m" to minutes.
+// Anything after the duration is ignored. ok is false when value does not
+// start with a duration (for example "N/A").
+func parseDuration(value string) (minutes int, ok bool) {
+	m := durationRe.FindStringSubmatch(strings.TrimSpace(value))
+	if m == nil || (m[1] == "" && m[2] == "") {
+		return 0, false
+	}
+	if m[1] != "" {
+		hours, err := strconv.Atoi(m[1])
+		if err != nil {
+			return 0, false
+		}
+		minutes += convertHoursToMinutes(hours)
+	}
+	if m[2] != "" {
+		mins, err := strconv.Atoi(m[2])
+		if err != nil {
+			return 0, false
+		}
+		minutes += mins
+	}
+	return minutes, true
+}
+
+// convertToRaw returns the number of minutes in a duration such as "1h 39m".
+// Input that is not a recognisable duration yields 0; callers that need to
+// tell the two apart should use parseDuration.
+func convertToRaw(data string) int {
+	minutes, _ := parseDuration(data)
+	return minutes
+}
+
+// convertHoursToMinutes converts a whole number of hours to minutes.
+func convertHoursToMinutes(i int) int {
+	return i * 60
+}
+
+// splitSections joins lines into sections delimited by separator lines. The
+// text after the last separator is returned too, so the final quarter is
+// reported even though nothing follows it.
+func splitSections(lines []string) []string {
+	var sections []string
+	var current strings.Builder
+
+	for _, line := range lines {
+		if strings.TrimSpace(line) == sectionSeparator {
+			sections = append(sections, current.String())
+			current.Reset()
+			continue
+		}
+		current.WriteString(line)
+		// Keep the line break so patterns cannot match across lines.
+		current.WriteByte('\n')
+	}
+	if current.Len() > 0 {
+		sections = append(sections, current.String())
+	}
+	return sections
+}
+
+// meanMinutes finds every duration labelled by re in section and returns
+// their integer mean in minutes and how many were found. Values that are not
+// durations are logged and left out of the mean.
+func meanMinutes(re *regexp.Regexp, section string) (mean, count int) {
+	total := 0
+	for _, m := range re.FindAllStringSubmatch(section, -1) {
+		minutes, ok := parseDuration(m[1])
+		if !ok {
+			log.Printf("skipping unrecognised duration %q", m[1])
+			continue
+		}
+		total += minutes
+		count++
+	}
+	if count == 0 {
+		return 0, 0
+	}
+	return total / count, count
+}
+
+// report renders the summary for one section: the quarter heading when there
+// is one, followed by the incident count and the means that could be
+// computed. It returns "" for sections with neither.
+func report(section string) string {
+	var out strings.Builder
+
+	if title := titleRe.FindString(section); title != "" {
+		fmt.Fprintf(&out, "%s\n", title)
+	}
+	if mean, count := meanMinutes(repairRe, section); count != 0 {
+		fmt.Fprintf(&out, "Incidents:%2d\n", count)
+		fmt.Fprintf(&out, "Mean time to repair: %2dh %02dm\n", mean/60, mean%60)
+	}
+	if mean, count := meanMinutes(resolveRe, section); count != 0 {
+		// The trailing blank line separates one quarter from the next.
+		fmt.Fprintf(&out, "Mean time to resolve: %2dh %02dm\n\n", mean/60, mean%60)
+	}
+	return out.String()
+}
+
+// main reports on the incident log named by the first argument, or on
+// defaultIncidentLog when no argument is given.
+func main() {
+	path := defaultIncidentLog
+	if len(os.Args) > 1 {
+		path = os.Args[1]
+	}
+
+	lines, err := readLines(path)
+	if err != nil {
+		log.Fatalf("readLines: %s", err)
+	}
+
+	for _, section := range splitSections(lines) {
+		fmt.Print(report(section))
+	}
+}
diff --git a/runbooks/bin/mean-time-to-repair.rb b/runbooks/bin/mean-time-to-repair.rb
deleted file mode 100755
index 2da1c15e..00000000
--- a/runbooks/bin/mean-time-to-repair.rb
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env ruby
-
-# Consume the incident log file and output incident performance metrics
-#
-# Usage (from inside the `runbooks/source` directory):
-#
-# ../bin/mean-time-to-repair.rb
-#
-
-require "date"
-
-INCIDENT_LOG = "incident-log.html.md.erb"
-
-QUARTER_REGEX = %r[^## Q\d \d\d\d\d ]
-INCIDENT_REGEX = %r[^### Incident on ]
-TIME_TO_REPAIR_REGEX = %r[- ..Time to repair..: (\d+h \d+m)]
-TIME_TO_RESOLVE_REGEX = 
%r[- ..Time to resolve..: (\d+h \d+m)] - -def main - puts parse_incident_log - .map { |quarter, incidents| Quarter.new(quarter, incidents) } - .sort { |a, b| a.title <=> b.title } - .map(&:report) -end - -# Turn the incident log into a hash of: -# -# { [quarter label] => [ list of incidents in quarter ], ... } -# -# ...where each 'incident' is a hash: { time_to_repair: "Xh Ym", time_to_resolve: "Xh Ym" } -# -def parse_incident_log - data = {} - current_quarter = nil - current_incident = nil - - IO.foreach(INCIDENT_LOG) do |l| - line = l.chomp - - case line - when QUARTER_REGEX - if current_quarter # i.e. this isn't the first quarter marker in the file - # We just reached the next quarter, so add the current incident to the current quarter - data[current_quarter].push(current_incident) unless current_incident.nil? - end - - # initialise a new 'quarter' hash - current_quarter = line - data[current_quarter] = [] - when INCIDENT_REGEX - # We reached an incident marker. Finish off this incident and start a new hash. - data[current_quarter].push(current_incident) unless current_incident.nil? 
- current_incident = {} - when TIME_TO_REPAIR_REGEX - # We've found the time_to_repair line for this incident - m = TIME_TO_REPAIR_REGEX.match(line) - current_incident[:time_to_repair] = m[1] - when TIME_TO_RESOLVE_REGEX - # We've found the time_to_resolve line for this incident - m = TIME_TO_RESOLVE_REGEX.match(line) - current_incident[:time_to_resolve] = m[1] - end - end - - # Ensure we handle the last incident in the file - data[current_quarter].push(current_incident) unless current_incident == {} - - data -end - -class Quarter - attr_reader :title, :incidents - - def initialize(title, incidents) - @title = title - @incidents = incidents - .reject { |i| i == {} } - .map { |i| Incident.new(i) } - end - - def report - <<~EOF - #{title} - Incidents: #{incidents.length} - Mean time to repair: #{mean_time_to_repair} - Mean time to resolve: #{mean_time_to_resolve} - EOF - end - - private - - def mean_time_to_repair - sum = incidents.map(&:time_to_repair).sum - hours_and_minutes(sum / incidents.length) - end - - def mean_time_to_resolve - sum = incidents.map(&:time_to_resolve).sum - hours_and_minutes(sum / incidents.length) - end - - def hours_and_minutes(seconds) - hours = seconds / 3600 - seconds = seconds % 3600 - minutes = seconds / 60 - "#{hours}h #{minutes}m" - end -end - - -class Incident - def initialize(params) - @time_to_repair = params[:time_to_repair] - @time_to_resolve = params[:time_to_resolve] - end - - def time_to_repair - to_seconds(@time_to_repair || @time_to_resolve) # In case one is missing - end - - def time_to_resolve - to_seconds(@time_to_resolve || @time_to_repair) # In case one is missing - end - - private - - # "1h 5m" => 3900 (seconds) - def to_seconds(str) - hours, minutes = str.split(" ").map(&:to_i) - (hours * 3600) + (minutes * 60) - end -end - -main - diff --git a/runbooks/source/incident-log.html.md.erb b/runbooks/source/incident-log.html.md.erb index c4aab251..8abbb67f 100644 --- a/runbooks/source/incident-log.html.md.erb +++ 
b/runbooks/source/incident-log.html.md.erb @@ -5,12 +5,13 @@ weight: 45 # Incident Log -> Use the [mean-time-to-repair.rb] script to view performance metrics +> Use the [mean-time-to-repair] go script to view performance metrics +--- ## Q3 2024 (July-September) -- **Mean Time to Repair**: 3h 8m -- **Mean Time to Resolve**: 4h 9m +- **Mean Time to Repair**: 1h 39m +- **Mean Time to Resolve**: 2h 14m ### Incident on 2024-09-20 - EKS Subnet Route Table Associations destroyed @@ -92,6 +93,7 @@ weight: 45 - [Re-introduce Opensearch in to Live logging](https://github.com/ministryofjustice/cloud-platform/issues/5929) - [Investigate fluent-bit "failed to flush chunk"](https://github.com/ministryofjustice/cloud-platform/issues/5930) +--- ## Q1 2024 (January-April) - **Mean Time to Repair**: 3h 21m @@ -138,6 +140,7 @@ weight: 45 - Look into creating a blue-green prometheus to have live like setup to test changes before applying to live - Spike into Amazon Managed Prometheus +--- ## Q4 2023 (October-December) - **Mean Time to Repair**: 35h 36m @@ -181,6 +184,7 @@ weight: 45 - Any other services which is probing prometheus that triggers the restart - Is taking regular velero backups distrub the ebs read/write and cause the restart +--- ## Q3 2023 (July-September) - **Mean Time to Repair**: 10h 55m @@ -340,6 +344,7 @@ weight: 45 - **Review actions**: - Add a test/check to ensure the IP address allocation is working as expected [#4669](https://github.com/ministryofjustice/cloud-platform/issues/4669) +--- ## Q2 2023 (April-June) - **Mean Time to Repair**: 0h 55m @@ -380,9 +385,10 @@ weight: 45 - **Review actions**: - Add a runbook for the steps to perform when changing the node instance type +--- ## Q1 2023 (January-March) -- **Mean Time to Repair**: 225h 11m +- **Mean Time to Repair**: 225h 10m - **Mean Time to Resolve**: 225h 28m @@ -534,6 +540,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Look into a Terraform resource for CircleCI - 
Use IRSA instead of AWS Keys +--- ## Q4 2022 (October-December) - **Mean Time to Repair**: 27m @@ -582,6 +589,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - AWS advise received - ticket raised to investigate potential solutions: [Implementation of notification of Scheduled Instance Retirements - to Slack. Investigate 2 potential AWS solutions#4264](https://app.zenhub.com/workspaces/cloud-platform-team-5ccb0b8a81f66118c983c189/issues/ministryofjustice/cloud-platform/4264). +--- ## Q3 2022 (July-September) - **Mean Time to Repair**: 6h 27m @@ -619,11 +627,12 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - **Review actions**: - Mitigaton tickets raised following a post-incident review: https://github.com/ministryofjustice/cloud-platform/issues?q=is%3Aissue+is%3Aopen+post-aws-incident +--- ## Q1 2022 (January to March) -- **Mean Time to Repair**: 1h 39m +- **Mean Time to Repair**: 1h 05m -- **Mean Time to Resolve**: 1h 59m +- **Mean Time to Resolve**: 1h 24m ### Incident on 2022-03-10 11:48 - All ingress resources using *.apps.live.cloud-platform urls showing certificate issue @@ -694,6 +703,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Create a runbook to handle ErrorsInExternalDNS alarm [#3501](https://github.com/ministryofjustice/cloud-platform/issues/3501) - Assign someone to be the 'hammer' on Fridays +--- ## Q4 2021 (October to December) - **Mean Time to Repair**: 1h 17m @@ -726,6 +736,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - **Review actions**: - N/A +--- ## Q3 2021 (July-September) - **Mean Time to Repair**: 3h 28m @@ -833,6 +844,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Created a [ticket](https://github.com/ministryofjustice/cloud-platform/issues/3083) to set terraform kubernetes credentials dynamically (at executing time) - Fix the pipeline: Before 
the creation of Terraform resources, add a function in the cli to perform a `kubectl context` switch to the correct cluster. PR exists +--- ## Q2 2021 (April-June) - **Mean Time to Repair**: 2h 32m @@ -902,6 +914,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - Disband the cloud-platform-concourse repository. This includes Service accounts, and pipelines. We should split this repository up and move it to the infra/terraform-concourse repos. Ticket [#3017](https://github.com/ministryofjustice/cloud-platform/issues/3017) - Manager needs to use our PSPs instead of eks-privilege - this has already been done. +--- ## Q1 2021 (January - March) - **Mean Time to Repair**: N/A @@ -910,6 +923,7 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ ### No incidents declared +--- ## Q4 2020 (October - December) - **Mean Time to Repair**: 2h 8m @@ -942,11 +956,12 @@ Full detailed breakdown of events can be found in the [postmortem notes](https:/ - **Resolution**: - Migrate all ingresses back to the default ingress controller +--- ## Q3 2020 (July - September) -- **Mean Time To Repair**: 1h 9m +- **Mean Time To Repair**: 59m -- **Mean Time To Resolve**: 7h 26m +- **Mean Time To Resolve**: 7h 13m ### Incident on 2020-09-28 13:10 - Termination of nodes updating kops Instance Group. 
@@ -1123,11 +1138,12 @@ ttps://docs.google.com/document/d/1kxKwC1B_pnlPbysS0zotbXMKyZcUDmDtnGbEyIHGvgQ/e - We replaced all our master nodes with c5.4xlarge instances, which (currently) have better availability - We and AWS are still investigating longer-term and more reliable fixes +--- ## Q2 2020 (April - June) -- **Mean Time To Repair**: 2h 5m +- **Mean Time To Repair**: 2h 49m -- **Mean Time To Resolve**: 15h 53m +- **Mean Time To Resolve**: 7h 12m ### Incident on 2020-08-04 17:13 @@ -1190,6 +1206,7 @@ ttps://docs.google.com/document/d/1kxKwC1B_pnlPbysS0zotbXMKyZcUDmDtnGbEyIHGvgQ/e - **Resolution**: The Nginx configuration was modified to enable TLSv1, TLSv1.1 and TLSv1.2 +--- ## Q1 2020 (January - March) - **Mean Time To Repair**: 1h 22m @@ -1318,4 +1335,4 @@ Datestamps: please use `YYYY-MM-DD HH:MM` (almost ISO 8601, but more readable), - **Review actions**: - - [mean-time-to-repair.rb]: https://github.com/ministryofjustice/cloud-platform/blob/main/runbooks/bin/mean-time-to-repair.rb + [mean-time-to-repair]: https://github.com/ministryofjustice/cloud-platform/blob/main/cmd/mean-time-to-repair