Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Setup MCOA dashboards and scrape configs #1726

Open
wants to merge 35 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
888ecde
add first dash
thibaultmg Dec 16, 2024
abe6cd2
wip
thibaultmg Dec 18, 2024
2bf01e7
fix deprecated metrics
thibaultmg Dec 19, 2024
20a412c
optimise container_memory_cache
thibaultmg Dec 19, 2024
80b087b
optimise container_memory_rss
thibaultmg Dec 19, 2024
5c72adc
optimise and fix container_memory_
thibaultmg Dec 19, 2024
774397c
remove kube_pod_owner
thibaultmg Dec 19, 2024
7af4744
remove kube_pod_container_resource_requests:sum
thibaultmg Dec 19, 2024
fce0259
more rules
thibaultmg Dec 19, 2024
ae4a691
remove some hub rules
thibaultmg Dec 19, 2024
b35f130
consolidate ci sripts
thibaultmg Jan 7, 2025
6fddba5
update ci readme
thibaultmg Jan 7, 2025
e5a1a8b
update and clean cicd metrics checks
thibaultmg Jan 9, 2025
e21cf4b
refactor grafana dashboards, add scrape configs
thibaultmg Jan 9, 2025
4f16fc3
update grafana rendering
thibaultmg Jan 9, 2025
27e7bed
add missing update on script
thibaultmg Jan 10, 2025
bb146e4
fix unit tests
thibaultmg Jan 10, 2025
ffd3ea5
add scrapeconfig perms to mco
thibaultmg Jan 10, 2025
74f0628
add promalpha to scheme
thibaultmg Jan 10, 2025
e108ab9
add deperecated suffix to dashboard titles
thibaultmg Jan 13, 2025
e0ba372
change nexus to mcoa
thibaultmg Jan 13, 2025
e6a5b48
add metrics checks in github actions
thibaultmg Jan 13, 2025
0af9572
fix ci
thibaultmg Jan 13, 2025
bcbb71d
fix ci
thibaultmg Jan 13, 2025
349cd96
clean
thibaultmg Jan 13, 2025
0a2f917
clean prometheus tar
thibaultmg Jan 14, 2025
9f36702
add copyright to script
thibaultmg Jan 15, 2025
4f37e7a
add copyright
thibaultmg Jan 15, 2025
89e7a5a
format script
thibaultmg Jan 15, 2025
ec7e8e1
fix lint
thibaultmg Jan 15, 2025
270d9db
fix lint
thibaultmg Jan 15, 2025
a05dd6a
fix lint
thibaultmg Jan 15, 2025
76b353b
update scrapeconfigs and rules labels
thibaultmg Jan 24, 2025
7e47a3d
update addonconfig with new resources
thibaultmg Jan 24, 2025
7c5d62e
fix unit tests
thibaultmg Jan 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/metrics.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Workflow that verifies the metrics collected for the MCOA dashboards.
# It runs on pushes to main, on every pushed tag and on pull requests. The single job checks out the code,
# installs Go, installs the tools via `make install-check-metrics-deps` and then runs `make check-metrics`.
name: collected metrics (MCOA)

on:
push:
branches:
- main
tags:
- "*"
pull_request:

jobs:
metrics:
runs-on: ubuntu-latest
name: Check collected metrics for dashboards
env:
GOBIN: /tmp/.bin
steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Install Go
uses: actions/setup-go@v5
with:
go-version: 1.22.x
cache-dependency-path: "**/*.sum"

- name: Install dependencies
run: make install-check-metrics-deps

- name: Check metrics for MCOA
run: make check-metrics
16 changes: 14 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ NewHistorgram,NewHistogramVec,NewSummary,NewSummaryVec}=github.com/prometheus/cl
NewCounterVec,NewCounterVec,NewGauge,NewGaugeVec,NewGaugeFunc,NewHistorgram,NewHistogramVec,NewSummary,NewSummaryVec},\
github.com/NYTimes/gziphandler.{GzipHandler}=github.com/klauspost/compress/gzhttp.{GzipHandler},\
sync/atomic=go.uber.org/atomic,\
io/ioutil.{Discard,NopCloser,ReadAll,ReadDir,ReadFile,TempDir,TempFile,Writefile}" ./...
@$(FAILLINT) -paths "fmt.{Print,Println}" -ignore-tests ./...
io/ioutil.{Discard,NopCloser,ReadAll,ReadDir,ReadFile,TempDir,TempFile,Writefile}" ./operators/... ./collectors/... ./loaders/... ./proxy/...
@$(FAILLINT) -paths "fmt.{Print,Println}" -ignore-tests ./operators/... ./collectors/... ./loaders/... ./proxy/...
@echo ">> examining all of the Go files"
@go vet -stdmethods=false ./...
@echo ">> linting all of the Go files GOGC=${GOGC}"
Expand All @@ -117,6 +117,10 @@ io/ioutil.{Discard,NopCloser,ReadAll,ReadDir,ReadFile,TempDir,TempFile,Writefile
@go run ./scripts/copyright
$(call require_clean_work_tree,'detected files without copyright, run make lint and commit changes')

.PHONY: check-metrics
check-metrics: ## Run the check-metrics target of cicd-scripts/metrics.
@$(MAKE) -C cicd-scripts/metrics check-metrics

.PHONY: unit-tests ## Run all unit tests.
unit-tests: unit-tests-operators unit-tests-loaders unit-tests-proxy unit-tests-collectors

Expand Down Expand Up @@ -188,6 +192,14 @@ install-envtest-deps: ## Install env-test.
@mkdir -p $(BIN_DIR)
@./scripts/install-binaries.sh install_envtest_deps $(BIN_DIR)

.PHONY: install-check-metrics-deps
install-check-metrics-deps: ## Install jq, yq, mimirtool and promtool through scripts/install-binaries.sh.
@mkdir -p $(BIN_DIR)
@./scripts/install-binaries.sh install_jq $(BIN_DIR)
@./scripts/install-binaries.sh install_yq $(BIN_DIR)
@./scripts/install-binaries.sh install_mimirtool $(BIN_DIR)
@./scripts/install-binaries.sh install_promtool $(BIN_DIR)

##@ Multi-Cluster-Observability Operator

.PHONY: deploy
Expand Down
60 changes: 60 additions & 0 deletions cicd-scripts/metrics/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (c) 2021 Red Hat, Inc.
# Copyright Contributors to the Open Cluster Management project

# Root directory of the Grafana dashboards manifests, relative to this Makefile.
DASHBOARDS_DIR = ../../operators/multiclusterobservability/manifests/base/grafana
# One directory per set of dashboards; each check-* target below works on one of them.
PLATFORM_DASH_DIR = $(DASHBOARDS_DIR)/platform-mcoa
HCP_DASH_DIR = $(DASHBOARDS_DIR)/hcp-mcoa
ALERTS_DASH_DIR = $(DASHBOARDS_DIR)/alerts
VIRTUALIZATION_DASH_DIR = $(DASHBOARDS_DIR)/virtualization
# Comma separated metrics passed to dashcheck as --ignored-dashboard-metrics: they are used by the dashboards
# but are not expected in the scrape configs.
HUB_RULES = cluster:memory_requested:ratio,cluster:memory_utilized:ratio,cluster:cpu_allocatable:sum,cluster:cpu_requested:ratio,cluster:cpu_cores:sum,acm_label_names,acm_managed_cluster_labels

# Scratch directory created with mktemp when this Makefile is parsed. The check-* targets write their
# dash-metrics file into it and clean-tmpdir removes it.
TMPDIR := $(shell mktemp -d)

# check-metrics runs every per-directory check; clean-tmpdir is listed as its last prerequisite.
.PHONY: check-metrics check-platform-metrics check-hcp-metrics check-alerts-metrics check-virtualization-metrics clean-tmpdir
check-metrics: check-platform-metrics check-hcp-metrics check-alerts-metrics check-virtualization-metrics clean-tmpdir

# Checks the platform dashboards in three steps:
#  1. dashcheck compares the metrics extracted from the dashboards with the platform scrape config,
#     ignoring the metrics listed in HUB_RULES;
#  2. promtool validates the rules found under .spec of prometheus-rule.yaml;
#  3. rulescheck compares the rules federated by the scrape config with the ones defined in
#     prometheus-rule.yaml (namespace_workload_pod:kube_pod_owner:relabel may be defined more than once).
check-platform-metrics:
@echo "--> Checking platform metrics:"
@$(CURDIR)/scripts/extract-dashboards-metrics.sh $(PLATFORM_DASH_DIR) | tr '\n' ',' > $(TMPDIR)/dash-metrics
@go run cmd/dashcheck/main.go --scrape-configs=$(PLATFORM_DASH_DIR)/scrape-config.yaml \
--dashboard-metrics=$$(cat $(TMPDIR)/dash-metrics) \
--ignored-dashboard-metrics=$(HUB_RULES)
@cat $(PLATFORM_DASH_DIR)/prometheus-rule.yaml | yq '.spec' | promtool check rules
@go run cmd/rulescheck/main.go --scrape-configs=$(PLATFORM_DASH_DIR)/scrape-config.yaml \
--rules=$(PLATFORM_DASH_DIR)/prometheus-rule.yaml \
--ignore-duplicated-rules=namespace_workload_pod:kube_pod_owner:relabel
@rm -d $(TMPDIR)/dash-metrics

# Checks the HCP dashboards: dashcheck compares the metrics extracted from the dashboards with the HCP
# scrape config (the platform scrape config is passed as an additional one and HUB_RULES are ignored),
# then rulescheck is run on the HCP scrape config. No --rules file is passed to rulescheck here.
check-hcp-metrics:
	@echo "--> Checking hcp metrics:"
	@$(CURDIR)/scripts/extract-dashboards-metrics.sh $(HCP_DASH_DIR) | tr '\n' ',' > $(TMPDIR)/dash-metrics
	@go run cmd/dashcheck/main.go --scrape-configs=$(HCP_DASH_DIR)/scrape-config.yaml \
		--dashboard-metrics=$$(cat $(TMPDIR)/dash-metrics) \
		--ignored-dashboard-metrics=$(HUB_RULES) \
		--additional-scrape-configs=$(PLATFORM_DASH_DIR)/scrape-config.yaml
	@# No trailing backslash on the next line: it would turn the rm below into arguments of rulescheck.
	@go run cmd/rulescheck/main.go --scrape-configs=$(HCP_DASH_DIR)/scrape-config.yaml
	@rm -d $(TMPDIR)/dash-metrics

# Checks the alerts dashboards: dashcheck compares the metrics extracted from the dashboards with the
# alerts scrape config, with the platform scrape config passed as an additional one and HUB_RULES ignored.
check-alerts-metrics:
@echo "--> Checking alert metrics:"
@$(CURDIR)/scripts/extract-dashboards-metrics.sh $(ALERTS_DASH_DIR) | tr '\n' ',' > $(TMPDIR)/dash-metrics
@go run cmd/dashcheck/main.go --scrape-configs=$(ALERTS_DASH_DIR)/scrape-config.yaml \
--dashboard-metrics=$$(cat $(TMPDIR)/dash-metrics) \
--ignored-dashboard-metrics=$(HUB_RULES) \
--additional-scrape-configs=$(PLATFORM_DASH_DIR)/scrape-config.yaml
@rm -d $(TMPDIR)/dash-metrics

# Checks the virtualization dashboards: dashcheck compares the metrics extracted from the dashboards with
# the virtualization scrape config, with the platform and alerts scrape configs passed as additional ones
# and HUB_RULES ignored.
check-virtualization-metrics:
@echo "--> Checking virtualization metrics:"
@$(CURDIR)/scripts/extract-dashboards-metrics.sh $(VIRTUALIZATION_DASH_DIR) | tr '\n' ',' > $(TMPDIR)/dash-metrics
@go run cmd/dashcheck/main.go --scrape-configs=$(VIRTUALIZATION_DASH_DIR)/scrape-config.yaml \
--dashboard-metrics=$$(cat $(TMPDIR)/dash-metrics) \
--ignored-dashboard-metrics=$(HUB_RULES) \
--additional-scrape-configs=$(PLATFORM_DASH_DIR)/scrape-config.yaml,$(ALERTS_DASH_DIR)/scrape-config.yaml
@rm -d $(TMPDIR)/dash-metrics

# Removes the scratch directory, but only when TMPDIR is non-empty and points to an existing directory.
clean-tmpdir:
@if [ -n "$(TMPDIR)" ] && [ -d "$(TMPDIR)" ]; then \
rm -r "$(TMPDIR)"; \
echo "Temporary directory $(TMPDIR) removed."; \
fi
15 changes: 15 additions & 0 deletions cicd-scripts/metrics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Metrics CI tools

Collection of simple CI tools and scripts to validate the list of metrics collected for a list of dashboards.
This helps maintain the list of metrics and rules in sync with the dashboards, and ensure that we don't collect
more metrics than needed.

More precisely, it helps verify that:
* The metrics needed by the dashboards are collected, and no others
* The rules needed by the dashboards are defined, and no others
* Query rules are valid

## Usage

When adding a new list of dashboards in a new directory, make sure that you define the corresponding scrapeConfig and rules.
Then add a target in the Makefile to run the metrics check for the new dashboards, following the existing examples.
89 changes: 89 additions & 0 deletions cicd-scripts/metrics/cmd/dashcheck/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright (c) Red Hat, Inc.
// Copyright Contributors to the Open Cluster Management project
// Licensed under the Apache License 2.0

/*
CI tool that provides a simple CLI to ensure that a list of metrics used in dashboards is well federated by
the referenced scrapeConfigs.
It ensures that metrics are not duplicated in scrape configs and that no unneeded metric is collected.
*/
package main

import (
"flag"
"fmt"
"os"
"slices"
"strings"

"github.com/stolostron/multicluster-observability-operator/cicd-scripts/metrics/internal/scrapeconfig"
"github.com/stolostron/multicluster-observability-operator/cicd-scripts/metrics/internal/utils"
)

// main verifies that the metrics federated by the scrape configs are exactly the metrics used by the
// dashboards.
//
// Flags:
//   - scrape-configs: comma separated paths of the scrape configs to check (required).
//   - dashboard-metrics: comma separated metrics used by the dashboards (required).
//   - ignored-dashboard-metrics: comma separated dashboard metrics to leave out of the comparison.
//   - additional-scrape-configs: comma separated paths of scrape configs collected in addition to the main
//     ones; metrics they over-collect are not reported.
//
// The process exits with status 1 on any usage error, read error, duplicated metric or mismatch, so that a
// misconfigured CI invocation cannot pass silently.
func main() {
	scrapeConfigsArg := flag.String("scrape-configs", "", "Path to the comma separated scrape_configs")
	dashboardMetricsArg := flag.String("dashboard-metrics", "", "Comma separated dashboard metrics")
	ignoredDashboardMetricsArg := flag.String("ignored-dashboard-metrics", "", "Comma separated ignored dashboard metrics. For example, rules that are computed on the hub instead of being collected from the spokes.")
	additionalScrapeConfigsArg := flag.String("additional-scrape-configs", "", "Path to the comma separated scrape_configs that are collected in addition of the main one. Over collected metrics from them are ignored.")
	flag.Parse()

	fmt.Println("Dashcheck — Verifying alignement of federated metrics from scrape configs for dashboard metrics.")

	// Missing required flags are usage errors: fail instead of returning with a success status.
	if *scrapeConfigsArg == "" {
		fmt.Println("Please provide the scrape_configs paths")
		os.Exit(1)
	}

	if *dashboardMetricsArg == "" {
		fmt.Println("Please provide the dashboard metrics")
		os.Exit(1)
	}

	ignoredDashboardMetrics := strings.Split(*ignoredDashboardMetricsArg, ",")

	// Drop empty entries (e.g. produced by a trailing comma) and the explicitly ignored metrics.
	dashboardMetrics := strings.Split(*dashboardMetricsArg, ",")
	dashboardMetrics = slices.DeleteFunc(dashboardMetrics, func(s string) bool { return s == "" || slices.Contains(ignoredDashboardMetrics, s) })
	if len(dashboardMetrics) == 0 {
		fmt.Println("No dashboard metrics found")
		os.Exit(1)
	}

	collectedMetrics, err := scrapeconfig.ReadFederatedMetrics(*scrapeConfigsArg)
	if err != nil {
		fmt.Printf("Failed to read scrape configs: %v\n", err)
		os.Exit(1)
	}

	var additionalMetrics []string
	if len(*additionalScrapeConfigsArg) > 0 {
		additionalMetrics, err = scrapeconfig.ReadFederatedMetrics(*additionalScrapeConfigsArg)
		if err != nil {
			fmt.Printf("Failed to read additional scrape configs: %v\n", err)
			os.Exit(1)
		}

		collectedMetrics = append(collectedMetrics, additionalMetrics...)
	}

	// The duplicate check covers the main and the additional scrape configs together.
	if dups := utils.Duplicates(collectedMetrics); len(dups) > 0 {
		fmt.Println("Duplicate metrics found in scrape configs: ", dups)
		os.Exit(1)
	}

	added, removed := utils.Diff(dashboardMetrics, collectedMetrics)
	// Remove additional metrics from the added list
	// They must be ignored
	added = slices.DeleteFunc(added, func(s string) bool { return s == "" || slices.Contains(additionalMetrics, s) })
	if len(added) > 0 {
		fmt.Println("Metrics found in scrape configs but not in dashboards: ", added)
		os.Exit(1)
	}

	if len(removed) > 0 {
		fmt.Println("Metrics found in dashboards but not in scrape configs: ", removed)
		os.Exit(1)
	}

	greenCheckMark := "\033[32m" + "✓" + "\033[0m"
	fmt.Println(greenCheckMark, "Scrape configs are collecting all dashboards metrics, not more. Good job!")
}
106 changes: 106 additions & 0 deletions cicd-scripts/metrics/cmd/rulescheck/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright (c) Red Hat, Inc.
// Copyright Contributors to the Open Cluster Management project
// Licensed under the Apache License 2.0

/*
CI tool that provides a simple CLI to ensure that metrics resulting from rules evaluation defined in scrape configs
are defined in the listed rule files.
It ensures that rules are not duplicated and that no unneeded rule is defined.
*/
package main

import (
"flag"
"fmt"
"os"
"slices"
"strings"

"github.com/stolostron/multicluster-observability-operator/cicd-scripts/metrics/internal/rule"
"github.com/stolostron/multicluster-observability-operator/cicd-scripts/metrics/internal/scrapeconfig"
"github.com/stolostron/multicluster-observability-operator/cicd-scripts/metrics/internal/utils"
)

// main verifies that the rules federated by the scrape configs (federated metrics whose name contains ":")
// are exactly the recording rules defined in the given prometheus rules files.
//
// Flags:
//   - scrape-configs: comma separated paths of the scrape configs to check (required).
//   - rules: comma separated paths of the prometheus rules files (required as soon as the scrape configs
//     federate at least one rule).
//   - ignore-duplicated-rules: comma separated rule names that may be defined more than once.
//
// The process exits with status 0 when no rule is federated or when both sets match, and with status 1 on
// any usage error, read error, duplicate or mismatch.
func main() {
	scrapeConfigsArg := flag.String("scrape-configs", "", "Path to the comma separated scrape_configs")
	rulesArg := flag.String("rules", "", "Comma separated prometheus rules files")
	ignoreDupRulesArg := flag.String("ignore-duplicated-rules", "", "Comma separated ignored duplicated rules")
	greenCheckMark := "\033[32m" + "✓" + "\033[0m"
	flag.Parse()

	fmt.Println("Rulescheck — Verifying alignement of rules definition with the rules federated by the scrape configs.")

	// A missing required flag is a usage error: fail instead of returning with a success status.
	if *scrapeConfigsArg == "" {
		fmt.Println("Please provide the scrape_configs paths")
		os.Exit(1)
	}

	collectedRules, err := scrapeconfig.ReadFederatedMetrics(*scrapeConfigsArg)
	if err != nil {
		fmt.Printf("Failed to read scrape configs: %v\n", err)
		os.Exit(1)
	}

	// Only names containing ":" are treated as rules; the other federated metrics are ignored here.
	collectedRules = slices.DeleteFunc(collectedRules, func(s string) bool { return !strings.Contains(s, ":") })
	if len(collectedRules) == 0 {
		fmt.Println(greenCheckMark, "No rule collected by the scrape configs.")
		return
	}

	if dups := utils.Duplicates(collectedRules); len(dups) > 0 {
		fmt.Println("Duplicate metrics found in scrape configs: ", dups)
		os.Exit(1)
	}

	// Rules are federated at this point, so their definitions are mandatory.
	if *rulesArg == "" {
		fmt.Println("Please provide prometheus rules files")
		os.Exit(1)
	}

	promRulesList, err := rule.ReadFiles(*rulesArg)
	if err != nil {
		fmt.Println("Error reading prometheus rules: ", err)
		os.Exit(1)
	}

	if len(promRulesList) == 0 {
		fmt.Println("No prometheus rules found")
		os.Exit(1)
	}

	rulesDefined := []string{}
	for _, promRule := range promRulesList {
		if promRule == nil {
			fmt.Println("Rule is nil")
			os.Exit(1)
		}

		metrics, err := rule.RuleNames(promRule)
		if err != nil {
			fmt.Println("Error extracting metrics: ", err)
			os.Exit(1)
		}

		rulesDefined = append(rulesDefined, metrics...)
	}

	// Rules without a record name (e.g. alerting rules) are not recorded metrics: keep them out of the
	// duplicate check and of the comparison with the scrape configs.
	rulesDefined = slices.DeleteFunc(rulesDefined, func(s string) bool { return s == "" })

	ignoredRules := strings.Split(*ignoreDupRulesArg, ",")
	rulesWithoutIgnoredDups := slices.DeleteFunc(slices.Clone(rulesDefined), func(s string) bool { return slices.Contains(ignoredRules, s) })
	if dups := utils.Duplicates(rulesWithoutIgnoredDups); len(dups) > 0 {
		fmt.Println("Duplicate rules found in prometheus rules: ", dups)
		os.Exit(1)
	}

	// Same argument order as in dashcheck: the reference list first, the scrape configs second, so that
	// "added" holds what the scrape configs federate on top of the defined rules and "removed" holds the
	// defined rules that are not federated.
	added, removed := utils.Diff(rulesDefined, collectedRules)
	if len(added) > 0 {
		fmt.Println("Metrics found in scrape configs but not in rules: ", added)
		os.Exit(1)
	}

	if len(removed) > 0 {
		fmt.Println("Metrics found in rules but not in scrape configs: ", removed)
		os.Exit(1)
	}

	fmt.Println(greenCheckMark, "The rules collected by the scrapeConfigs are all defined, not more. Good job!")
}
54 changes: 54 additions & 0 deletions cicd-scripts/metrics/internal/rule/rule.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright (c) Red Hat, Inc.
// Copyright Contributors to the Open Cluster Management project
// Licensed under the Apache License 2.0

package rule

import (
"fmt"
"os"
"strings"

prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"sigs.k8s.io/yaml"
)

// ReadFiles loads every PrometheusRule file listed in rulesPath, a comma separated list of paths, and
// returns the decoded rules in the order they were listed. Each path is printed before it is read.
// Loading stops at the first file that fails, in which case a nil slice and that error are returned.
func ReadFiles(rulesPath string) ([]*prometheusv1.PrometheusRule, error) {
	loaded := []*prometheusv1.PrometheusRule{}

	for _, file := range strings.Split(rulesPath, ",") {
		fmt.Println("Reading prometheus rule: ", file)

		promRule, err := ReadFile(file)
		if err != nil {
			return nil, err
		}

		loaded = append(loaded, promRule)
	}

	return loaded, nil
}

// ReadFile reads the file at rulesPath and decodes its YAML content into a PrometheusRule.
// The returned error wraps the underlying read or unmarshal error and mentions the file path.
func ReadFile(rulesPath string) (*prometheusv1.PrometheusRule, error) {
	raw, readErr := os.ReadFile(rulesPath)
	if readErr != nil {
		return nil, fmt.Errorf("failed to read file %s: %w", rulesPath, readErr)
	}

	var parsed prometheusv1.PrometheusRule
	if decodeErr := yaml.Unmarshal(raw, &parsed); decodeErr != nil {
		return nil, fmt.Errorf("failed to unmarshal file %s: %w", rulesPath, decodeErr)
	}

	return &parsed, nil
}

// RuleNames returns the record names of the rules defined in all the groups of the given PrometheusRule,
// in declaration order. Rules that have no record name (such as alerting rules) are skipped, so the result
// never contains empty names. An error is returned only when rules is nil.
func RuleNames(rules *prometheusv1.PrometheusRule) ([]string, error) {
	if rules == nil {
		return nil, fmt.Errorf("prometheus rule is nil")
	}

	names := []string{}
	for _, group := range rules.Spec.Groups {
		for _, r := range group.Rules {
			// An empty record name does not identify a recorded metric.
			if r.Record == "" {
				continue
			}

			names = append(names, r.Record)
		}
	}

	return names, nil
}
Loading
Loading