Merge pull request #1299 from cloudflare/promql/fragile

Remove aggregation logic from promql/fragile
cloudflare · Feb 20, 2025 · 3d27620 · 3d27620
2 parents e045a3e + a60fc7c
commit 3d27620
Show file tree

Hide file tree

Showing 5 changed files with 15 additions and 337 deletions.
diff --git a/cmd/pint/tests/0076_ci_group_errors.txt b/cmd/pint/tests/0076_ci_group_errors.txt
@@ -293,9 +293,6 @@ rules.yml:32-33 Bug: `rule/owner` comments are required in all files, please add
 rules.yml:33 Warning: Alert query doesn't have any condition, it will always fire if the metric exists. (alerts/comparison)
  33 |     expr: errors / sum(requests) without(rack)
 
-rules.yml:33 Warning: Aggregation using `without()` can be fragile when used inside binary expression because both sides must have identical sets of labels to produce any results, adding or removing labels to metrics used here can easily break the query, consider aggregating using `by()` to ensure consistent labels. (promql/fragile)
- 33 |     expr: errors / sum(requests) without(rack)
-
 rules.yml:35-36 Bug: `rule/owner` comments are required in all files, please add a `# pint file/owner $owner` somewhere in this file and/or `# pint rule/owner $owner` on top of each rule. (rule/owner)
  35 |   - record: regexp
  36 |     expr: sum(no_such_metric{job=~"fake"})
@@ -333,7 +330,4 @@ rules.yml:38-39 Bug: `rule/owner` comments are required in all files, please add
 rules.yml:39 Warning: Alert query doesn't have any condition, it will always fire if the metric exists. (alerts/comparison)
  39 |     expr: errors / sum(requests) without(rack)
 
-rules.yml:39 Warning: Aggregation using `without()` can be fragile when used inside binary expression because both sides must have identical sets of labels to produce any results, adding or removing labels to metrics used here can easily break the query, consider aggregating using `by()` to ensure consistent labels. (promql/fragile)
- 39 |     expr: errors / sum(requests) without(rack)
-
 level=ERROR msg="Fatal error" err="submitting reports: fatal error(s) reported"
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -9,6 +9,10 @@
 - Added [promql/impossible](checks/promql/impossible.md) check.
 - Added `ignoreMatchingElsewhere` option to [promql/series](checks/promql/series.md) check.
 
+### Changed
+
+- Removed aggregation checks from the [promql/fragile](checks/promql/fragile.md) check - #1289.
+
 ### Fixed
 
 - Improved the logic of [promql/series](checks/promql/series.md) check to skip checks on some selectors

diff --git a/docs/checks/promql/fragile.md b/docs/checks/promql/fragile.md
@@ -6,94 +6,20 @@ grand_parent: Documentation
 
 # promql/fragile
 
-This check will try to find rules with queries that can be rewritten in a way
-which makes them more resilient to label changes.
+This check will try to find alerting rules that might produce flapping alerts
+due to how they use sampling functions like `topk()`.
 
-Example:
+Consider this alert:
 
-Let's assume we have these metrics:
-
-```js
-errors{cluster="prod", instance="server1", job="node_exporter"} 5
-requests{cluster="prod", instance="server1", job="node_exporter", rack="a"} 10
-requests{cluster="prod", instance="server1", job="node_exporter", rack="b"} 30
-requests{cluster="prod", instance="server1", job="node_exporter", rack="c"} 25
-```
-
-If we want to calculate the ratio of errors to requests we can use this query:
-
-```js
-errors / sum(requests) without(rack)
-```
-
-`sum(requests) without(rack)` will produce this result:
-
-```js
-requests{cluster="prod", instance="server1", job="node_exporter"} 65
-```
-
-Both sides of the query will have exact same set of labels:
-
-```js
-{cluster="prod", instance="server1", job="node_exporter"}`
-```
-
-which is needed to be able to use a binary expression here, and so this query will
-work correctly.
-
-But the risk here is that if at any point we change labels on those metrics we might
-end up with left and right hand sides having different set of labels.
-Let's see what happens if we add an extra label to `requests` metric.
-
-```js
-errors{cluster="prod", instance="server1", job="node_exporter"} 5
-requests{cluster="prod", instance="server1", job="node_exporter", rack="a", path="/"} 3
-requests{cluster="prod", instance="server1", job="node_exporter", rack="a", path="/users"} 7
-requests{cluster="prod", instance="server1", job="node_exporter", rack="b", path="/"} 10
-requests{cluster="prod", instance="server1", job="node_exporter", rack="b", path="/login"} 1
-requests{cluster="prod", instance="server1", job="node_exporter", rack="b", path="/users"} 19
-requests{cluster="prod", instance="server1", job="node_exporter", rack="c", path="/"} 25
-```
-
-Our left hand side (`errors` metric) still has the same set of labels:
-
-```js
-{cluster="prod", instance="server1", job="node_exporter"}
-```
-
-But `sum(requests) without(rack)` will now return a different result:
-
-```js
-requests{cluster="prod", instance="server1", job="node_exporter", path="/"} 38
-requests{cluster="prod", instance="server1", job="node_exporter", path="/users"} 26
-requests{cluster="prod", instance="server1", job="node_exporter", path="/login"} 1
-```
-
-We no longer get a single result because we only aggregate by removing `rack` label.
-Newly added `path` label is not being aggregated away so it splits our results into
-multiple series. Since our left hand side doesn't have any `path` label it won't
-match any of the right hand side result and this query won't produce anything.
-
-One solution here is to add `path` to `without()` to remove this label when aggregating,
-but this requires updating all queries that use this metric every time labels change.
-
-Another solution is to rewrite this query with `by()` instead of `without()` which
-will ensure that extra labels will be aggregated away automatically:
-
-```js
-errors / sum(requests) by(cluster, instance, job)
+```yaml
+- alert: oops
+  expr: topk(10, mymetric > 0)
 ```
 
-The list of labels we aggregate by doesn't have to match exactly with the list
-of labels on the left hand side, we can use `on()` to instruct Prometheus
-which labels should be used to match both sides.
-For example if we would remove `job` label during aggregation we would once
-again have different sets of labels on both side, but we can fix that
-by adding labels we use in `by()` to `on()`:
-
-```js
-errors / on(cluster, instance) sum(requests) by(cluster, instance)
-```
+The `topk(10, mymetric > 0)` query used here might return a different set of time series on each rule evaluation.
+Different time series will have different labels, and because labels from returned time series are added to the
+alert labels it means that a different set of alerts will fire each time.
+This will cause flapping alerts from this rule.
 
 ## Configuration
 

diff --git a/internal/checks/promql_fragile.go b/internal/checks/promql_fragile.go
@@ -54,16 +54,6 @@ func (c FragileCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rul
 		return nil
 	}
 
-	for _, problem := range c.checkAggregation(expr.Value.Value, expr.Query) {
-		problems = append(problems, Problem{
-			Lines:    expr.Value.Lines,
-			Reporter: c.Reporter(),
-			Text:     problem.text,
-			Details:  problem.details,
-			Severity: problem.severity,
-		})
-	}
-
 	if rule.AlertingRule != nil {
 		for _, problem := range c.checkSampling(expr.Value.Value, expr.Query.Expr) {
 			problems = append(problems, Problem{
@@ -79,54 +69,6 @@ func (c FragileCheck) Check(_ context.Context, _ discovery.Path, rule parser.Rul
 	return problems
 }
 
-func (c FragileCheck) checkAggregation(query string, node *parser.PromQLNode) (problems []exprProblem) {
-	if n := utils.HasOuterBinaryExpr(node); n != nil && n.Op != promParser.LOR && n.Op != promParser.LUNLESS {
-		if n.VectorMatching != nil && n.VectorMatching.On {
-			goto NEXT
-		}
-		if _, ok := n.LHS.(*promParser.NumberLiteral); ok {
-			goto NEXT
-		}
-		if _, ok := n.RHS.(*promParser.NumberLiteral); ok {
-			goto NEXT
-		}
-		var isFragile bool
-		for _, child := range node.Children {
-			for _, src := range utils.LabelsSource(query, child.Expr) {
-				if src.Type == utils.AggregateSource && !src.FixedLabels {
-					isFragile = true
-				}
-			}
-		}
-		if !isFragile {
-			goto NEXT
-		}
-
-		// don't report any issues if query uses same metric for both sides
-		series := map[string]struct{}{}
-		for _, ac := range node.Children {
-			for _, vs := range utils.HasVectorSelector(ac) {
-				series[vs.Name] = struct{}{}
-			}
-		}
-		if len(series) >= 2 {
-			p := exprProblem{
-				text:     "Aggregation using `without()` can be fragile when used inside binary expression because both sides must have identical sets of labels to produce any results, adding or removing labels to metrics used here can easily break the query, consider aggregating using `by()` to ensure consistent labels.",
-				severity: Warning,
-			}
-			problems = append(problems, p)
-			return problems
-		}
-	}
-
-NEXT:
-	for _, child := range node.Children {
-		problems = append(problems, c.checkAggregation(query, child)...)
-	}
-
-	return problems
-}
-
 func (c FragileCheck) checkSampling(expr string, node promParser.Node) (problems []exprProblem) {
 	for _, src := range utils.LabelsSource(expr, node) {
 		if src.Type != utils.AggregateSource {