Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Prometheus] Update prometheus histogram #36537

Merged
merged 14 commits into from
Sep 22, 2023
1 change: 1 addition & 0 deletions CHANGELOG-developer.next.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ The list below covers the major changes between 7.0.0-rc2 and main only.

==== Bugfixes

- Fix how Prometheus histograms are calculated when percentiles are provide.{pull}36537[36537]
- Stop using `mage:import` in community beats. This was ignoring the vendorized beats directory for some mage targets, using the code available in GOPATH, this causes inconsistencies and compilation problems if the version of the code in the GOPATH is different to the vendored one. Use of `mage:import` will continue to be unsupported in custom beats till beats is migrated to go modules, or mage supports vendored dependencies. {issue}13998[13998] {pull}14162[14162]
- Metricbeat module builders call host parser only once when instantiating light modules. {pull}20149[20149]
- Fix export dashboard command when running against Elastic Cloud hosted Kibana. {pull}22746[22746]
Expand Down
88 changes: 88 additions & 0 deletions metricbeat/helper/prometheus/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ import (
"net/http"
"sort"
"testing"
"time"

"github.com/prometheus/prometheus/pkg/labels"

"github.com/stretchr/testify/assert"

Expand Down Expand Up @@ -1002,3 +1005,88 @@ func TestPrometheusKeyLabels(t *testing.T) {
}
}
}

func TestPrometheusHistogramWithoutSuffix(t *testing.T) {
promHistogramWithAndWithoutBucket := `
# HELP http_server_requests_seconds Duration of HTTP server request handling
# TYPE http_server_requests_seconds histogram
http_server_requests_seconds{exception="None",uri="/actuator/prometheus",quantile="0.002796201",} 0.046137344
http_server_requests_seconds{exception="None",uri="/actuator/prometheus",quantile="0.003145726",} 0.046137344
http_server_requests_seconds_bucket{exception="None",uri="/actuator/prometheus",le="0.001",} 0.0
http_server_requests_seconds_bucket{exception="None",uri="/actuator/prometheus",le="0.001048576",} 0.0
http_server_requests_seconds_count{exception="None",uri="/actuator/prometheus",} 1.0
http_server_requests_seconds_sum{exception="None",uri="/actuator/prometheus",} 0.046745444
http_server_requests_seconds_impossible{exception="None",uri="/actuator/prometheus",} 0.046745444
http_server_requests_seconds_created{exception="None",uri="/actuator/prometheus",} 0.046745444
`

labelsList := []*labels.Label{
{
Name: "exception",
Value: "None",
},
{
Name: "uri",
Value: "/actuator/prometheus",
},
}

cumulativeCounts := []uint64{
uint64(0.0),
uint64(0.0),
}
upperBounds := []float64{
0.001,
0.001048576,
}
var buckets []*Bucket
for i := range cumulativeCounts {
buckets = append(buckets, &Bucket{
CumulativeCount: &cumulativeCounts[i],
UpperBound: &upperBounds[i],
})
}

name := "http_server_requests_seconds"
help := "Duration of HTTP server request handling"

var expectedMetric OpenMetric
expectedMetric.Name = &name
expectedMetric.Label = labelsList
sampleSum := 0.046745444
sampleCount := uint64(1.0)
expectedMetric.Histogram = &Histogram{
IsGaugeHistogram: false,
SampleSum: &sampleSum,
SampleCount: &sampleCount,
Bucket: buckets,
}

expectedMetrics := []*OpenMetric{
&expectedMetric,
}

type Histogram struct {
SampleCount *uint64
SampleSum *float64
Bucket []*Bucket
IsGaugeHistogram bool
}

expected := []*MetricFamily{
{
Name: &name,
Help: &help,
Type: "histogram",
Unit: nil,
Metric: expectedMetrics,
},
}

b := []byte(promHistogramWithAndWithoutBucket)
result, err := ParseMetricFamilies(b, ContentTypeTextFormat, time.Now())
if err != nil {
t.Fatal("ParseMetricFamilies returned an error.")
}
assert.Equal(t, expected, result)
}
54 changes: 39 additions & 15 deletions metricbeat/helper/prometheus/textparse.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func (m *Info) GetValue() int64 {
}
return 0
}

func (m *Info) HasValidValue() bool {
return m != nil && *m.Value == 1
}
Expand Down Expand Up @@ -395,6 +396,14 @@ func summaryMetricName(name string, s float64, qv string, lbls string, t *int64,
return name, metric
}

/*
histogramMetricName returns the metric name without the suffix and its histogram.
OpenMetric suffixes: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#suffixes.
ChrsMark marked this conversation as resolved.
Show resolved Hide resolved
Prometheus histogram suffixes: https://prometheus.io/docs/concepts/metric_types/#histogram.
OpenMetric includes the extra suffix _created, that falls under default in this function.
OpenMetric also includes _g* suffixes that are not present for Prometheus metrics and are taken care of separately in
this function.
*/
func histogramMetricName(name string, s float64, qv string, lbls string, t *int64, isGaugeHistogram bool, e *exemplar.Exemplar, histogramsByName map[string]map[string]*OpenMetric) (string, *OpenMetric) {
var histogram = &Histogram{}
var bucket = []*Bucket{}
Expand All @@ -404,21 +413,18 @@ func histogramMetricName(name string, s float64, qv string, lbls string, t *int6
case isCount(name):
u := uint64(s)
histogram.SampleCount = &u
name = name[:len(name)-6]
name = strings.TrimSuffix(name, suffixCount)
case isSum(name):
histogram.SampleSum = &s
name = name[:len(name)-4]
name = strings.TrimSuffix(name, suffixSum)
case isGaugeHistogram && isGCount(name):
u := uint64(s)
histogram.SampleCount = &u
name = name[:len(name)-7]
name = strings.TrimSuffix(name, suffixGCount)
case isGaugeHistogram && isGSum(name):
histogram.SampleSum = &s
name = name[:len(name)-5]
default:
if isBucket(name) {
name = name[:len(name)-7]
}
name = strings.TrimSuffix(name, suffixGSum)
case isBucket(name):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have any unit-tests that cover these changes? It's hard to understand if something might break here in any corner case or whatever?

If not would you mind adding some tests? Otherwise we can exclude these changes from the patch if not needed for the fix. Improving code should always go along with adding tests ensuring that functionality is not broken :).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added one more test for this exact function. Do you think that is enough?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that for the new functionality the added test is fine but what about the existing functionality that is touched here? How we can be sure that the code which is changed (improved) is not broken?
I'm talking mostly about the changes like the one at https://github.com/elastic/beats/pull/36537/files#diff-d66f15900f5676672ddf353458a9373cf8b1ab8e237c473191e4a64ae93cbba2L407-R416.

So the point here is that if we want to improve the code we should ensure that we don't break it. So to my mind we should either:

  1. ensure that we have tests for these lines that still work
  2. if we don't have tests then add them along with the corresponding code changes.
  3. if we cannot add tests maybe it's safer to avoid any code changes that might be prone to errors

Could you verify the above cases please?

Copy link
Contributor Author

@constanca-m constanca-m Sep 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We never had tests for those cases. Do you mean adding a test case for every type (histogram, histogram gauge etc)? I only changed it to strings.TrimSuffix because then we know what is happening instead of defining the lenght of the suffix and then subtracting like we were doing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean adding a test case for every type (histogram, histogram gauge etc)?

Ideally yes since we touch these specific lines of code. Otherwise the improvement seems to be incomplete to me and potentially prone to errors :).
Maybe that's out of the scope of this PR but the generic idea is what I mention.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. So should I try to add new tests for this PR or should we leave it to a different one?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be nice since we are on this. Otherwise I doubt it will happen at any time soon.
Another option would be to exclude the specific changes/improvements that are not related to this specific fix and add them later in a follow-up along with tests. That would unblock this one and ensures the quality as well. It's up to you to decide.

f, err := strconv.ParseFloat(qv, 64)
if err != nil {
f = math.MaxUint64
Expand All @@ -433,6 +439,9 @@ func histogramMetricName(name string, s float64, qv string, lbls string, t *int6
}
bkt.Exemplar = e
}
name = strings.TrimSuffix(name, suffixBucket)
default:
return "", nil
}

_, k := histogramsByName[name]
Expand Down Expand Up @@ -485,10 +494,14 @@ func ParseMetricFamilies(b []byte, contentType string, ts time.Time) ([]*MetricF
case textparse.EntryType:
buf, t := parser.Type()
s := string(buf)
_, ok = metricFamiliesByName[s]
fam, ok = metricFamiliesByName[s]
if !ok {
fam = &MetricFamily{Name: &s, Type: t}
metricFamiliesByName[s] = fam
} else {
constanca-m marked this conversation as resolved.
Show resolved Hide resolved
// In case the metric family already exists, we need to make sure the type is correctly defined
// instead of being `unknown` like it was initialized for the other entry types (help and unit).
fam.Type = t
}
mt = t
continue
Expand Down Expand Up @@ -537,16 +550,21 @@ func ParseMetricFamilies(b []byte, contentType string, ts time.Time) ([]*MetricF
var lbls strings.Builder
lbls.Grow(len(mets))
var labelPairs = []*labels.Label{}
var qv string // value of le or quantile label
for _, l := range lset.Copy() {
if l.Name == labels.MetricName {
continue
}

if l.Name != model.QuantileLabel && l.Name != labels.BucketLabel { // quantile and le are special labels handled below

if l.Name == model.QuantileLabel {
ChrsMark marked this conversation as resolved.
Show resolved Hide resolved
qv = lset.Get(model.QuantileLabel)
} else if l.Name == labels.BucketLabel {
qv = lset.Get(labels.BucketLabel)
} else {
lbls.WriteString(l.Name)
lbls.WriteString(l.Value)
}

n := l.Name
v := l.Value

Expand All @@ -569,7 +587,7 @@ func ParseMetricFamilies(b []byte, contentType string, ts time.Time) ([]*MetricF
mn := lset.Get(labels.MetricName)
metric = &OpenMetric{Name: &mn, Counter: counter, Label: labelPairs}
if isTotal(metricName) && contentType == OpenMetricsType { // Remove suffix _total, get lookup metricname
ChrsMark marked this conversation as resolved.
Show resolved Hide resolved
lookupMetricName = metricName[:len(metricName)-6]
lookupMetricName = strings.TrimSuffix(metricName, suffixTotal)
} else {
lookupMetricName = metricName
}
Expand All @@ -583,7 +601,7 @@ func ParseMetricFamilies(b []byte, contentType string, ts time.Time) ([]*MetricF
metric = &OpenMetric{Name: &metricName, Info: info, Label: labelPairs}
lookupMetricName = metricName
case textparse.MetricTypeSummary:
lookupMetricName, metric = summaryMetricName(metricName, v, lset.Get(model.QuantileLabel), lbls.String(), &t, summariesByName)
lookupMetricName, metric = summaryMetricName(metricName, v, qv, lbls.String(), &t, summariesByName)
metric.Label = labelPairs
if !isSum(metricName) {
continue
Expand All @@ -593,7 +611,10 @@ func ParseMetricFamilies(b []byte, contentType string, ts time.Time) ([]*MetricF
if hasExemplar := parser.Exemplar(&e); hasExemplar {
exm = &e
}
lookupMetricName, metric = histogramMetricName(metricName, v, lset.Get(labels.BucketLabel), lbls.String(), &t, false, exm, histogramsByName)
lookupMetricName, metric = histogramMetricName(metricName, v, qv, lbls.String(), &t, false, exm, histogramsByName)
if metric == nil { // metric name does not have a suffix supported for the type histogram
constanca-m marked this conversation as resolved.
Show resolved Hide resolved
continue
}
metric.Label = labelPairs
if !isSum(metricName) {
continue
Expand All @@ -603,7 +624,10 @@ func ParseMetricFamilies(b []byte, contentType string, ts time.Time) ([]*MetricF
if hasExemplar := parser.Exemplar(&e); hasExemplar {
exm = &e
}
lookupMetricName, metric = histogramMetricName(metricName, v, lset.Get(labels.BucketLabel), lbls.String(), &t, true, exm, histogramsByName)
lookupMetricName, metric = histogramMetricName(metricName, v, qv, lbls.String(), &t, true, exm, histogramsByName)
if metric == nil { // metric name does not have a suffix supported for the type gauge histogram
continue
}
metric.Label = labelPairs
metric.Histogram.IsGaugeHistogram = true
if !isGSum(metricName) {
Expand Down
Loading