add explanation why p95 of direct and loki might not be the same
Tofel committed Dec 13, 2024
1 parent 73df456 commit b847867
Showing 4 changed files with 116 additions and 2 deletions.
13 changes: 13 additions & 0 deletions book/src/libs/wasp/benchspy/loki_dillema.md
@@ -20,3 +20,16 @@ This means you can:
- Avoid calculating metrics like the median, 95th percentile latency, or error ratio yourself.

By using `Direct`, you save resources and simplify the process when advanced analysis isn't required.

> [!WARNING]
> Metrics calculated by the two query executors may differ slightly due to differences in their data processing and calculation methods:
> - **`Direct` QueryExecutor**: This method processes all individual data points from the raw dataset, ensuring that every value is taken into account for calculations like averages, percentiles, or other statistics. It provides the most granular and precise results but may also be more sensitive to outliers and noise in the data.
> - **`Loki` QueryExecutor**: This method aggregates data using a default window size of 10 seconds. Within each window, multiple raw data points are combined (e.g., through averaging, summing, or other aggregation functions), which reduces the granularity of the dataset. While this approach can improve performance and reduce noise, it also smooths the data, which may obscure outliers or small-scale variability.
> #### Why This Matters for Percentiles:
> Percentiles, such as the 95th percentile (p95), are particularly sensitive to the granularity of the input data:
> - In the **`Direct` QueryExecutor**, the p95 is calculated across all raw data points, capturing the true variability of the dataset, including any extreme values or spikes.
> - In the **`Loki` QueryExecutor**, the p95 is calculated over aggregated data (i.e., over the 10-second windows). As a result, the raw values within each window are smoothed into a single representative value, potentially lowering or altering the calculated p95. For example, an outlier that would significantly affect the p95 in the `Direct` calculation might be averaged out in the `Loki` window, leading to a slightly lower percentile value.
> #### Key Takeaway:
> The difference arises because `Direct` prioritizes precision by using raw data, while `Loki` prioritizes efficiency and scalability by using aggregated data. When interpreting results, it’s essential to consider how the smoothing effect of `Loki` might impact the representation of variability or extremes in the dataset. This is especially important for metrics like percentiles, where such details can significantly influence the outcome.
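
To see the smoothing effect in isolation, here is a minimal, self-contained Go sketch. It is not BenchSpy code: the nearest-rank percentile helper and the latency numbers are made up purely for illustration, and the windowing simply averages 10 samples at a time the way a 10-second window over per-second data would.

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// percentile returns the nearest-rank percentile of values (0 < p <= 1).
func percentile(values []float64, p float64) float64 {
	sorted := append([]float64(nil), values...)
	sort.Float64s(sorted)
	rank := int(math.Ceil(p*float64(len(sorted)))) - 1
	if rank < 0 {
		rank = 0
	}
	return sorted[rank]
}

func main() {
	// 100 seconds of latencies sampled once per second: mostly 50 ms,
	// with a 500 ms spike that lasts only a few seconds.
	var raw []float64
	for i := 0; i < 100; i++ {
		if i >= 90 && i < 96 {
			raw = append(raw, 500) // outlier spike
		} else {
			raw = append(raw, 50)
		}
	}

	// "Loki-like" view: aggregate the same data into 10-second windows by averaging.
	var windowed []float64
	for i := 0; i < len(raw); i += 10 {
		sum := 0.0
		for _, v := range raw[i : i+10] {
			sum += v
		}
		windowed = append(windowed, sum/10)
	}

	fmt.Printf("p95 over raw samples:      %.1f ms\n", percentile(raw, 0.95))      // 500.0
	fmt.Printf("p95 over 10s window means: %.1f ms\n", percentile(windowed, 0.95)) // 320.0
}
```

Because the whole spike falls into a single window, the window average pulls the p95 from 500 ms down to 320 ms. This is the same kind of gap you may observe between `Direct` and `Loki` results for the same test run.
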
4 changes: 4 additions & 0 deletions book/src/libs/wasp/benchspy/loki_std.md
@@ -105,6 +105,10 @@ compareMedian(string(benchspy.Percentile95Latency))
compareMedian(string(benchspy.ErrorRate))
```

> [!WARNING]
> Standard Loki metrics are all calculated over a 10-second moving window, which results in smoothing of values due to aggregation.
> To learn what that means in detail, please refer to the [To Loki or Not to Loki](./loki_dillema.md) chapter.

## What’s Next?

In this example, we used standard metrics, which are the same as in the first test. Now, [let’s explore how to use your custom LogQL queries](./loki_custom.md).
8 changes: 6 additions & 2 deletions wasp/benchspy/direct.go
@@ -154,7 +154,9 @@ func (g *DirectQueryExecutor) standardQuery(standardMetric StandardLoadMetric) (
medianFn := func(responses *wasp.SliceBuffer[wasp.Response]) (float64, error) {
var asMiliDuration []float64
for _, response := range responses.Data {
asMiliDuration = append(asMiliDuration, float64(response.Duration.Milliseconds()))
// convert the duration from nanoseconds to milliseconds as a float to keep sub-millisecond precision
// otherwise, the duration would be truncated to whole milliseconds
asMiliDuration = append(asMiliDuration, float64(response.Duration.Nanoseconds())/1_000_000)
}

return CalculatePercentile(asMiliDuration, 0.5), nil
@@ -164,7 +166,9 @@ func (g *DirectQueryExecutor) standardQuery(standardMetric StandardLoadMetric) (
p95Fn := func(responses *wasp.SliceBuffer[wasp.Response]) (float64, error) {
var asMiliDuration []float64
for _, response := range responses.Data {
asMiliDuration = append(asMiliDuration, float64(response.Duration.Milliseconds()))
// convert the duration from nanoseconds to milliseconds as a float to keep sub-millisecond precision
// otherwise, the duration would be truncated to whole milliseconds
asMiliDuration = append(asMiliDuration, float64(response.Duration.Nanoseconds())/1_000_000)
}

return CalculatePercentile(asMiliDuration, 0.95), nil
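
The truncation the comments refer to comes from `time.Duration.Milliseconds()` returning whole milliseconds as an `int64`. A tiny standalone snippet (not part of this commit) shows the difference:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 1500 * time.Microsecond // a 1.5 ms response time

	// Milliseconds() returns an int64 and drops the sub-millisecond part.
	fmt.Println(float64(d.Milliseconds())) // 1

	// Dividing the nanosecond count keeps the fractional milliseconds.
	fmt.Println(float64(d.Nanoseconds()) / 1_000_000) // 1.5
}
```
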
@@ -0,0 +1,93 @@
package main

import (
"context"
"fmt"
"math"
"strconv"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/smartcontractkit/chainlink-testing-framework/wasp"
"github.com/smartcontractkit/chainlink-testing-framework/wasp/benchspy"
)

// both executors should give approximately the same results
func TestBenchSpy_Standard_Direct_And_Loki_Metrics(t *testing.T) {
// this test requires CTFv2 node_set with observability stack to be running

label := "benchspy-direct-loki"

gen, err := wasp.NewGenerator(&wasp.Config{
T: t,
GenName: "vu",
CallTimeout: 100 * time.Millisecond,
LoadType: wasp.VU,
Schedule: wasp.Plain(1, 10*time.Second),
VU: wasp.NewMockVU(&wasp.MockVirtualUserConfig{
CallSleep: 50 * time.Millisecond,
}),
Labels: map[string]string{
"branch": label,
"commit": label,
},
LokiConfig: wasp.NewEnvLokiConfig(),
})
require.NoError(t, err)

gen.Run(true)

baseLineReport, err := benchspy.NewStandardReport(
"91ee9e3c903d52de12f3d0c1a07ac3c2a6d141fb",
benchspy.WithStandardQueries(benchspy.StandardQueryExecutor_Direct, benchspy.StandardQueryExecutor_Loki),
benchspy.WithGenerators(gen),
)
require.NoError(t, err, "failed to create original report")

fetchCtx, cancelFn := context.WithTimeout(context.Background(), 60*time.Second)
defer cancelFn()

fetchErr := baseLineReport.FetchData(fetchCtx)
require.NoError(t, fetchErr, "failed to fetch current report")

currentAsLokiSlices := benchspy.MustAllLokiResults(baseLineReport)
currentAsDirectFloats := benchspy.MustAllDirectResults(baseLineReport)

require.NotEmpty(t, currentAsLokiSlices[string(benchspy.MedianLatency)], "%s results were missing for loki", string(benchspy.MedianLatency))
require.NotEmpty(t, currentAsDirectFloats[string(benchspy.MedianLatency)], "%s results were missing for direct", string(benchspy.MedianLatency))

var compareValues = func(t *testing.T, metricName string, lokiFloat, directFloat, maxDiffPercentage float64) {
var diffPercentage float64
if lokiFloat != 0.0 && directFloat != 0.0 {
diffPercentage = (directFloat - lokiFloat) / lokiFloat * 100
} else if lokiFloat == 0.0 && directFloat == 0.0 {
diffPercentage = 0.0
} else {
diffPercentage = 100.0
}
assert.LessOrEqual(t, math.Abs(diffPercentage), maxDiffPercentage, fmt.Sprintf("%s values differ by %.4f%%, which is more than the allowed %.2f%%", metricName, diffPercentage, maxDiffPercentage))
}

lokiFloatSlice, err := benchspy.StringSliceToFloat64Slice(currentAsLokiSlices[string(benchspy.MedianLatency)])
require.NoError(t, err, "failed to convert %s results to float64 slice", string(benchspy.MedianLatency))
lokiMedian := benchspy.CalculatePercentile(lokiFloatSlice, 0.5)

compareValues(t, string(benchspy.MedianLatency), lokiMedian, currentAsDirectFloats[string(benchspy.MedianLatency)], 1.0)

lokip95 := benchspy.CalculatePercentile(lokiFloatSlice, 0.95)
// the max diff is 1.5% here, because data aggregation in Loki has a bigger impact on the p95 than on the median
compareValues(t, string(benchspy.Percentile95Latency), lokip95, currentAsDirectFloats[string(benchspy.Percentile95Latency)], 1.5)

lokiErrorRate := 0
for _, v := range currentAsLokiSlices[string(benchspy.ErrorRate)] {
asInt, err := strconv.Atoi(v)
require.NoError(t, err)
lokiErrorRate += asInt
}

// average the per-window error rates; divide as floats to avoid losing precision to integer division
avgLokiErrorRate := float64(lokiErrorRate) / float64(len(currentAsLokiSlices[string(benchspy.ErrorRate)]))
compareValues(t, string(benchspy.ErrorRate), avgLokiErrorRate, currentAsDirectFloats[string(benchspy.ErrorRate)], 1.0)
}
