diff --git a/book/src/libs/wasp/benchspy/real_world.md b/book/src/libs/wasp/benchspy/real_world.md
index ae6af9d62..1d75e9160 100644
--- a/book/src/libs/wasp/benchspy/real_world.md
+++ b/book/src/libs/wasp/benchspy/real_world.md
@@ -87,15 +87,17 @@ This function fetches the current report (for version passed as environment vari
 Let’s assume you want to ensure that none of the performance metrics degrade by more than **1%** between releases (and that error rate has not changed at all). Here's how you can write assertions using a convenient function for the `Direct` query executor:
 
 ```go
-hasErrors, errors := benchspy.CompareDirectWithThresholds(
+hasFailed, err := benchspy.CompareDirectWithThresholds(
 	1.0, // Max 1% worse median latency
 	1.0, // Max 1% worse p95 latency
 	1.0, // Max 1% worse maximum latency
 	0.0, // No increase in error rate
 	currentReport, previousReport)
-require.False(t, hasErrors, fmt.Sprintf("errors found: %v", errors))
+require.False(t, hasFailed, fmt.Sprintf("issues found: %v", err))
 ```
 
+The error returned by this function is a concatenation of all threshold violations found for each standard metric and generator.
+
 ---
 
 ## Conclusion
diff --git a/wasp/benchspy/report.go b/wasp/benchspy/report.go
index ec6b8f1f9..d2ba0054b 100644
--- a/wasp/benchspy/report.go
+++ b/wasp/benchspy/report.go
@@ -3,6 +3,7 @@ package benchspy
 import (
 	"context"
 	"encoding/json"
+	goerrors "errors"
 	"fmt"
 	"os"
 	"strings"
@@ -150,21 +151,27 @@ func MustAllPrometheusResults(sr *StandardReport) map[string]model.Value {
 }
 
 func calculateDiffPercentage(current, previous float64) float64 {
-	var diffPrecentage float64
-	if previous != 0.0 && current != 0.0 {
-		diffPrecentage = (current - previous) / previous * 100
-	} else if previous == 0.0 && current == 0.0 {
-		diffPrecentage = 0.0
-	} else {
-		diffPrecentage = 100.0
+	if previous == 0.0 {
+		if current == 0.0 {
+			return 0.0
+		}
+		return 999.0 // Convention for infinite change when previous is 0
+	}
+
+	if current == 0.0 {
+		return -100.0 // Complete improvement when current is 0
 	}
 
-	return diffPrecentage
+	return (current - previous) / previous * 100
 }
 
 // CompareDirectWithThresholds evaluates the current and previous reports against specified thresholds.
 // It checks for significant differences in metrics and returns any discrepancies found, aiding in performance analysis.
-func CompareDirectWithThresholds(medianThreshold, p95Threshold, maxThreshold, errorRateThreshold float64, currentReport, previousReport *StandardReport) (bool, map[string][]error) {
+func CompareDirectWithThresholds(medianThreshold, p95Threshold, maxThreshold, errorRateThreshold float64, currentReport, previousReport *StandardReport) (bool, error) {
+	if currentReport == nil || previousReport == nil {
+		return true, errors.New("one or both reports are nil")
+	}
+
 	L.Info().
 		Str("Current report", currentReport.CommitOrTag).
 		Str("Previous report", previousReport.CommitOrTag).
@@ -174,6 +181,10 @@ func CompareDirectWithThresholds(medianThreshold, p95Threshold, maxThreshold, er
 		Float64("Error rate threshold", errorRateThreshold).
Msg("Comparing Direct metrics with thresholds") + if thresholdsErr := validateThresholds(medianThreshold, p95Threshold, maxThreshold, errorRateThreshold); thresholdsErr != nil { + return true, thresholdsErr + } + allCurrentResults := MustAllDirectResults(currentReport) allPreviousResults := MustAllDirectResults(previousReport) @@ -239,7 +250,46 @@ func CompareDirectWithThresholds(medianThreshold, p95Threshold, maxThreshold, er Int("Number of meaningful differences", len(errors)). Msg("Finished comparing Direct metrics with thresholds") - return len(errors) > 0, errors + return len(errors) > 0, concatenateGeneratorErrors(errors) +} + +func concatenateGeneratorErrors(errors map[string][]error) error { + var errs []error + for generatorName, errors := range errors { + for _, err := range errors { + errs = append(errs, fmt.Errorf("[%s] %w", generatorName, err)) + } + } + return goerrors.Join(errs...) +} + +func validateThresholds(medianThreshold, p95Threshold, maxThreshold, errorRateThreshold float64) error { + var errs []error + + var validateThreshold = func(name string, threshold float64) error { + if threshold < 0 || threshold > 100 { + return fmt.Errorf("%s threshold %.4f is not in the range [0, 100]", name, threshold) + } + return nil + } + + if err := validateThreshold("median", medianThreshold); err != nil { + errs = append(errs, err) + } + + if err := validateThreshold("p95", p95Threshold); err != nil { + errs = append(errs, err) + } + + if err := validateThreshold("max", maxThreshold); err != nil { + errs = append(errs, err) + } + + if err := validateThreshold("error rate", errorRateThreshold); err != nil { + errs = append(errs, err) + } + + return goerrors.Join(errs...) } // PrintStandardDirectMetrics outputs a comparison of direct metrics between two reports. 
diff --git a/wasp/benchspy/report_test.go b/wasp/benchspy/report_test.go
index 7e07c5216..6e1eed711 100644
--- a/wasp/benchspy/report_test.go
+++ b/wasp/benchspy/report_test.go
@@ -1384,13 +1384,10 @@ func TestBenchSpy_CompareDirectWithThresholds(t *testing.T) {
 			},
 		}
 
-		failed, errs := CompareDirectWithThresholds(10.0, 1.0, 1.0, 1.0, currentReport, previousReport)
+		failed, err := CompareDirectWithThresholds(10.0, 1.0, 1.0, 1.0, currentReport, previousReport)
 		assert.True(t, failed)
-		assert.Len(t, errs, 1)
-		assert.Len(t, errs["test-gen"], 1)
-		for _, err := range errs["test-gen"] {
-			assert.Contains(t, err.Error(), "different, which is higher than the threshold")
-		}
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 50.0000%% different, which is higher than the threshold", string(MedianLatency)))
 	})
 
 	t.Run("all metrics exceed thresholds", func(t *testing.T) {
@@ -1442,13 +1439,13 @@ func TestBenchSpy_CompareDirectWithThresholds(t *testing.T) {
 			},
 		}
 
-		failed, errs := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
 		assert.True(t, failed)
-		assert.Len(t, errs, 1)
-		assert.Len(t, errs["test-gen"], 4)
-		for _, err := range errs["test-gen"] {
-			assert.Contains(t, err.Error(), "different, which is higher than the threshold")
-		}
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 50.0000%% different, which is higher than the threshold", string(MedianLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 50.0000%% different, which is higher than the threshold", string(Percentile95Latency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 50.0000%% different, which is higher than the threshold", string(MaxLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 100.0000%% different, which is higher than the threshold", string(ErrorRate)))
 	})
 
 	t.Run("handle zero values", func(t *testing.T) {
@@ -1500,12 +1497,67 @@ func TestBenchSpy_CompareDirectWithThresholds(t *testing.T) {
 			},
 		}
 
-		failed, errs := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
 		assert.False(t, failed)
-		assert.Empty(t, errs)
+		assert.Nil(t, err)
 	})
 
-	t.Run("handle missing metrics", func(t *testing.T) {
+	t.Run("handle missing metrics from current report", func(t *testing.T) {
+		previousReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 100.0,
+							string(Percentile95Latency): 0.0,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		currentReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 105.0,
+							// missing other metrics
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from current report", string(Percentile95Latency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from current report", string(MaxLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from current report", string(ErrorRate)))
+	})
+
+	t.Run("handle missing metrics from previous report", func(t *testing.T) {
 		previousReport := &StandardReport{
 			BasicData: BasicData{
 				GeneratorConfigs: map[string]*wasp.Config{
@@ -1528,6 +1580,63 @@ func TestBenchSpy_CompareDirectWithThresholds(t *testing.T) {
 			},
 		}
 
+		currentReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 105.0,
+							string(Percentile95Latency): 0.0,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from previous report", string(Percentile95Latency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from previous report", string(MaxLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from previous report", string(ErrorRate)))
+	})
+
+	t.Run("handle missing metrics from both reports", func(t *testing.T) {
+		previousReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 100.0,
+							string(Percentile95Latency): 0.0,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
 		currentReport := &StandardReport{
 			BasicData: BasicData{
 				GeneratorConfigs: map[string]*wasp.Config{
@@ -1550,13 +1659,12 @@ func TestBenchSpy_CompareDirectWithThresholds(t *testing.T) {
 			},
 		}
 
-		failed, errs := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
 		assert.True(t, failed)
-		assert.Len(t, errs, 1)
-		assert.Len(t, errs["test-gen"], 3) // Should have errors for missing P95, Max, and Error Rate
-		for _, err := range errs["test-gen"] {
-			assert.Contains(t, err.Error(), "results were missing")
-		}
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from current report", string(Percentile95Latency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from current report", string(MaxLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s metric results were missing from current report", string(ErrorRate)))
 	})
 
 	t.Run("handle zero to non-zero transition", func(t *testing.T) {
@@ -1608,13 +1716,207 @@ func TestBenchSpy_CompareDirectWithThresholds(t *testing.T) {
 			},
 		}
 
-		failed, errs := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
 		assert.True(t, failed)
-		assert.Len(t, errs, 1)
-		assert.Len(t, errs["test-gen"], 4)
-		for _, err := range errs["test-gen"] {
-			assert.Contains(t, err.Error(), "100.0000% different")
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 999.0000%% different, which is higher than the threshold", string(MedianLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 999.0000%% different, which is higher than the threshold", string(Percentile95Latency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 999.0000%% different, which is higher than the threshold", string(MaxLatency)))
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 999.0000%% different, which is higher than the threshold", string(ErrorRate)))
+	})
+
+	t.Run("handle non-zero to zero transition", func(t *testing.T) {
+		previousReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 10.0,
+							string(Percentile95Latency): 20.0,
+							string(MaxLatency): 311.0,
+							string(ErrorRate): 1.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
 		}
+
+		currentReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 0.0,
+							string(Percentile95Latency): 0.0,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, currentReport, previousReport)
+		assert.False(t, failed)
+		assert.Nil(t, err)
+	})
+
+	t.Run("handle edge-cases", func(t *testing.T) {
+		previousReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 10.1,
+							string(Percentile95Latency): 10.1,
+							string(MaxLatency): 10.0,
+							string(ErrorRate): 10.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		currentReport := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 10.2,
+							string(Percentile95Latency): 10.1999,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		failed, err := CompareDirectWithThresholds(0.99, 0.9892, 10.0, 10.0, currentReport, previousReport)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), fmt.Sprintf("[test-gen] %s is 0.9901%% different, which is higher than the threshold", string(MedianLatency)))
+	})
+
+	t.Run("handle nil reports", func(t *testing.T) {
+		report := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 10.2,
+							string(Percentile95Latency): 10.1999,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		failed, err := CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, report, nil)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), "one or both reports are nil")
+
+		failed, err = CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, nil, report)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), "one or both reports are nil")
+
+		failed, err = CompareDirectWithThresholds(10.0, 10.0, 10.0, 10.0, nil, nil)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), "one or both reports are nil")
+	})
+
+	t.Run("handle incorrect thresholds", func(t *testing.T) {
+		report := &StandardReport{
+			BasicData: BasicData{
+				GeneratorConfigs: map[string]*wasp.Config{
+					"test-gen": {
+						GenName: "test-gen",
+					},
+				},
+			},
+			QueryExecutors: []QueryExecutor{
+				&MockQueryExecutor{
+					KindFn: func() string { return string(StandardQueryExecutor_Direct) },
+					ResultsFn: func() map[string]interface{} {
+						return map[string]interface{}{
+							string(MedianLatency): 10.0,
+							string(Percentile95Latency): 10.0,
+							string(MaxLatency): 0.0,
+							string(ErrorRate): 0.0,
+						}
+					},
+					GeneratorNameFn: func() string { return "test-gen" },
+				},
+			},
+		}
+
+		failed, err := CompareDirectWithThresholds(-0.1, 100.0, 0.0, 100.0, report, report)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), "median threshold -0.1000 is not in the range [0, 100]")
+
+		failed, err = CompareDirectWithThresholds(1.0, 101.0, 0.0, 100.0, report, report)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), "p95 threshold 101.0000 is not in the range [0, 100]")
+
+		failed, err = CompareDirectWithThresholds(-1, -1, -1, -1, report, report)
+		assert.True(t, failed)
+		assert.NotNil(t, err)
+		assert.Contains(t, err.Error(), "median threshold -1.0000 is not in the range [0, 100]")
+		assert.Contains(t, err.Error(), "p95 threshold -1.0000 is not in the range [0, 100]")
+		assert.Contains(t, err.Error(), "max threshold -1.0000 is not in the range [0, 100]")
+		assert.Contains(t, err.Error(), "error rate threshold -1.0000 is not in the range [0, 100]")
 	})
 }