Skip to content

Commit

Permalink
improvement(perf): add validation rules for latency decorator
Browse files Browse the repository at this point in the history
Added validation rules for results sent by
`latency_calculator_decorator` to Argus.
Each workload and result name (nemesis, predefined step) may define its
own rules.

The current rules were derived from existing results, so that typical
good results pass.

closes: scylladb#9237
  • Loading branch information
soyacz committed Nov 25, 2024
1 parent 481ebee commit c0a7676
Show file tree
Hide file tree
Showing 15 changed files with 208 additions and 21 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
latency_decorator_error_thresholds:
write:
_mgmt_repair_cli:
duration:
fixed_limit: 10800
_terminate_and_wait:
duration:
fixed_limit: 500
add_new_nodes:
duration:
fixed_limit: 3600
decommission_nodes:
duration:
fixed_limit: 5000
replace_node:
duration:
fixed_limit: 1800

read:
_mgmt_repair_cli:
duration:
fixed_limit: 3600
_terminate_and_wait:
duration:
fixed_limit: 500
add_new_nodes:
duration:
fixed_limit: 1800
decommission_nodes:
duration:
fixed_limit: 2500
replace_node:
duration:
fixed_limit: 1200

mixed:
_mgmt_repair_cli:
duration:
fixed_limit: 3600
_terminate_and_wait:
duration:
fixed_limit: 500
add_new_nodes:
duration:
fixed_limit: 2000
decommission_nodes:
duration:
fixed_limit: 3000
replace_node:
duration:
fixed_limit: 1400
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
latency_decorator_error_thresholds:
write:
"300000":
P99 write:
fixed_limit: 1000
"400000":
P99 write:
fixed_limit: 1000
unthrottled:
P90 write:
fixed_limit: null
P99 write:
fixed_limit: null
Throughput write:
best_pct: 10

read:
"150000":
P90 read:
fixed_limit: 1
P99 read:
fixed_limit: 1
"300000":
P90 read:
fixed_limit: 1
P99 read:
fixed_limit: 1
"450000":
P90 read:
fixed_limit: 1
P99 read:
fixed_limit: 3
"600000":
P90 read:
fixed_limit: 1.5
P99 read:
fixed_limit: 50
"700000":
P90 read:
fixed_limit: 3
P99 read:
fixed_limit: 50
unthrottled:
P90 read:
fixed_limit: null
P99 read:
fixed_limit: null
Throughput read:
best_pct: 10

mixed:
"50000":
P90 write:
fixed_limit: 1
P90 read:
fixed_limit: 1
P99 write:
fixed_limit: 2.5
P99 read:
fixed_limit: 2.5
"150000":
P90 write:
fixed_limit: 1
P90 read:
fixed_limit: 1.7
P99 write:
fixed_limit: 3
P99 read:
fixed_limit: 3
"300000":
P90 write:
fixed_limit: 3
P90 read:
fixed_limit: 3
P99 write:
fixed_limit: 5
P99 read:
fixed_limit: 5
"450000":
P90 write:
fixed_limit: 3
P90 read:
fixed_limit: 4
P99 write:
fixed_limit: 15
P99 read:
fixed_limit: 15
unthrottled:
P90 write:
fixed_limit: null
P90 read:
fixed_limit: null
P99 write:
fixed_limit: null
P99 read:
fixed_limit: null
Throughput write:
best_pct: 10
32 changes: 32 additions & 0 deletions defaults/test_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,35 @@ skip_test_stages: {}
n_db_zero_token_nodes: 0
zero_token_instance_type_db: 'i4i.large'
use_zero_nodes: false

latency_decorator_error_thresholds:
write:
default:
P90 write:
fixed_limit: 5
P99 write:
fixed_limit: 10
Throughput write:
best_abs: 2000
read:
default:
P90 read:
fixed_limit: 5
P99 read:
fixed_limit: 10
Throughput read:
best_abs: 2000
mixed:
default:
P90 write:
fixed_limit: 5
P90 read:
fixed_limit: 5
P99 write:
fixed_limit: 10
P99 read:
fixed_limit: 10
Throughput write:
best_abs: 2000
Throughput read:
best_abs: 2000
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
perfRegressionParallelPipeline(
backend: "aws",
test_name: "performance_regression_test.PerformanceRegressionTest",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/disable_kms.yaml"]""",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/disable_kms.yaml", "configurations/performance/latency-decorator-error-thresholds-nemesis-ent.yaml"]""",
sub_tests: ["test_latency_write_with_nemesis", "test_latency_read_with_nemesis", "test_latency_mixed_with_nemesis"],
test_email_title: "latency during operations / tablets",
perf_extra_jobs_to_compare: "scylla-master/perf-regression/scylla-master-perf-regression-latency-650gb-with-nemesis-tablets",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
perfRegressionParallelPipeline(
backend: "aws",
test_name: "performance_regression_test.PerformanceRegressionTest",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_kms.yaml"]""",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_kms.yaml", "configurations/performance/latency-decorator-error-thresholds-nemesis-ent.yaml"]""",
sub_tests: ["test_latency_write_with_nemesis", "test_latency_read_with_nemesis", "test_latency_mixed_with_nemesis"],
perf_extra_jobs_to_compare: """["scylla-enterprise/scylla-enterprise-perf-regression-latency-650gb-with-nemesis","scylla-enterprise/perf-regression/scylla-enterprise-perf-regression-latency-650gb-with-nemesis"]""",
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ perfRegressionParallelPipeline(
backend: "aws",
aws_region: "us-east-1",
test_name: "performance_regression_gradual_grow_throughput.PerformanceRegressionPredefinedStepsTest",
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps_enterprise.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml"]''',
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps_enterprise.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml", "configurations/performance/latency-decorator-error-thresholds-steps-ent-vnodes.yaml"]''',
sub_tests: ["test_write_gradual_increase_load", "test_read_gradual_increase_load", "test_mixed_gradual_increase_load"],
)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
perfRegressionParallelPipeline(
backend: "aws",
test_name: "performance_regression_test.PerformanceRegressionTest",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml"]""",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/performance/latency-decorator-error-thresholds-nemesis-oss.yaml"]""",
sub_tests: ["test_latency_write_with_nemesis", "test_latency_read_with_nemesis", "test_latency_mixed_with_nemesis"],
test_email_title: "latency during operations / tablets",
perf_extra_jobs_to_compare: "scylla-master/perf-regression/scylla-master-perf-regression-latency-650gb-with-nemesis-tablets",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
perfRegressionParallelPipeline(
backend: "aws",
test_name: "performance_regression_test.PerformanceRegressionTest",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/tablets_disabled.yaml"]""",
test_config: """["test-cases/performance/perf-regression-latency-650gb-with-nemesis.yaml", "configurations/tablets_disabled.yaml", "configurations/performance/latency-decorator-error-thresholds-nemesis-oss.yaml"]""",
sub_tests: ["test_latency_write_with_nemesis", "test_latency_read_with_nemesis", "test_latency_mixed_with_nemesis"],
perf_extra_jobs_to_compare: """["scylla-master/scylla-master-perf-regression-latency-650gb-with-nemesis","scylla-master/perf-regression/scylla-master-perf-regression-latency-650gb-with-nemesis"]""",
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ perfRegressionParallelPipeline(
backend: "aws",
aws_region: "us-east-1",
test_name: "performance_regression_gradual_grow_throughput.PerformanceRegressionPredefinedStepsTest",
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps_enterprise.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml"]''',
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps_enterprise.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml", "configurations/performance/latency-decorator-error-thresholds-steps-ent-vnodes.yaml"]''',
sub_tests: ["test_read_gradual_increase_load", "test_mixed_gradual_increase_load"],
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ perfRegressionParallelPipeline(
backend: "aws",
aws_region: "us-east-1",
test_name: "performance_regression_gradual_grow_throughput.PerformanceRegressionPredefinedStepsTest",
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps_enterprise.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml","configurations/perf-loaders-shard-aware-config.yaml"]''',
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps_enterprise.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml","configurations/perf-loaders-shard-aware-config.yaml", "configurations/performance/latency-decorator-error-thresholds-steps-ent-vnodes.yaml"]''',
sub_tests: ["test_write_gradual_increase_load"],
)
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ perfRegressionParallelPipeline(
backend: "aws",
aws_region: "us-east-1",
test_name: "performance_regression_gradual_grow_throughput.PerformanceRegressionPredefinedStepsTest",
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml"]''',
test_config: '''["test-cases/performance/perf-regression-predefined-throughput-steps.yaml", "configurations/performance/cassandra_stress_gradual_load_steps.yaml", "configurations/disable_kms.yaml", "configurations/tablets_disabled.yaml", "configurations/disable_speculative_retry.yaml", "configurations/performance/latency-decorator-error-thresholds-steps-ent-vnodes.yaml"]''',
sub_tests: ["test_write_gradual_increase_load", "test_read_gradual_increase_load", "test_mixed_gradual_increase_load"],

timeout: [time: 1600, unit: "MINUTES"]
Expand Down
10 changes: 5 additions & 5 deletions sdcm/argus_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,12 @@ def submit_results_to_argus(argus_client: ArgusClient, result_table: GenericResu


def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, description: str, cycle: int, result: dict,
start_time: float = 0):
start_time: float = 0, error_thresholds: dict = None):
result_table = workload_to_table[workload]()
result_table.name = f"{workload} - {name} - latencies"
result_table.description = f"{workload} workload - {description}"
operation_error_thresholds = LATENCY_ERROR_THRESHOLDS.get(name, LATENCY_ERROR_THRESHOLDS["default"])
error_thresholds = error_thresholds[workload]["default"] | error_thresholds[workload].get(name, {})
result_table.validation_rules = {metric: ValidationRule(**rules) for metric, rules in error_thresholds.items()}
try:
start_time = datetime.fromtimestamp(start_time or time.time(), tz=timezone.utc).strftime('%H:%M:%S')
except ValueError:
Expand All @@ -154,16 +155,15 @@ def send_result_to_argus(argus_client: ArgusClient, workload: str, name: str, de
result_table.add_result(column=f"P{percentile} {operation}",
row=f"Cycle #{cycle}",
value=value,
status=Status.PASS if value < operation_error_thresholds[f"percentile_{percentile}"] else Status.ERROR)
status=Status.UNSET)
if value := summary[operation.upper()].get("throughput", None):
# TODO: This column will be validated in the gradual test. `PASS` is temporary status. Should be handled later
result_table.add_result(column=f"Throughput {operation.lower()}",
row=f"Cycle #{cycle}",
value=value,
status=Status.UNSET)

result_table.add_result(column="duration", row=f"Cycle #{cycle}",
value=result["duration_in_sec"], status=Status.PASS)
value=result["duration_in_sec"], status=Status.UNSET)
try:
overview_screenshot = [screenshot for screenshot in result["screenshots"] if "overview" in screenshot][0]
result_table.add_result(column="Overview", row=f"Cycle #{cycle}",
Expand Down
4 changes: 4 additions & 0 deletions sdcm/sct_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1672,6 +1672,10 @@ class SCTConfiguration(dict):
dict(name="zero_token_instance_type_db", env="SCT_ZERO_TOKEN_INSTANCE_TYPE_DB", type=str,
help="""Instance type for zero token node"""),

dict(name="latency_decorator_error_thresholds", env="SCT_LATENCY_DECORATOR_ERROR_THRESHOLDS", type=dict_or_str,
help="Error thresholds for latency decorator."
" Defined by dict: {<write, read, mixed>: {<default|nemesis_name>: {<metric_name>: {<rule>: <value>}}}}"),

]

required_params = ['cluster_backend', 'test_duration', 'n_db_nodes', 'n_loaders', 'use_preinstalled_scylla',
Expand Down
4 changes: 3 additions & 1 deletion sdcm/utils/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def wrapped(*args, **kwargs): # noqa: PLR0914
hdr_throughput += values["throughput"]
result["cycle_hdr_throughput"] = round(hdr_throughput)
result["reactor_stalls_stats"] = reactor_stall_stats

error_thresholds = tester.params.get("latency_decorator_error_thresholds")
if "steady" in func_name.lower():
if 'Steady State' not in latency_results:
latency_results['Steady State'] = result
Expand All @@ -266,6 +266,7 @@ def wrapped(*args, **kwargs): # noqa: PLR0914
cycle=0,
result=result,
start_time=start,
error_thresholds=error_thresholds,
)
else:
latency_results[func_name]['cycles'].append(result)
Expand All @@ -277,6 +278,7 @@ def wrapped(*args, **kwargs): # noqa: PLR0914
cycle=len(latency_results[func_name]['cycles']),
result=result,
start_time=start,
error_thresholds=error_thresholds,
)

with open(latency_results_file_path, 'w', encoding="utf-8") as file:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
test_duration: 3000
prepare_write_cmd: ["cassandra-stress write no-warmup cl=ALL n=162500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..162500000",
"cassandra-stress write no-warmup cl=ALL n=162500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=162500000..325000000",
"cassandra-stress write no-warmup cl=ALL n=162500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=325000000..487500000",
"cassandra-stress write no-warmup cl=ALL n=162500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=487500000..650000000"]
prepare_write_cmd: ["cassandra-stress write no-warmup cl=ALL n=16250000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..16250000",
"cassandra-stress write no-warmup cl=ALL n=16250000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=16250000..32500000",
"cassandra-stress write no-warmup cl=ALL n=16250000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=32500000..48750000",
"cassandra-stress write no-warmup cl=ALL n=16250000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=48750000..65000000"]

stress_cmd_w: "cassandra-stress write no-warmup cl=QUORUM duration=2850m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=250 fixed=20332/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..650000000,325000000,9750000)' "
stress_cmd_r: "cassandra-stress read no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=250 fixed=10310/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..650000000,325000000,9750000)' "
stress_cmd_m: "cassandra-stress mixed no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=250 fixed=8750/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..650000000,325000000,6500000)' "
stress_cmd_w: "cassandra-stress write no-warmup cl=QUORUM duration=2850m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=250 fixed=20332/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..65000000,32500000,975000)' "
stress_cmd_r: "cassandra-stress read no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=250 fixed=10310/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..65000000,32500000,975000)' "
stress_cmd_m: "cassandra-stress mixed no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=250 fixed=8750/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..65000000,32500000,650000)' "

n_db_nodes: 3
nemesis_add_node_cnt: 3
Expand Down

0 comments on commit c0a7676

Please sign in to comment.