90% utilization tests

scylladb · Dec 11, 2024 · 8cf8d4d · 8cf8d4d
1 parent 63b10fb
commit 8cf8d4d
Show file tree

Hide file tree

Showing 16 changed files with 549 additions and 0 deletions.
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_drop.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_drop.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: drop-table config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-i4i-2xlarge-drop.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_experiment.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_experiment.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: experiment config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-experiment.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_scale_out_larger_instance.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_scale_out_larger_instance.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: i4i-large-grow-i4i-4xlarge config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-i4i-large-grow-i4i-4xlarge.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_scale_out_scale_in_datacenter.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_scale_out_scale_in_datacenter.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: grow-shrink-dc config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-i4i-2xlarge-grow-shrink-dc.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_scale_out_scale_in_same_instance.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_scale_out_scale_in_same_instance.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: grow-shrink-i4i-2xlarge config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-i4i-2xlarge-grow-shrink-i4i-2xlarge.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_scale_out_smaller_instance.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_scale_out_smaller_instance.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: i4i-4xlarge-grow-i4i-large config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-i4i-4xlarge-grow-i4i-large.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/90_percent_truncate.jenkinsfile b/jenkins-pipelines/temp_90_percent/90_percent_truncate.jenkinsfile
@@ -0,0 +1,19 @@
+#!groovy
+
+// Load the 'sct' shared library through legacySCM(scm); trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
+def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)
+
+perfRegressionParallelPipeline(  // 90% utilization job: truncate config, test_latency_mixed_with_nemesis sub-test
+    backend: "aws",
+    availability_zone: 'a',
+    test_name: "performance_regression_test.PerformanceRegressionTest",
+    test_config: """["test-cases/temp_90_percent/90-percent-perf-i4i-2xlarge-truncate.yaml", "configurations/disable_kms.yaml"]""",
+    sub_tests: ["test_latency_mixed_with_nemesis"],
+    scylla_version: 'enterprise:latest',
+    email_recipients: '[email protected]',
+    post_behavior_db_nodes: 'destroy',
+    post_behavior_loader_nodes: 'destroy',
+    post_behavior_monitor_nodes: 'destroy',
+    post_behavior_k8s_cluster: 'destroy',
+    provision_type: 'on_demand',
+)
diff --git a/jenkins-pipelines/temp_90_percent/_display_name b/jenkins-pipelines/temp_90_percent/_display_name
@@ -0,0 +1 @@
+90% Utilization
diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py
@@ -2094,6 +2094,18 @@ def _truncate_cmd_timeout_suffix(self, truncate_timeout):  # pylint: disable=no-
         # NOTE: 'self' is used by the 'scylla_versions' decorator
         return ''
 
+    @latency_calculator_decorator(legend="Drop Table")
+    def disrupt_drop(self):
+        keyspace_drop = 'ks_drop'
+        table = 'standard1'
+
+        self._prepare_test_table(ks=keyspace_drop)
+
+        # do the actual drop
+        with self.cluster.cql_connection_patient(self.target_node, keyspace=keyspace_drop) as session:
+            session.execute(f"DROP TABLE {table};")
+
+    @latency_calculator_decorator(legend="Truncate Table")
     def disrupt_truncate(self):
         keyspace_truncate = 'ks_truncate'
         table = 'standard1'
@@ -4244,6 +4256,14 @@ def _double_cluster_load(self, duration: int) -> None:
         results = self.tester.get_stress_results(queue=stress_queue, store_results=False)
         self.log.info(f"Double load results: {results}")
 
+    @target_data_nodes
+    def disrupt_grow_cluster(self):  # grow-only nemesis: calls _grow_cluster and never a shrink step
+        sleep_time_between_ops = self.cluster.params.get('nemesis_sequence_sleep_between_ops')
+        if not self.has_steady_run and sleep_time_between_ops:  # skipped once has_steady_run is set, or when no pause is configured
+            self.steady_state_latency()
+            self.has_steady_run = True
+        self._grow_cluster(rack=None)
+
     @target_data_nodes
     def disrupt_grow_shrink_cluster(self):
         sleep_time_between_ops = self.cluster.params.get('nemesis_sequence_sleep_between_ops')
@@ -4258,6 +4278,45 @@ def disrupt_grow_shrink_cluster(self):
             self._double_cluster_load(duration)
         self._shrink_cluster(rack=None, new_nodes=new_nodes)
 
+    @target_data_nodes
+    def disrupt_grow_shrink_datacenter(self):
+        if self._is_it_on_kubernetes():
+            raise UnsupportedNemesis("Operator doesn't support multi-DC yet. Skipping.")
+        if self.cluster.test_config.MULTI_REGION:
+            raise UnsupportedNemesis(
+                "grow_shring_datacenter skipped for multi-dc scenario (https://github.com/scylladb/scylla-cluster-tests/issues/5369)")
+        InfoEvent(message='Starting Grow Shrink DC Nemesis').publish()
+        sleep_time_between_ops = self.cluster.params.get('nemesis_sequence_sleep_between_ops')
+        sleep_time_between_ops = sleep_time_between_ops if sleep_time_between_ops else 10
+        sleep_time_between_ops = sleep_time_between_ops * 60
+        if not self.has_steady_run and sleep_time_between_ops:
+            self.steady_state_latency()
+            self.has_steady_run = True
+
+        # create a new dc
+        InfoEvent(message='New DC').publish()
+        nodes_on_new_dc = []
+        initial_dc_nodes = self.cluster.params.get('n_db_nodes')
+        for _ in range(initial_dc_nodes):
+            nodes_on_new_dc += [self._add_new_node_in_new_dc()]
+        time.sleep(sleep_time_between_ops)
+
+        # reconfigure keyspaces
+        # TODO
+
+        # add nodes to each dc
+        InfoEvent(message='Grow both DCs').publish()
+        add_nodes_number = self.tester.params.get('nemesis_add_node_cnt')
+        self._grow_cluster()
+        for _ in range(add_nodes_number):
+            nodes_on_new_dc += [self._add_new_node_in_new_dc()]
+        time.sleep(sleep_time_between_ops)
+
+        # remove the new dc
+        InfoEvent(message='Remove DC').publish()
+        for node in nodes_on_new_dc:
+            self.cluster.decommission(node)
+
     # NOTE: version limitation is caused by the following:
     #       - https://github.com/scylladb/scylla-enterprise/issues/3211
     #       - https://github.com/scylladb/scylladb/issues/14184
@@ -4697,6 +4756,7 @@ def _verify_cdc_feature_status(self, keyspace: str, table: str, cdc_settings: di
             assert actual_cdc_settings == cdc_settings, \
                 f"CDC extension settings are differs. Current: {actual_cdc_settings} expected: {cdc_settings}"
 
+    @latency_calculator_decorator(legend="Adding new nodes in new DC")
     def _add_new_node_in_new_dc(self, is_zero_node=False) -> BaseNode:
         if is_zero_node:
             new_node = skip_on_capacity_issues(self.cluster.add_nodes)(
@@ -5549,6 +5609,16 @@ def disrupt(self):
         time.sleep(300)
 
 
+class SteadyMonkey(Nemesis):  # nemesis whose disrupt() only performs the one-time steady-state latency run
+    kubernetes = True
+
+    def disrupt(self):
+        sleep_time_between_ops = self.cluster.params.get('nemesis_sequence_sleep_between_ops')
+        if not self.has_steady_run and sleep_time_between_ops:  # no-op once has_steady_run is set, or when no pause is configured
+            self.steady_state_latency(sleep_time=sleep_time_between_ops)  # the configured pause is forwarded as sleep_time
+            self.has_steady_run = True
+
+
 class AddRemoveDcNemesis(Nemesis):
 
     disruptive = True
@@ -5561,6 +5631,15 @@ def disrupt(self):
         self.disrupt_add_remove_dc()
 
 
+class GrowClusterNemesis(Nemesis):  # exposes disrupt_grow_cluster as a selectable nemesis class
+    disruptive = True
+    kubernetes = True
+    topology_changes = True
+
+    def disrupt(self):
+        self.disrupt_grow_cluster()
+
+
 class GrowShrinkClusterNemesis(Nemesis):
     disruptive = True
     kubernetes = True
@@ -5570,6 +5649,15 @@ def disrupt(self):
         self.disrupt_grow_shrink_cluster()
 
 
+class GrowShrinkDatacenterNemesis(Nemesis):
+    disruptive = True
+    kubernetes = True
+    topology_changes = True
+
+    def disrupt(self):
+        self.disrupt_grow_shrink_datacenter()
+
+
 class AddRemoveRackNemesis(Nemesis):
     disruptive = True
     kubernetes = True
@@ -5790,6 +5878,16 @@ def disrupt(self):
         self.disrupt_nodetool_cleanup()
 
 
+class DropMonkey(Nemesis):  # exposes disrupt_drop as a selectable nemesis class
+    disruptive = False
+    kubernetes = True
+    limited = True
+    free_tier_set = True
+
+    def disrupt(self):
+        self.disrupt_drop()
+
+
 class TruncateMonkey(Nemesis):
     disruptive = False
     kubernetes = True

diff --git a/test-cases/temp_90_percent/90-percent-perf-experiment.yaml b/test-cases/temp_90_percent/90-percent-perf-experiment.yaml
@@ -0,0 +1,44 @@
+test_duration: 1080  # 90% utilization experiment config, referenced by 90_percent_experiment.jenkinsfile
+prepare_write_cmd: [  # four write commands, each covering a disjoint quarter of keys 1..1250000000
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..312500000",
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=312500001..625000000",
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=625000001..937500000",
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=937500001..1250000000",
+  ]
+
+stress_cmd_m: "cassandra-stress mixed no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=300 fixed=16875/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..650000000,325000000,6500000)' "
+
+
+n_db_nodes: 3
+nemesis_add_node_cnt: 2
+n_loaders: 4
+n_monitor_nodes: 1
+nemesis_grow_shrink_instance_type: 'i4i.2xlarge'
+
+instance_type_loader: 'c6i.2xlarge'
+instance_type_monitor: 't3.large'
+instance_type_db: 'i4i.2xlarge'
+
+nemesis_class_name: 'DropMonkey'  # nemesis class added to sdcm/nemesis.py in the same change
+nemesis_interval: 30
+nemesis_sequence_sleep_between_ops: 10  # NOTE(review): DropMonkey.disrupt -> disrupt_drop does not read this value - confirm it is needed here
+
+user_prefix: 'elasticity-test'
+space_node_threshold: 644245094
+ami_id_db_scylla_desc: 'VERSION_DESC'
+
+round_robin: true
+append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1'
+backtrace_decoding: false
+print_kernel_callstack: true
+
+store_perf_results: true
+# email_recipients: ["[email protected]"]
+# use_prepared_loaders: true
+use_hdr_cs_histogram: true
+email_subject_postfix: 'elasticity test'
+nemesis_double_load_during_grow_shrink_duration: 0
+parallel_node_operations: false
+
+stress_image:
+  cassandra-stress: 'scylladb/cassandra-stress:3.17.0'
diff --git a/test-cases/temp_90_percent/90-percent-perf-i4i-2xlarge-drop.yaml b/test-cases/temp_90_percent/90-percent-perf-i4i-2xlarge-drop.yaml
@@ -0,0 +1,47 @@
+test_duration: 1080  # 90% utilization drop-table config, referenced by 90_percent_drop.jenkinsfile
+prepare_write_cmd: [  # four write commands, each covering a disjoint quarter of keys 1..1250000000
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=1..312500000",
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=312500001..625000000",
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=625000001..937500000",
+  "cassandra-stress write no-warmup cl=ALL n=312500000 -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate threads=200 -col 'size=FIXED(128) n=FIXED(8)' -pop seq=937500001..1250000000",
+  ]
+
+stress_cmd_m: "cassandra-stress mixed no-warmup cl=QUORUM duration=800m -schema 'replication(strategy=NetworkTopologyStrategy,replication_factor=3)' -mode cql3 native -rate 'threads=300 fixed=16875/s' -col 'size=FIXED(128) n=FIXED(8)' -pop 'dist=gauss(1..650000000,325000000,6500000)' "
+
+
+
+n_db_nodes: 3
+n_loaders: 4
+n_monitor_nodes: 1
+
+instance_type_loader: 'c6i.2xlarge'
+instance_type_monitor: 't3.large'
+instance_type_db: 'i4i.2xlarge'
+
+nemesis_class_name: 'SteadyMonkey: DropMonkey:1'  # NOTE(review): first entry has no count after ':' unlike 'DropMonkey:1' - confirm the parser accepts it
+nemesis_interval: 30
+nemesis_sequence_sleep_between_ops: 60  # forwarded by SteadyMonkey.disrupt as steady_state_latency(sleep_time=...)
+
+user_prefix: 'elasticity-test'
+space_node_threshold: 644245094
+ami_id_db_scylla_desc: 'VERSION_DESC'
+
+round_robin: true
+append_scylla_args: '--blocked-reactor-notify-ms 5 --abort-on-lsa-bad-alloc 1 --abort-on-seastar-bad-alloc --abort-on-internal-error 1 --abort-on-ebadf 1'
+backtrace_decoding: false
+print_kernel_callstack: true
+
+store_perf_results: true
+# email_recipients: ["[email protected]"]
+# use_prepared_loaders: true
+use_hdr_cs_histogram: true
+email_subject_postfix: 'elasticity test'
+nemesis_double_load_during_grow_shrink_duration: 0
+parallel_node_operations: false
+
+append_scylla_yaml:
+  enable_tablets: true
+  auto_snapshot: false  # presumably so DROP TABLE leaves no snapshot on disk - confirm
+
+stress_image:
+  cassandra-stress: 'scylladb/cassandra-stress:3.17.0'