-
Notifications
You must be signed in to change notification settings - Fork 96
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
full_storage_utilization_test: Storage utilization at 90% cluster size
Populate data until it reaches over 90% disk storage and perform db and cluster operations. Signed-off-by: Lakshmipathi <[email protected]>
- Loading branch information
Lakshmipathi
committed
Oct 23, 2024
1 parent
9018c7f
commit 4a24b29
Showing
3 changed files
with
180 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
import time | ||
from sdcm.tester import ClusterTester | ||
from sdcm.cluster import MAX_TIME_WAIT_FOR_NEW_NODE_UP | ||
|
||
class FullStorageUtilizationTest(ClusterTester):
    """
    Fill the cluster's disks to a high utilization target (85%, then 93%)
    and verify the cluster keeps operating under near-full storage,
    including bootstrapping a new node while a write load is running.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 30-minute throttled background write load (from the test config);
        # used to keep traffic flowing while the new node bootstraps.
        self.stress_cmd_30mins = self.params.get('stress_cmd')

    def prepare_dataset_layout(self, dataset_size, row_size=10240):
        """
        Build a cassandra-stress write command that inserts roughly
        ``dataset_size`` GB of data using rows of ``row_size`` bytes.

        :param dataset_size: amount of data to write, in GB
        :param row_size: size of a single row, in bytes
        :return: cassandra-stress command line (str)
        """
        # Number of rows needed to reach dataset_size GB.
        n = dataset_size * 1024 * 1024 * 1024 // row_size
        # Draw partition keys uniformly from a range 100x larger than the row
        # count so successive rounds mostly hit fresh, non-overlapping keys.
        seq_end = n * 100

        return f'cassandra-stress write cl=ONE n={n} -mode cql3 native -rate threads=10 -pop dist="uniform(1..{seq_end})" ' \
            f'-col "size=FIXED({row_size}) n=FIXED(1)" -schema "replication(strategy=NetworkTopologyStrategy,replication_factor=3)"'

    def setUp(self):
        super().setUp()
        # Remember when the test started (useful when correlating with monitoring).
        self.start_time = time.time()

    def test_storage_utilization(self):
        """
        3 nodes cluster, RF=3.
        Write data until 85% disk usage is reached, then push on to 93%.
        Start a 30-minute background write load, add a 4th node while the
        cluster is near-full, then wait and log disk usage again.
        """
        self.log.info("Running df command on all nodes:")
        self.get_df_output()
        self.run_stress(85)
        self.run_stress(93)

        # Keep writes flowing in the background while the new node joins.
        self.run_stress_thread(stress_cmd=self.stress_cmd_30mins, keyspace_name="add_node")
        time.sleep(30)
        self.log.info("Adding a new node")
        self.add_new_node()

        self._settle_and_log_disk_usage()

    def run_stress(self, target_usage):
        """Write data until ``target_usage`` percent disk usage is reached, then settle."""
        target_used_size = self.calculate_target_used_size(target_usage)
        self.run_stress_until_target(target_used_size, target_usage)
        self._settle_and_log_disk_usage()

    def _settle_and_log_disk_usage(self):
        """Log disk usage, give compactions an hour to settle, then log again."""
        self.log_disk_usage()
        self.log.info("Wait for 60 minutes")
        time.sleep(3600)
        self.log_disk_usage()

    def run_stress_until_target(self, target_used_size, target_usage):
        """
        Run cassandra-stress in rounds, each into a fresh keyspace, until the
        most-utilized node reaches either ``target_used_size`` GB used or
        ``target_usage`` percent disk usage.
        """
        current_used = self.get_max_disk_used()
        current_usage = self.get_max_disk_usage()
        num = 0
        while current_used < target_used_size and current_usage < target_usage:
            num += 1

            # Write a smaller dataset when close to the threshold
            # (within 10 percentage points or 20 GB of the target).
            smaller_dataset = (((target_used_size - current_used) < 20) or
                               ((target_usage - current_usage) <= 10))

            dataset_size = 1 if smaller_dataset else 10  # GB written this round
            ks_name = "keyspace_small" if smaller_dataset else "keyspace_large"
            stress_cmd = self.prepare_dataset_layout(dataset_size)
            stress_queue = self.run_stress_thread(stress_cmd=stress_cmd, keyspace_name=f"{ks_name}{num}",
                                                  stress_num=1, keyspace_num=num)

            self.verify_stress_thread(cs_thread_pool=stress_queue)
            self.get_stress_results(queue=stress_queue)

            # Flush memtables so df reflects the data actually on disk.
            self.flush_all_nodes()

            current_used = self.get_max_disk_used()
            current_usage = self.get_max_disk_usage()
            self.log.info(f"Current max disk usage after writing to keyspace{num}: "
                          f"{current_usage}% ({current_used} GB / {target_used_size} GB)")

    def add_new_node(self):
        """Bootstrap one extra node, wait for it to come up, and log df before/after."""
        self.get_df_output()
        new_nodes = self.db_cluster.add_nodes(count=1, enable_auto_bootstrap=True)
        self.monitors.reconfigure_scylla_monitoring()
        new_node = new_nodes[0]
        self.db_cluster.wait_for_init(node_list=[new_node], timeout=MAX_TIME_WAIT_FOR_NEW_NODE_UP,
                                      check_node_health=False)
        self.db_cluster.wait_for_nodes_up_and_normal(nodes=[new_node])
        total_nodes_in_cluster = len(self.db_cluster.nodes)
        self.log.info(f"New node added, total nodes in cluster: {total_nodes_in_cluster}")
        self.get_df_output()

    def get_df_output(self):
        """Log human-readable `df` output for the Scylla data mount on every node."""
        for node in self.db_cluster.nodes:
            result = node.remoter.run('df -h /var/lib/scylla')
            self.log.info(f"DF output for node {node.name}:\n{result.stdout}")

    def flush_all_nodes(self):
        """Run `nodetool flush` on every node so on-disk usage is up to date."""
        for node in self.db_cluster.nodes:
            self.log.info(f"Flushing data on node {node.name}")
            node.run_nodetool("flush")

    def get_max_disk_usage(self):
        """Return the highest disk usage percentage of /var/lib/scylla across nodes."""
        max_usage = 0
        for node in self.db_cluster.nodes:
            result = node.remoter.run("df -h --output=pcent /var/lib/scylla | sed 1d | sed 's/%//'")
            usage = int(result.stdout.strip())
            max_usage = max(max_usage, usage)
        return max_usage

    def get_max_disk_used(self):
        """Return the highest used disk space (whole GB) of /var/lib/scylla across nodes."""
        max_used = 0
        for node in self.db_cluster.nodes:
            # -BG reports whole gigabytes (e.g. "123G"); `df -h` human units such
            # as "3.4G", "500M" or "1.1T" would break the int() parsing below.
            result = node.remoter.run("df -BG --output=used /var/lib/scylla | sed 1d | sed 's/G//'")
            used = int(result.stdout.strip())
            max_used = max(max_used, used)
        return max_used

    def get_disk_info(self, node):
        """
        Return disk stats for ``node``'s /var/lib/scylla mount.

        :return: dict with ``total``, ``used``, ``available`` (whole GB)
                 and ``used_percent`` (int).
        """
        # -BG keeps every numeric column in whole GB so int() parsing is safe
        # (with -h, df may emit "1.1T" / "500M" and int() would raise).
        result = node.remoter.run("df -BG --output=size,used,avail,pcent /var/lib/scylla | sed 1d")
        size, used, avail, pcent = result.stdout.strip().split()
        return {
            'total': int(size.rstrip('G')),
            'used': int(used.rstrip('G')),
            'available': int(avail.rstrip('G')),
            'used_percent': int(pcent.rstrip('%'))
        }

    def calculate_target_used_size(self, target_percent):
        """
        Translate ``target_percent`` disk usage into an absolute used-space
        target (GB), based on the largest disk in the cluster, and log the gap.
        """
        max_total = 0
        for node in self.db_cluster.nodes:
            info = self.get_disk_info(node)
            max_total = max(max_total, info['total'])

        target_used_size = (target_percent / 100) * max_total
        current_used = self.get_max_disk_used()
        additional_usage_needed = target_used_size - current_used

        self.log.info(f"Current max disk usage: {self.get_max_disk_usage():.2f}%")
        self.log.info(f"Current max used space: {current_used:.2f} GB")
        self.log.info(f"Max total disk space: {max_total:.2f} GB")
        self.log.info(f"Target used space to reach {target_percent}%: {target_used_size:.2f} GB")
        self.log.info(f"Additional space to be used: {additional_usage_needed:.2f} GB")

        return target_used_size

    def log_disk_usage(self):
        """Log per-node disk totals/used/available/percent for the Scylla mount."""
        for node in self.db_cluster.nodes:
            info = self.get_disk_info(node)
            self.log.info(f"Disk usage for node {node.name}:")
            self.log.info(f"  Total: {info['total']} GB")
            self.log.info(f"  Used: {info['used']} GB")
            self.log.info(f"  Available: {info['available']} GB")
            self.log.info(f"  Used %: {info['used_percent']}%")
12 changes: 12 additions & 0 deletions
12
jenkins-pipelines/oss/full-storage-utilization.jenkinsfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!groovy

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

// Run the full-storage-utilization scale test through the shared
// longevity pipeline: AWS backend, eu-west-1, 5-hour overall timeout.
longevityPipeline(
    backend: 'aws',
    region: 'eu-west-1',
    test_name: 'full_storage_utilization_test.FullStorageUtilizationTest.test_storage_utilization',
    test_config: 'test-cases/scale/full-storage-utilization.yaml',
    timeout: [time: 300, unit: 'MINUTES']
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Config for the full-storage-utilization scale test (minutes / node counts).
test_duration: 300
n_db_nodes: 3
n_loaders: 1
n_monitor_nodes: 1
user_prefix: 'df-test'
instance_type_db: 'i4i.large'
instance_provision: 'spot'

# Tear everything down when the run finishes, regardless of outcome.
post_behavior_db_nodes: "destroy"
post_behavior_loader_nodes: "destroy"
post_behavior_monitor_nodes: "destroy"
enterprise_disable_kms: true

# 30-minute throttled background write load used while the 4th node is added
# (consumed by the test as `stress_cmd`).
stress_cmd: 'cassandra-stress write duration=30m -rate threads=10 "throttle=1400/s" -mode cql3 native -pop seq=1..5000000 -col "size=FIXED(10240) n=FIXED(1)" -schema "replication(strategy=NetworkTopologyStrategy,replication_factor=3)"'
append_scylla_yaml:
  enable_tablets: true