Skip to content

Commit

Permalink
full_storage_utilization_test: Storage utilization at 90% cluster size
Browse files Browse the repository at this point in the history
Populate data until disk usage exceeds 90%, then perform
database and cluster operations.

Signed-off-by: Lakshmipathi <[email protected]>
  • Loading branch information
Lakshmipathi committed Nov 3, 2024
1 parent 9018c7f commit 0bb3e8e
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 0 deletions.
177 changes: 177 additions & 0 deletions full_storage_utilization_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import time
from sdcm.tester import ClusterTester


class FullStorageUtilizationTest(ClusterTester):
    """Fill the DB nodes' data disks to a target utilization, then run a topology action.

    Data is written in stages (75%, then 90% of the largest data disk). Afterwards
    the action selected by the ``action_type`` parameter is performed while a
    throttled background write (``stress_cmd``) is running.
    """

    # Mount point whose usage is sampled with ``df`` on every DB node.
    _DATA_DIR = '/var/lib/scylla'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stress_cmd_30mins = self.params.get('stress_cmd')
        self.add_node_cnt = self.params.get('add_node_cnt')
        self.action_type = self.params.get('action_type')

    def prepare_dataset_layout(self, dataset_size, row_size=10240):
        """Build a cassandra-stress write command that writes about ``dataset_size`` GB.

        :param dataset_size: amount of data to write, in GB (GiB arithmetic is used).
        :param row_size: size in bytes of the single fixed-size column of each row.
        :return: the cassandra-stress command line as a string.
        """
        n = dataset_size * 1024 * 1024 * 1024 // row_size
        # Draw keys from a range 100x larger than the row count.
        seq_end = n * 100

        return (
            f'cassandra-stress write cl=ONE n={n} -mode cql3 native -rate threads=10 '
            f'-pop dist="uniform(1..{seq_end})" '
            f'-col "size=FIXED({row_size}) n=FIXED(1)" '
            f'-schema "replication(strategy=NetworkTopologyStrategy,replication_factor=3)"'
        )

    def setUp(self):
        super().setUp()
        self.start_time = time.time()

    def start_slow_write(self):
        """Start the configured background stress command and give it time to ramp up."""
        self.run_stress_thread(stress_cmd=self.stress_cmd_30mins, keyspace_name=f"{self.action_type}")
        self.log.info("Wait for 2 mins for the stress command to start")
        time.sleep(120)

    def scale_out(self):
        """Add node(s) while the background write is running, then wait 30 minutes."""
        self.start_slow_write()
        self.log.info("Adding a new node")
        self.add_new_node()
        time.sleep(1800)

    def scale_in(self):
        """Decommission a node while the background write is running, then wait 30 minutes."""
        self.start_slow_write()
        self.log.info("Removing a node")
        self.remove_node()
        time.sleep(1800)

    def perform_action(self):
        """Run the topology action selected by ``action_type``, logging disk usage around it."""
        self.log_disk_usage()
        # Trigger specific action
        if self.action_type == "scale_out":
            self.scale_out()
        elif self.action_type == "scale_in":
            # The cluster is grown before it is shrunk.
            # NOTE(review): presumably so a node can be decommissioned while the
            # replication factor of 3 stays satisfiable - confirm.
            self.scale_out()
            self.scale_in()
        else:
            self.log.warning(f"Unsupported action_type {self.action_type!r}, no action performed")
        self.log_disk_usage()

    def test_storage_utilization(self):
        """
        3 nodes cluster, RF=3.
        Write data until 75% and then 90% disk usage is reached,
        waiting 30 minutes after each stage.
        Perform specific action.
        """
        self.run_stress(75, sleep_time=1800)
        self.run_stress(90, sleep_time=1800)
        self.perform_action()

    def run_stress(self, target_usage, sleep_time=600):
        """Write data until ``target_usage`` percent is reached, then idle for ``sleep_time`` seconds."""
        target_used_size = self.calculate_target_used_size(target_usage)
        self.run_stress_until_target(target_used_size, target_usage)

        self.log_disk_usage()
        self.log.info(f"Wait for {sleep_time} seconds")
        time.sleep(sleep_time)
        self.log_disk_usage()

    def run_stress_until_target(self, target_used_size, target_usage):
        """Write datasets in chunks until either the size or the percentage target is met.

        :param target_used_size: used space to reach on the fullest node, in GB.
        :param target_usage: used percentage to reach on the fullest node.
        """
        current_used = self.get_max_disk_used()
        current_usage = self.get_max_disk_usage()
        num = 0

        # The large chunk size is fixed up front at 10% of the initial gap;
        # it is not recomputed as the gap shrinks.
        space_needed = target_used_size - current_used
        chunk_size = int(space_needed * 0.1)
        while current_used < target_used_size and current_usage < target_usage:
            num += 1

            # Write smaller dataset near the threshold (15% or 30GB of the target)
            smaller_dataset = (target_used_size - current_used) < 30 or (target_usage - current_usage) <= 15

            # Use 1GB chunks near threshold, otherwise the precomputed large chunk
            dataset_size = 1 if smaller_dataset else chunk_size
            ks_name = "keyspace_small" if smaller_dataset else "keyspace_large"
            self.log.info(f"Writing chunk of size: {dataset_size} GB")
            stress_cmd = self.prepare_dataset_layout(dataset_size)
            stress_queue = self.run_stress_thread(
                stress_cmd=stress_cmd,
                keyspace_name=f"{ks_name}{num}",
                stress_num=1,
                keyspace_num=num,
            )

            self.verify_stress_thread(cs_thread_pool=stress_queue)
            self.get_stress_results(queue=stress_queue)

            self.flush_all_nodes()

            current_used = self.get_max_disk_used()
            current_usage = self.get_max_disk_usage()
            self.log.info(f"Current max disk usage after writing to keyspace{num}: {current_usage}% ({current_used} GB / {target_used_size} GB)")

    def add_new_node(self):
        """Add ``add_node_cnt`` bootstrapping node(s) and wait until they are up and normal."""
        new_nodes = self.db_cluster.add_nodes(count=self.add_node_cnt, enable_auto_bootstrap=True)
        self.db_cluster.wait_for_init(node_list=new_nodes)
        self.db_cluster.wait_for_nodes_up_and_normal(nodes=new_nodes)
        total_nodes_in_cluster = len(self.db_cluster.nodes)
        self.log.info(f"New node added, total nodes in cluster: {total_nodes_in_cluster}")
        self.monitors.reconfigure_scylla_monitoring()

    def remove_node(self):
        """Decommission the first node and wait for the rest of the cluster to be healthy."""
        self.log.info('Removing a first node from the cluster')
        node_to_remove = self.db_cluster.nodes[0]
        self.log.info(f"Node to be removed: {node_to_remove.name}")
        self.db_cluster.decommission(node_to_remove)
        # A decommissioned node has left the cluster and cannot become up/normal
        # again, so the health check must target the nodes that remain.
        remaining_nodes = [node for node in self.db_cluster.nodes if node is not node_to_remove]
        self.db_cluster.wait_for_nodes_up_and_normal(nodes=remaining_nodes)
        self.log.info(f"Node {node_to_remove.name} has been removed from the cluster")
        self.monitors.reconfigure_scylla_monitoring()

    def flush_all_nodes(self):
        """Run ``nodetool flush`` on every DB node."""
        for node in self.db_cluster.nodes:
            self.log.info(f"Flushing data on node {node.name}")
            node.run_nodetool("flush")

    def get_max_disk_usage(self):
        """Return the highest used percentage of the data directory across all nodes (0 if none)."""
        return max((self.get_disk_info(node)['used_percent'] for node in self.db_cluster.nodes), default=0)

    def get_max_disk_used(self):
        """Return the largest used space, in whole GB, across all nodes (0 if none)."""
        return max((self.get_disk_info(node)['used'] for node in self.db_cluster.nodes), default=0)

    @staticmethod
    def _parse_gb(field):
        """Convert one ``df -BG`` size field such as ``'437G'`` to an integer number of GB."""
        return int(float(field.strip().rstrip('G')))

    def get_disk_info(self, node):
        """Sample ``df`` for the data directory on ``node``.

        ``-BG`` forces every size to be reported in GiB. With ``-h`` the unit is
        auto-scaled (``500M``, ``1.5G``, ``1.2T``), which cannot be parsed reliably.

        :return: dict with ``total``, ``used``, ``available`` (GB) and ``used_percent``.
        """
        result = node.remoter.run(f"df -BG --output=size,used,avail,pcent {self._DATA_DIR} | sed 1d")
        size, used, avail, pcent = result.stdout.strip().split()
        return {
            'total': self._parse_gb(size),
            'used': self._parse_gb(used),
            'available': self._parse_gb(avail),
            'used_percent': int(pcent.rstrip('%')),
        }

    def calculate_target_used_size(self, target_percent):
        """Translate ``target_percent`` into GB of used space on the largest data disk.

        :param target_percent: desired utilization, in percent of the total disk size.
        :return: the used-space target in GB (float).
        """
        max_total = max((self.get_disk_info(node)['total'] for node in self.db_cluster.nodes), default=0)

        target_used_size = (target_percent / 100) * max_total
        current_used = self.get_max_disk_used()
        additional_usage_needed = target_used_size - current_used

        self.log.info(f"Current max disk usage: {self.get_max_disk_usage():.2f}%")
        self.log.info(f"Current max used space: {current_used:.2f} GB")
        self.log.info(f"Max total disk space: {max_total:.2f} GB")
        self.log.info(f"Target used space to reach {target_percent}%: {target_used_size:.2f} GB")
        self.log.info(f"Additional space to be used: {additional_usage_needed:.2f} GB")

        return target_used_size

    def log_disk_usage(self):
        """Log total/used/available space and used percentage for every DB node."""
        for node in self.db_cluster.nodes:
            info = self.get_disk_info(node)
            self.log.info(f"Disk usage for node {node.name}:")
            self.log.info(f" Total: {info['total']} GB")
            self.log.info(f" Used: {info['used']} GB")
            self.log.info(f" Available: {info['available']} GB")
            self.log.info(f" Used %: {info['used_percent']}%")


12 changes: 12 additions & 0 deletions jenkins-pipelines/oss/full-storage-utilization.jenkinsfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!groovy

// Jenkins job definition for the full storage utilization test.

// trick from https://github.com/jenkinsci/workflow-cps-global-lib-plugin/pull/43
// Loads the 'sct@snapshot' shared library using the job's own SCM configuration.
def lib = library identifier: 'sct@snapshot', retriever: legacySCM(scm)

// longevityPipeline is not defined in this file; presumably it is provided by
// the shared library loaded above - confirm.
longevityPipeline(
backend: 'aws',
region: 'eu-west-1',
// Test entry point in module.Class.method form.
test_name: 'full_storage_utilization_test.FullStorageUtilizationTest.test_storage_utilization',
// YAML test-case configuration consumed by the test.
test_config: 'test-cases/scale/full-storage-utilization.yaml',
// NOTE(review): the test sleeps 2 x 1800 s between fill stages and another
// 1800 s (plus a 120 s ramp-up) per topology action, in addition to the time
// spent writing data - confirm 300 minutes leaves enough headroom.
timeout: [time: 300, unit: 'MINUTES']
)
3 changes: 3 additions & 0 deletions sdcm/sct_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,6 +1231,9 @@ class SCTConfiguration(dict):
dict(name="perf_gradual_throttle_steps", env="SCT_PERF_GRADUAL_THROTTLE_STEPS", type=dict,
help="Used for gradual performance test. Define throttle for load step in ops. Example: {'read': ['100000', '150000'], 'mixed': ['300']}"),

# StorageUtilizationTest
dict(name="action_type", env="SCT_ACTION_TYPE", type=str,
help="Perform specific tasks (scale_in,scale_out etc)"),
# RefreshTest
dict(name="skip_download", env="SCT_SKIP_DOWNLOAD", type=boolean,
help=""),
Expand Down
18 changes: 18 additions & 0 deletions test-cases/scale/full-storage-utilization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Test-case configuration for FullStorageUtilizationTest.test_storage_utilization.
# NOTE(review): unit of test_duration is not visible here; the Jenkins job
# timeout for this test is 300 minutes - confirm the two values are consistent.
test_duration: 3600
# The test docstring describes a 3-node cluster with RF=3.
n_db_nodes: 3
n_loaders: 1
n_monitor_nodes: 1
user_prefix: 'df-test'
instance_type_db: 'i4i.large'
instance_provision: 'spot'

post_behavior_db_nodes: "destroy"
post_behavior_loader_nodes: "destroy"
post_behavior_monitor_nodes: "destroy"
enterprise_disable_kms: true
# Number of nodes added by the test's add_new_node() step.
add_node_cnt: 1
# Action run after the disks are filled; the test handles "scale_out" and "scale_in".
action_type: "scale_in"

# Throttled background write started by the test before each topology action.
stress_cmd: 'cassandra-stress write duration=30m -rate threads=10 "throttle=1400/s" -mode cql3 native -pop seq=1..5000000 -col "size=FIXED(10240) n=FIXED(1)" -schema "replication(strategy=NetworkTopologyStrategy,replication_factor=3)"'
# NOTE(review): enable_tablets is presumably nested under append_scylla_yaml;
# the indentation appears to have been lost in this view - confirm.
append_scylla_yaml:
enable_tablets: true

0 comments on commit 0bb3e8e

Please sign in to comment.