From f24debec18d175145b728b950d3e0b0ab59ca4e5 Mon Sep 17 00:00:00 2001 From: yarongilor Date: Mon, 28 Oct 2024 19:01:14 +0200 Subject: [PATCH] fix(nodetool rebuild): use repair instead of rebuild if no tablets support If nodetool rebuild is not supported with tablets, the test should use repair as an alternative action. It should then disable load-balancing and repair all nodes in this datacenter. refs: https://github.com/scylladb/scylladb/issues/17575 refs: https://github.com/scylladb/scylladb/issues/20084#issuecomment-2323211550 --- sdcm/nemesis.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py index 4ef6a5f1c6..413c4307d8 100644 --- a/sdcm/nemesis.py +++ b/sdcm/nemesis.py @@ -3976,8 +3976,15 @@ def start_and_interrupt_repair_streaming(self): self.target_node.wait_node_fully_start() - with adaptive_timeout(Operations.REBUILD, self.target_node, timeout=HOUR_IN_SEC * 48): - self.target_node.run_nodetool("rebuild", long_running=True, retry=0) + is_rebuild_supported = SkipPerIssues('scylladb/scylladb#17575', params=self.tester.params) + # If tablets in use and rebuild is not supported, running a DC repair instead. 
+ with self.cluster.cql_connection_patient(self.target_node) as session: + if is_tablets_feature_enabled(session=session) and not is_rebuild_supported: + for node in [n for n in self.cluster.nodes if n.dc_idx == self.target_node.dc_idx]: + node.run_nodetool(sub_cmd="repair") + else: + with adaptive_timeout(Operations.REBUILD, self.target_node, timeout=HOUR_IN_SEC * 48): + self.target_node.run_nodetool("rebuild", long_running=True, retry=0) def start_and_interrupt_rebuild_streaming(self): """ @@ -4007,8 +4014,15 @@ def start_and_interrupt_rebuild_streaming(self): ) ParallelObject(objects=[trigger, watcher], timeout=timeout + 60).call_objects() self.target_node.wait_node_fully_start(timeout=300) - with adaptive_timeout(Operations.REBUILD, self.target_node, timeout=HOUR_IN_SEC * 48): - self.target_node.run_nodetool("rebuild", long_running=True, retry=0) + is_rebuild_supported = SkipPerIssues('scylladb/scylladb#17575', params=self.tester.params) + # If tablets in use and rebuild is not supported, running a DC repair instead. + with self.cluster.cql_connection_patient(self.target_node) as session: + if is_tablets_feature_enabled(session=session) and not is_rebuild_supported: + for node in [n for n in self.cluster.nodes if n.dc_idx == self.target_node.dc_idx]: + node.run_nodetool(sub_cmd="repair") + else: + with adaptive_timeout(Operations.REBUILD, self.target_node, timeout=HOUR_IN_SEC * 48): + self.target_node.run_nodetool("rebuild", long_running=True, retry=0) def disrupt_decommission_streaming_err(self): """