From e615963fe219efcbcc9a1e4d3d6569237f2a7909 Mon Sep 17 00:00:00 2001 From: asbishop Date: Tue, 27 Sep 2016 07:57:25 -0400 Subject: [PATCH] Allow OSD nodes to quiesce after deleting a pool When deleting a pool, it may take a while for the OSD nodes to delete the objects in the pool. This change makes CBT wait until the OSD nodes quiesce in order to ensure they are idle before starting the next test run. Quiescing is done by waiting until the maximum disk utilization for any disk falls below a threshold, and waiting until the maximum CPU utilization for any ceph-osd process falls below a threshold. The thresholds can be tuned using the following cluster configuration parameters (the default values are listed): cluster: quiesce_disk_util_max: 3 quiesce_disk_window_size: 30 quiesce_osd_cpu_max: 3 If quiesce_disk_util_max or quiesce_osd_cpu_max is zero then the corresponding disk/CPU quiescing operation is skipped. Closes #117 (cherry picked from commit 3d442c7ce4a9ac1a805dce893a2024471d06782c) --- cluster/ceph.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cluster/ceph.py b/cluster/ceph.py index 01153731..c9cece26 100644 --- a/cluster/ceph.py +++ b/cluster/ceph.py @@ -562,6 +562,19 @@ def rmpool(self, name, profile_name): common.pdsh(settings.getnodes('head'), 'sudo %s -c %s osd pool delete %s %s --yes-i-really-really-mean-it' % (self.ceph_cmd, self.tmp_conf, name, name), continue_if_error=False).communicate() + disk_util_max = settings.cluster.get('quiesce_disk_util_max', 3) + if disk_util_max > 0 and disk_util_max < 100: + logger.info('Waiting for OSD disk utilization to quiesce...') + window_size = settings.cluster.get('quiesce_disk_window_size', 30) + wait_cmd = 'while [ $(iostat -dxyz ALL %s 1 | awk \'BEGIN {m=0} {v=int($NF); if(v>m){m=v}} END {print m}\') -gt %s ]; do true; done' % (window_size, disk_util_max) + common.pdsh(settings.getnodes('osds'), wait_cmd).communicate() + + osd_cpu_max = settings.cluster.get('quiesce_osd_cpu_max', 3) + 
if osd_cpu_max > 0 and osd_cpu_max < 100: + logger.info('Waiting for OSD CPU utilization to quiesce...') + wait_cmd = 'while [ $(top -bn1 | awk \'$NF == "ceph-osd" {print int($9) ; exit}\') -gt %s ]; do sleep 5; done' % (osd_cpu_max) + common.pdsh(settings.getnodes('osds'), wait_cmd).communicate() + def rbd_unmount(self): common.pdsh(settings.getnodes('clients'), 'sudo find /dev/rbd* -maxdepth 0 -type b -exec umount \'{}\' \;').communicate() # common.pdsh(settings.getnodes('clients'), 'sudo find /dev/rbd* -maxdepth 0 -type b -exec rbd -c %s unmap \'{}\' \;' % self.tmp_conf).communicate()