def disable_file_logging(self) -> bool:
    """
    Turn off cluster logging to files (both OSD/global and mon cluster logs).

    Counterpart of enable_file_logging(); issues the two `ceph config set`
    commands that flip file logging back off.

    Returns:
        True -> both configs were set successfully, False -> any shell call failed
    """
    try:
        # Apply both global toggles; any failure aborts and reports False.
        for cfg_cmd in (
            "ceph config set global log_to_file false",
            "ceph config set global mon_cluster_log_to_file false",
        ):
            self.node.shell([cfg_cmd])
    except Exception:
        log.error("Error while disabling config to log into file")
        return False
    return True
100644 --- a/suites/reef/rados/tier-2_rados_test_omap.yaml +++ b/suites/reef/rados/tier-2_rados_test_omap.yaml @@ -149,19 +149,14 @@ tests: obj_end: 50 num_keys_obj: 100 delete_pool: true - - test: name: Preempt scrub messages checks desc: Checking preempt messages in the OSDs module: test_rados_preempt_scrub.py polarion-id: CEPH-83572916 config: - verify_osd_preempt: - configurations: - pool-1: - pool_name: preempt_pool - pool_type: replicated - pg_num: 16 + pool_name: preempt_pool + pg_num: 1 delete_pool: true - test: diff --git a/suites/squid/rados/tier-2_rados_test_omap.yaml b/suites/squid/rados/tier-2_rados_test_omap.yaml index 8c1294202a9..0b096eb1037 100644 --- a/suites/squid/rados/tier-2_rados_test_omap.yaml +++ b/suites/squid/rados/tier-2_rados_test_omap.yaml @@ -157,12 +157,8 @@ tests: module: test_rados_preempt_scrub.py polarion-id: CEPH-83572916 config: - verify_osd_preempt: - configurations: - pool-1: - pool_name: preempt_pool - pool_type: replicated - pg_num: 16 + pool_name: preempt_pool + pg_num: 1 delete_pool: true - test: diff --git a/tests/rados/test_rados_preempt_scrub.py b/tests/rados/test_rados_preempt_scrub.py index 03f9ab3e577..6f45bfde1fc 100644 --- a/tests/rados/test_rados_preempt_scrub.py +++ b/tests/rados/test_rados_preempt_scrub.py @@ -2,13 +2,14 @@ This file contains the methods to verify the preempt messages in the OSD logs. 
""" +import re +import subprocess import time import traceback from ceph.ceph_admin import CephAdmin from ceph.rados.core_workflows import RadosOrchestrator from tests.rados.monitor_configurations import MonConfigMethods -from tests.rados.stretch_cluster import wait_for_clean_pg_sets from utility.log import Log from utility.utils import method_should_succeed @@ -18,13 +19,21 @@ def run(ceph_cluster, **kw): """ CEPH-83572916 - Verify that the preempted messages are generated at OSD logs during scrubbing - 1.Create pool and write data in to the pool - 2.set the osd_shallow_scrub_chunk_max and osd_scrub_chunk_max values to 250 - 3.Start writing the data into pool and start scrub parallely - 4.Once the scrub started check the OSD logs contain the- - - head preempted - - WaitReplicas::react(const GotReplicas&) PREEMPTED - 5. Once the tests are complete delete the pool and remove the set values of the parameter + 1.Create pool with 1 PG with the pg_autoscaler off + 2.Create objects + 3. Set the following paramters- + 3.1 osd_scrub_sleep ->0 + 3.2 osd_deep_scrub_keys ->1 + 3.3 osd_scrub_interval_randomize_ratio ->0 + 3.4 osd_deep_scrub_randomize_ratio -> 0 + 3.5 osd_scrub_min_interval -> 10 + 3.6 osd_scrub_max_interval -> 2000 + 3.7 osd_deep_scrub_interval -> 600 + 3.8 min_size -> 2 + 3.9 debug_osd -> 20/20 + 4. Rewrite the data into the four objects + 5. Search the "preempted" string in the active OSDs + 6. 
Once the tests are complete delete the pool and remove the set values of the parameter """ log.info(run.__doc__) @@ -32,196 +41,214 @@ def run(ceph_cluster, **kw): cephadm = CephAdmin(cluster=ceph_cluster, **config) rados_object = RadosOrchestrator(node=cephadm) mon_obj = MonConfigMethods(rados_obj=rados_object) - ceph_nodes = kw.get("ceph_nodes") installer = ceph_cluster.get_nodes(role="installer")[0] - pool_target_configs = config["verify_osd_preempt"]["configurations"] + log_lines = ["preempted"] + pool_name = config["pool_name"] try: - scrub_status_flag = False - # Before starting the tests check that scrubbing is not progress in the cluster - time_end = time.time() + 60 * 15 - while time.time() < time_end: - scrub_status = check_scrub_status(rados_object) - if scrub_status is False: - scrub_status_flag = True - log.info("Scrubbing is not in progress.Continuing the test execution") - break - log.info("Waiting for the scrub to complete") - time.sleep(30) - - if scrub_status_flag is False: - log.error( - "Scrubbing is in progress cannot continue the further execution of test" - ) + # enable the file logging + if not rados_object.enable_file_logging(): + log.error("Error while setting config to enable logging into file") return 1 - for entry in pool_target_configs.values(): - log.debug( - f"Creating {entry['pool_type']} pool on the cluster with name {entry['pool_name']}" - ) - method_should_succeed( - rados_object.create_pool, - **entry, - ) - log.info(f"Created the pool {entry['pool_name']}") - rados_object.bench_write( - pool_name=entry["pool_name"], rados_write_duration=90 - ) - flag = wait_for_clean_pg_sets(rados_object) - if not flag: - log.error( - "The cluster did not reach active + Clean state after add capacity" - ) - return 1 - mon_obj.set_config( - section="osd", name="osd_shallow_scrub_chunk_max", value="250" - ) - mon_obj.set_config(section="osd", name="osd_scrub_chunk_max", value="250") - mon_obj.set_config(section="osd", name="debug_osd", 
value="10/10") - - rados_object.bench_write( - pool_name=entry["pool_name"], rados_write_duration=300, background=True - ) - - log_lines = [ - "head preempted", - "WaitReplicas::react(const GotReplicas&) PREEMPTED", - ] - - init_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'") - - rados_object.run_scrub(pool=entry["pool_name"]) - scrub_status_flag = False + init_time, _ = installer.exec_command( + cmd="sudo date '+%Y-%m-%dT%H:%M:%S.%3N+0000'" + ) + init_time = init_time.strip() + rados_object.configure_pg_autoscaler(**{"default_mode": "off"}) + method_should_succeed(rados_object.create_pool, **config) + + if not rados_object.bench_write( + pool_name=pool_name, rados_write_duration=20, max_objs=15, num_threads=1 + ): + log.error(f"Failed to write objects into {pool_name}") + return 1 + acting_set = rados_object.get_pg_acting_set(pool_name=pool_name) + log.info(f"The PG acting set is -{acting_set}") + set_preempt_parameter_value(mon_obj, rados_object, pool_name) - # Wait for the scrub to start for 15 minutes - time_end = time.time() + 60 * 45 - count = 1 - while time.time() < time_end: - scrub_status = check_scrub_status(rados_object) - if scrub_status is True: - scrub_status_flag = True - log.info("Scrubbing is in progress") - break - log.info("Waiting for the scrub to start") - time.sleep(30) - count = count + 1 - log.info(f"The count is -{count}") - total_time = count * 30 - log.info( - f"The total time to start user initiated scrub is - {total_time} seconds" + object_list = rados_object.get_object_list(pool_name) + rados_object.run_deep_scrub(pool=pool_name) + time.sleep(5) + for _ in range(30): + tmp_date = subprocess.check_output("date", text=True).strip() + for object in object_list: + if re.search(r".*object.*", object): + cmd_rewrite = ( + f"echo {tmp_date} | rados -p {pool_name} put {object} -" + ) + rados_object.client.exec_command(cmd=cmd_rewrite, sudo=True) + get_preempt_parameter_value(mon_obj, rados_object, pool_name) + # Wait 
time for starting the scheduled scrubbing + time.sleep(180) + + # Check for 30 minutes + time_end = time.time() + 60 * 30 + found_flag = False + while time.time() < time_end: + time.sleep(10) + end_time, _ = installer.exec_command( + cmd="sudo date '+%Y-%m-%dT%H:%M:%S.%3N+0000'" ) - # check for the scrub is initiated or it is timeout - if scrub_status_flag is False: - log.error("Scrub is not initiated") - return 1 - log.info("Scrub initiated") - time.sleep(30) - end_time, _ = installer.exec_command(cmd="sudo date '+%Y-%m-%d %H:%M:%S'") - osd_list = [] - for node in ceph_nodes: - if node.role == "osd": - node_osds = rados_object.collect_osd_daemon_ids(node) - osd_list = osd_list + node_osds - log.info(f"The number of OSDs in the cluster are-{len(osd_list)}") - - log_osd_count = 0 - for osd_id in osd_list: + end_time = end_time.strip() + for osd_id in acting_set: + time.sleep(5) + log.info(f"Checking the logs at: {osd_id}") if verify_preempt_log( - rados_obj=rados_object, - osd=osd_id, - start_time=init_time, + init_time=init_time, end_time=end_time, + rados_object=rados_object, + osd_id=osd_id, lines=log_lines, ): log.info(f"The preempted lines found at {osd_id}") - log_osd_count = log_osd_count + 1 - time.sleep(10) - if log_osd_count == 2: + found_flag = True + if found_flag: + log.info(f"Logs messages are noticed at - {osd_id} osd") break - mon_daemon = rados_object.run_ceph_command(cmd="ceph mon stat")["leader"] - - # checking the ceph logs - mon_log = rados_object.get_journalctl_log( - start_time=init_time, - end_time=end_time, - daemon_type="mon", - daemon_id=mon_daemon, - ) - log_mon_count = 0 - - log.info(f"The mgr log lines are - {mon_log}") - for line in log_lines: - if line in mon_log: - log.info(f"The {line} found in the logs") - log_mon_count = log_mon_count + 1 - if log_osd_count == 0 and log_mon_count == 0: - log.error("Log lines not found in any of the OSDs and in ceph") - return 1 + if found_flag: + break + if not found_flag: + log.error("The 
def verify_preempt_log(init_time, end_time, rados_object, osd_id, lines) -> bool:
    """
    Search the given OSD's log file for scrub-preemption messages.

    Greps the cephadm log file /var/log/ceph/<fsid>/ceph-osd.<osd_id>.log on
    the OSD's host, restricted (via awk on the timestamp column) to entries
    between init_time and end_time.

    Args:
        init_time: start of the time window - format ('2022-07-20T09:40:10.000+0000')
        end_time: end of the time window - same format as init_time
        rados_object: RadosOrchestrator object
        osd_id: osd id whose log file is inspected
        lines: log lines (substrings) to search for in the osd logs
    Returns:
        True -> at least one of the lines exists in the osd log window
        False -> none of the lines exist in the osd log window
    """
    log.info("Checking for the preempt messages in the OSD logs")
    fsid = rados_object.run_ceph_command(cmd="ceph fsid")["fsid"]
    host = rados_object.fetch_host_node(daemon_type="osd", daemon_id=osd_id)
    # awk compares the ISO-8601 timestamp in column 1 lexicographically,
    # which matches chronological order for this fixed-width format.
    cmd_get_log_lines = f'awk \'$1 >= "{init_time}" && $1 <= "{end_time}"\' /var/log/ceph/{fsid}/ceph-osd.{osd_id}.log'
    log_lines, _ = host.exec_command(
        sudo=True,
        cmd=cmd_get_log_lines,
    )
    for line in lines:
        if line in log_lines:
            log.info(f" Found the log lines at OSD : {osd_id}")
            return True
    log.info(f"Did not find the log lines on OSD : {osd_id}")
    return False
def set_preempt_parameter_value(mon_object, rados_object, pool_name):
    """
    Configure the scrub parameters that make preemption likely.

    Args:
        mon_object: MonConfigMethods object used to set OSD config
        rados_object: RadosOrchestrator object used to set pool properties
        pool_name: pool whose min_size is pinned to 2
    Returns:
        None
    Raises:
        Exception: if any configuration call fails
    """
    try:
        mon_object.set_config(section="osd", name="osd_scrub_sleep", value=0)
        mon_object.set_config(section="osd", name="osd_deep_scrub_keys", value=1)
        mon_object.set_config(
            section="osd", name="osd_scrub_interval_randomize_ratio", value="0.0"
        )
        mon_object.set_config(
            section="osd", name="osd_deep_scrub_randomize_ratio", value="0.0"
        )
        mon_object.set_config(section="osd", name="osd_scrub_min_interval", value=10)
        # Documented in run() step 3.6 and removed by
        # unset_preempt_parameter_value(), but was never actually set before.
        mon_object.set_config(section="osd", name="osd_scrub_max_interval", value=2000)
        mon_object.set_config(section="osd", name="osd_deep_scrub_interval", value=600)
        rados_object.set_pool_property(pool=pool_name, props="min_size", value=2)
        # debug_osd 20/20 so the "preempted" lines are emitted to the log file.
        mon_object.set_config(section="osd", name="debug_osd", value="20/20")
        # Give the OSDs a moment to pick up the new configuration.
        time.sleep(10)
    except Exception as error:
        log.error(f"Configuration of parameter is failed. Error : {error}")
        raise Exception("Error: Setting configuration failed")
    return None
mon_object.set_config(section="osd", name="debug_osd", value="20/20") + time.sleep(10) + except Exception as error: + log.error(f"Configuration of parameter is failed. Error : {error}") + raise Exception("Error: Setting configuration failed") + return None + + +def unset_preempt_parameter_value(mon_object, rados_object, pool_name): + """ + Method is used to set the default parameter values : + mon_object : Monitor object + rados_object : Rados Object + pool_name: pool name + Return -> None + """ + try: + mon_object.remove_config(section="osd", name="osd_scrub_sleep") + mon_object.remove_config(section="osd", name="osd_deep_scrub_keys") + mon_object.remove_config( + section="osd", name="osd_scrub_interval_randomize_ratio" + ) + mon_object.remove_config(section="osd", name="osd_deep_scrub_randomize_ratio") + mon_object.remove_config(section="osd", name="osd_scrub_min_interval") + mon_object.remove_config(section="osd", name="osd_scrub_max_interval") + mon_object.remove_config(section="osd", name="osd_deep_scrub_interval") + rados_object.set_pool_property(pool=pool_name, props="min_size", value=2) + mon_object.remove_config(section="osd", name="debug_osd") + except Exception as error: + log.error(f"Configuration of parameter to default is failed. 
Error : {error}") + raise Exception("Error: Unsetting configuration failed") + return None + + +def get_preempt_parameter_value(mon_object, rados_object, pool_name): + """ + Method is used to display the parameter values : + mon_object : Monitor object + rados_object : Rados Object + pool_name: pool name + Return -> None + """ + try: + log.info("The scrub parameter values are: ") + out_put = mon_object.get_config(section="osd", param="osd_scrub_sleep") + log.info(f"The osd_scrub_sleep parameter value is - {out_put}") + out_put = mon_object.get_config(section="osd", param="osd_deep_scrub_keys") + log.info(f"The osd_deep_scrub_keys parameter value is - {out_put}") + out_put = mon_object.get_config( + section="osd", param="osd_scrub_interval_randomize_ratio" + ) + log.info( + f"The osd_scrub_interval_randomize_ratio parameter value is - {out_put}" + ) + out_put = mon_object.get_config( + section="osd", param="osd_deep_scrub_randomize_ratio" + ) + log.info(f"The osd_deep_scrub_randomize_ratio parameter value is - {out_put}") + out_put = mon_object.get_config(section="osd", param="osd_scrub_min_interval") + log.info(f"The osd_scrub_min_interval parameter value is - {out_put}") + out_put = mon_object.get_config(section="osd", param="osd_scrub_max_interval") + log.info(f"The osd_scrub_max_interval parameter value is - {out_put}") + out_put = mon_object.get_config(section="osd", param="osd_deep_scrub_interval") + log.info(f"The osd_deep_scrub_interval parameter value is - {out_put}") + out_put = mon_object.get_config(section="osd", param="osd_deep_scrub_interval") + log.info(f"The osd_deep_scrub_interval parameter value is - {out_put}") + out_put = rados_object.get_pool_property(pool=pool_name, props="min_size") + log.info(f"The min_size parameter value is - {out_put}") + out_put = rados_object.get_pool_property(pool=pool_name, props="size") + log.info(f"The size parameter value is - {out_put}") + except Exception as error: + log.error(f"Display the parameter values is 
failed. Error : {error}") + raise Exception("Error: Display configuration failed") + return None