From 0712d2599c917bb47db8e9eb4857f8b29ce0ecb7 Mon Sep 17 00:00:00 2001 From: gargsans-yb Date: Thu, 16 Jan 2025 14:14:22 +0000 Subject: [PATCH] [#24852]yugabyted: Using clockbound time sync service in yugabyted deployments Summary: Adding clockbound configuration as part of yugabyted deployments: * If the node is started with `enhance_time_sync_via_clockbound` flag, then, yugabyted will do a hard check for clockbound, i.e., yugabyted will fail to start if clockbound is not installed and configured. It will give an appropriate error msg. * If the node is not started with `enhance_time_sync_via_clockbound` flag, then yugabyted will do a soft check for clockbound, i.e., yugabyted will give a warning with appropriate message. * If the checks pass, yugabyted will start the node with time_source as clockbound. If this flag is set to some other value by the user, yugabyted will fail if `enhance_time_sync_via_clockbound` flag was used, else, it will log it and continue to start the node without setting time_source. Jira: DB-13966 Test Plan: ./yb_build.sh --java-test 'org.yb.yugabyted.*' Reviewers: nikhil Reviewed By: nikhil Subscribers: yugabyted-dev, sgarg-yb Differential Revision: https://phorge.dev.yugabyte.com/D41289 --- bin/configure_clockbound.sh | 0 bin/yugabyted | 178 +++++++++--------- .../cmd/server/handlers/api_cluster_info.go | 6 +- 3 files changed, 94 insertions(+), 90 deletions(-) mode change 100644 => 100755 bin/configure_clockbound.sh diff --git a/bin/configure_clockbound.sh b/bin/configure_clockbound.sh old mode 100644 new mode 100755 diff --git a/bin/yugabyted b/bin/yugabyted index 0628a45ddf9c..5f94797e4b30 100755 --- a/bin/yugabyted +++ b/bin/yugabyted @@ -125,9 +125,11 @@ PREREQS_ERROR_MSGS = { ' please free the port and restart the node.', 'ycql_metric_port': 'YCQL metrics port {} is already in use. 
For accessing the YCQL metrics,' \ ' please free the port and restart the node.', + 'clockbound_fail': 'Failed to validate system configuration for clockbound. Please run ' \ + 'bin/configure_clockbound.sh script to install and configure clockbound.', 'clockbound': 'Clockbound is recommended on AWS/Azure/GCP clusters.' \ - ' It can reduce read restart errors significantly in concurrent workloads.' \ - ' Relevant flag: --enhance_time_sync_via_clockbound.', + ' It can reduce read restart errors significantly in concurrent workloads. Please run ' \ + 'bin/configure_clockbound.sh script to install and configure clockbound.', } QUICK_START_LINKS = { 'mac' : 'https://docs.yugabyte.com/preview/quick-start/', @@ -683,7 +685,7 @@ def using_time_sync_service(): 'aws.com', 'google.com'] cmd = ['chronyc', 'sources'] - out, err, ret_code = run_process(cmd, timeout=1, log_cmd=True) + out, _, ret_code = run_process(cmd, timeout=1, log_cmd=True) if ret_code == 0: for source in allow_list: if source in out: @@ -693,7 +695,7 @@ def using_time_sync_service(): def is_phc_configured(): cmd = ['systemctl', 'status', 'clockbound'] - out, err, retcode = run_process(cmd, timeout=1, log_cmd=True) + out, _, retcode = run_process(cmd, timeout=1, log_cmd=True) return retcode == 0 and 'PHC' in out # Check if ip is ipv6 @@ -741,9 +743,6 @@ class ControlScript(object): atexit.register(self.kill_children) Output.script_exit_func = self.kill_children - if self.configs.temp_data.get("enhance_time_sync_via_clockbound"): - self.assert_system_configured_for_clockbound() - if self.configs.saved_data.get("read_replica"): self.start_rr_process() else: @@ -2819,11 +2818,16 @@ class ControlScript(object): prereqs_warn_flag = True # TODO: Uncomment this block when clockbound becomes GA. - # # Configuring clockbound is strongly recommended for AWS clusters. 
- # if using_time_sync_service() and not self.configs.temp_data[ - # "enhance_time_sync_via_clockbound"]: - # prereqs_warn.add('clockbound') - # prereqs_warn_flag = True + # Configuring clockbound is strongly recommended for AWS clusters. + if not self.assert_system_configured_for_clockbound(): + if self.configs.temp_data["enhance_time_sync_via_clockbound"]: + prereqs_failed.add('clockbound_fail') + prereqs_failed_flag = True + elif using_time_sync_service(): + prereqs_warn.add('clockbound') + prereqs_warn_flag = True + else: + self.configs.temp_data["is_clockbound_configured"] = True (failed_ports, warning_ports, mandatory_port_available, recommended_port_available) = self.check_ports() @@ -2904,13 +2908,13 @@ class ControlScript(object): # Get pre-req failures and warnings prereqs_failed_flag, prereqs_failed, prereqs_warn_flag, prereqs_warn, \ mandatory_port_available, recommended_port_available = check - if prereqs_warn_flag: - if OS_NAME == "Linux": - help_links.append("- Quick start for Linux: " + - Output.make_underline(QUICK_START_LINKS['linux'])) - else: - help_links.append("- Quick start for macOS: " + - Output.make_underline(QUICK_START_LINKS['mac'])) + # if prereqs_warn_flag: + if OS_NAME == "Linux": + help_links.append("- Quick start for Linux: " + + Output.make_underline(QUICK_START_LINKS['linux'])) + else: + help_links.append("- Quick start for macOS: " + + Output.make_underline(QUICK_START_LINKS['mac'])) if not mandatory_port_available or not recommended_port_available: help_links.append("- Default ports: " + Output.make_underline(DEFAULT_PORTS_LINK)) @@ -3100,6 +3104,38 @@ class ControlScript(object): master_rpc_port, master_addresses) was_already_setup = self.configs.saved_data.get("cluster_member", False) + warnings = [] + warnings_for_ui = [] + warning_help_msg="" + is_first_run = True + + # Do the pre-req check before forming master and tserver commands + if is_first_run: + ulimits_failed = self.script.set_rlimits(print_info=True) + if 
ulimits_failed: + msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format( + ", ".join(ulimits_failed)) + ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \ + "larger workloads. For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK) + self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg)) + + prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed) + + if prereqs_check_result['status']==Output.ANIMATION_SUCCESS: + Output.print_out(prereqs_check_result['msg']) + elif prereqs_check_result['status']==Output.ANIMATION_WARNING: + + warnings.extend(list(prereqs_check_result['msg'].values())[:-1]) + warning_help_msg = prereqs_check_result['msg']["help_msg"] + + prereqs_check_result['msg'].pop("help_msg") + warnings_for_ui = [] + for k in prereqs_check_result['msg'].keys(): + warnings_for_ui.extend([k]) + elif prereqs_check_result['status']==Output.ANIMATION_FAIL: + Output.print_and_log(prereqs_check_result['msg']) + sys.exit(1) + common_gflags = self.get_common_flags() yb_master_cmd = self.get_master_cmd(common_gflags) @@ -3125,7 +3161,6 @@ class ControlScript(object): self.processes = {} return - is_first_run = True callhome_thread = None masters_list_update_thread = None #Start the different thread for extracting the YBC binaries @@ -3160,37 +3195,6 @@ class ControlScript(object): # Start or initialize yb-master and yb-tserver. if is_first_run: - # Output.init_animation("Running system checks...") - warnings = [] - warnings_for_ui = [] - warning_help_msg="" - ulimits_failed = self.script.set_rlimits(print_info=True) - if ulimits_failed: - msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format( - ", ".join(ulimits_failed)) - ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \ - "larger workloads. 
For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK) - self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg)) - - prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed) - # Output.update_animation(msg=prereqs_check_result['msg'], - # status=prereqs_check_result['status']) - if prereqs_check_result['status']==Output.ANIMATION_SUCCESS: - Output.print_out(prereqs_check_result['msg']) - elif prereqs_check_result['status']==Output.ANIMATION_WARNING: - - warnings.extend(list(prereqs_check_result['msg'].values())[:-1]) - warning_help_msg = prereqs_check_result['msg']["help_msg"] - - prereqs_check_result['msg'].pop("help_msg") - warnings_for_ui = [] - for k in prereqs_check_result['msg'].keys(): - warnings_for_ui.extend([k]) - elif prereqs_check_result['status']==Output.ANIMATION_FAIL: - Output.print_and_log(prereqs_check_result['msg']) - sys.exit(1) - - Output.init_animation("Starting the YugabyteDB Processes...") self.post_install_yb() @@ -3414,7 +3418,28 @@ class ControlScript(object): if join_ip: master_addresses = "{}:{},{}".format(get_url_from_ip(join_ip), master_rpc_port, master_addresses) - was_already_setup = self.configs.saved_data.get("cluster_member", False) + + is_first_run = True + warnings = [] + warning_help_msg="" + if is_first_run: + ulimits_failed = self.script.set_rlimits(print_info=True) + if ulimits_failed: + msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format( + ", ".join(ulimits_failed)) + ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \ + "larger workloads. 
For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK) + self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg)) + + prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed) + if prereqs_check_result['status']==Output.ANIMATION_SUCCESS: + Output.print_out(prereqs_check_result['msg']) + elif prereqs_check_result['status']==Output.ANIMATION_WARNING: + warnings.extend(list(prereqs_check_result['msg'].values())[:-1]) + warning_help_msg = prereqs_check_result['msg']["help_msg"] + elif prereqs_check_result['status']==Output.ANIMATION_FAIL: + Output.print_and_log(prereqs_check_result['msg']) + sys.exit(1) common_gflags = self.get_common_flags() @@ -3436,7 +3461,6 @@ class ControlScript(object): self.processes = {} return - is_first_run = True callhome_thread = None masters_list_update_thread = None self.stop_callhome = False @@ -3463,26 +3487,6 @@ class ControlScript(object): # Start or initialize yb-master and yb-tserver. if is_first_run: - warnings = [] - warning_help_msg="" - ulimits_failed = self.script.set_rlimits(print_info=True) - if ulimits_failed: - msg = "Failed to meet recommended settings. Ulimits too low - {}.\n".format( - ", ".join(ulimits_failed)) - ulimit_warn_msg = msg + "Note {} will still run, although it may fail for " \ - "larger workloads. 
For more info, see {}".format(SCRIPT_NAME, CONFIG_LINK) - self.alerts.append((ALERT_WARNING, ULIMIT_ERR_CODE, ulimit_warn_msg)) - - prereqs_check_result = self.prereqs_check(ulimits=ulimits_failed) - if prereqs_check_result['status']==Output.ANIMATION_SUCCESS: - Output.print_out(prereqs_check_result['msg']) - elif prereqs_check_result['status']==Output.ANIMATION_WARNING: - warnings.extend(list(prereqs_check_result['msg'].values())[:-1]) - warning_help_msg = prereqs_check_result['msg']["help_msg"] - elif prereqs_check_result['status']==Output.ANIMATION_FAIL: - Output.print_and_log(prereqs_check_result['msg']) - sys.exit(1) - Output.init_animation("Starting the YugabyteDB Processes...") self.post_install_yb() @@ -3652,14 +3656,18 @@ class ControlScript(object): def config_time_source_clockbound(self, flags): # Configure tserver flag time_source=clockbound - # when --enhance_time_sync_via_clockbound is set. - if self.configs.temp_data["enhance_time_sync_via_clockbound"]: + # when clockbound is installed and configured. + if self.configs.temp_data["is_clockbound_configured"]: # Check database configuration. time_source = self.get_flag_value(flags, "time_source") if time_source and time_source != "clockbound": - raise ValueError( - "Cannot configure time_source with" - " --enhance_time_sync_via_clockbound.") + if self.configs.temp_data["enhance_time_sync_via_clockbound"]: + raise ValueError("--time_source gflag is already set to {}.".format( + time_source) + "Cannot configure time_source with" + + " --enhance_time_sync_via_clockbound.") + else: + Output.log("--time_source gflag is already set to {}.".format(time_source) + + " Cannot configure time_source to clockbound.") # Configure time_source=clockbound if not already. if not time_source: @@ -4088,18 +4096,13 @@ class ControlScript(object): # Sets YW metrics to use local database. os.environ["USE_NATIVE_METRICS"] = "true" + # Returns true if the system has been configured for clock bound. 
+ # Runs `configure_clockbound.sh --validate` and returns true if it returns 0. def assert_system_configured_for_clockbound(self): - Output.init_animation("Validating system config for clockbound...") configure_clockbound_path = find_binary_location("configure_clockbound.sh") cmd = ["bash", configure_clockbound_path, "--validate"] - out, err, retcode = run_process(cmd) - if retcode == 0: - Output.update_animation("System configured for clockbound.") - else: - Output.update_animation("Failed to validate system configuration for clockbound.", - status=Output.ANIMATION_FAIL) - Output.log_error_and_exit( - Output.make_red("ERROR") + ": Did you run configure_clockbound.sh script?") + _, _, retcode = run_process(cmd) + return retcode == 0 # Runs post_install script for linux computers. def post_install_yb(self): @@ -8580,6 +8583,7 @@ class Configs(object): "xcluster_target_addresses": "", "xcluster_bootstrap_done": "", "enhance_time_sync_via_clockbound": False, + "is_clockbound_configured": False, } self.config_file = config_file diff --git a/yugabyted-ui/apiserver/cmd/server/handlers/api_cluster_info.go b/yugabyted-ui/apiserver/cmd/server/handlers/api_cluster_info.go index d43f10e29787..e2fcfb2bda28 100644 --- a/yugabyted-ui/apiserver/cmd/server/handlers/api_cluster_info.go +++ b/yugabyted-ui/apiserver/cmd/server/handlers/api_cluster_info.go @@ -66,9 +66,9 @@ var WARNING_MSGS = map[string]string{ "insecure" :"Cluster started in an insecure mode without " + "authentication and encryption enabled. For non-production use only, " + "not to be used without firewalls blocking the internet traffic.", - "clockbound": "Clockbound is recommended on AWS clusters. It can reduce read restart errors" + - " significantly in concurrent workloads." + - " Relevant flag: --enhance_time_sync_via_clockbound.", + "clockbound": "Clockbound is recommended on AWS/Azure/GCP clusters. " + + "It can reduce read restart errors significantly in concurrent workloads. 
" + + "Please run configure_clockbound.sh script to install and configure clockbound.", } type SlowQueriesFuture struct {