diff --git a/Standards/scs-0214-v1-k8s-node-distribution.md b/Standards/scs-0214-v1-k8s-node-distribution.md
index 061fe37f8..e4f53c8bd 100644
--- a/Standards/scs-0214-v1-k8s-node-distribution.md
+++ b/Standards/scs-0214-v1-k8s-node-distribution.md
@@ -67,7 +67,7 @@ This standard formulates the requirement for the distribution of Kubernetes node
 to provide a fault-tolerant and available Kubernetes cluster infrastructure. Since
 some providers only have small environments to work with and therefore couldn't
 comply with this standard, it will be treated as a RECOMMENDED standard, where providers
-can OPT-OUT.
+can OPT OUT.
 
 If the standard is used by a provider, the following decisions are binding and valid:
 
@@ -83,7 +83,11 @@ If the standard is used by a provider, the following decisions are binding and v
 
 ## Conformance Tests
 
-Conformance Tests will be written in another issue
+The script `k8s-node-distribution-check.py` checks the nodes available with a user-provided
+kubeconfig file. Based on the labels `kubernetes.io/hostname`, `topology.kubernetes.io/zone`,
+`topology.kubernetes.io/region` and `node-role.kubernetes.io/control-plane`, it then determines
+whether the available nodes are distributed. If this isn't the case, the script produces an error.
+It also produces warnings and informational outputs, e.g. if labels don't seem to be set.
 
 [k8s-ha]: https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/high-availability/
 [k8s-large-clusters]: https://kubernetes.io/docs/setup/best-practices/cluster-large/
diff --git a/Tests/kaas/k8s-node-distribution/config.yaml.template b/Tests/kaas/k8s-node-distribution/config.yaml.template
new file mode 100644
index 000000000..0f96da24d
--- /dev/null
+++ b/Tests/kaas/k8s-node-distribution/config.yaml.template
@@ -0,0 +1,24 @@
+## Configuration file for the K8s Node Distribution Test
+
+logging:
+  level: INFO
+  version: 1
+  disable_existing_loggers: False
+  formatters:
+    k8s-node-distribution-check:
+      format: "%(levelname)s: %(message)s"
+  handlers:
+    console:
+      class: logging.StreamHandler
+      formatter: k8s-node-distribution-check
+      stream: ext://sys.stdout
+    file:
+      class: logging.handlers.WatchedFileHandler
+      formatter: k8s-node-distribution-check
+      filename: MY-LOG-FILE-NAME.log
+  root: # Configuring the default (root) logger is highly recommended
+    handlers: [console]
+  loggers:
+    k8s-node-distribution-check:
+      handlers: [console, file]
+      propagate: no
\ No newline at end of file
diff --git a/Tests/kaas/k8s-node-distribution/k8s-node-distribution-check.py b/Tests/kaas/k8s-node-distribution/k8s-node-distribution-check.py
new file mode 100755
index 000000000..7932c3bc6
--- /dev/null
+++ b/Tests/kaas/k8s-node-distribution/k8s-node-distribution-check.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+# vim: set ts=4 sw=4 et:
+#
+"""
+K8s Node Distribution Check
+https://github.com/SovereignCloudStack/standards
+
+Return codes:
+0: Node distribution according to standard
+1: Help command used
+2: Node distribution either not according to standard or not detected due to missing labels
+
+One or more K8s clusters are checked by providing their kubeconfigs.
+scs-0214-v1 requires a K8s cluster to be distributed over multiple
+physical machines within different failure zones of an infrastructure.
+For now, this is hard to test, since we would need to know a multitude
+of things about the provider's infrastructure, including things like
+circuitry and physical distribution of different machines.
+
+This test is for now only designed to work on the K8s layer, which is
+done by checking several well-known labels (https://kubernetes.io/docs/reference/labels-annotations-taints/)
+and comparing them. This doesn't always attest to a good distribution
+and does require these labels to be set, but should yield overall pretty
+good initial results.
+
+    kubernetes.io/hostname
+    topology.kubernetes.io/zone
+    topology.kubernetes.io/region
+    node-role.kubernetes.io/control-plane
+
+(c) Hannes Baum, 6/2023
+License: CC-BY-SA 4.0
+"""
+
+import asyncio
+import getopt
+import kubernetes_asyncio
+import logging
+import logging.config
+import sys
+import yaml
+
+
+logging_config = {
+    "level": "INFO",
+    "version": 1,
+    "disable_existing_loggers": False,
+    "formatters": {
+        "k8s-node-distribution-check": {
+            "format": "%(levelname)s: %(message)s"
+        }
+    },
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "k8s-node-distribution-check",
+            "stream": "ext://sys.stdout"
+        }
+    },
+    "root": {
+        "handlers": ["console"]
+    }
+}
+
+logger = logging.getLogger(__name__)
+
+
+class ConfigException(BaseException):
+    """Exception raised if a configuration error occurs"""
+
+
+class HelpException(BaseException):
+    """Exception raised if the help functionality is called"""
+
+
+class DistributionException(BaseException):
+    """Exception raised if the detected distribution is insufficient"""
+
+
+class Config:
+    config_path = "./config.yaml"
+    kubeconfig = None
+    logging = None
+
+
+def print_usage():
+    print("""
+K8s Node Distribution Compliance Check
+
+Usage: k8s-node-distribution-check.py [-h] [-c|--config PATH/TO/CONFIG] -k|--kubeconfig PATH/TO/KUBECONFIG
+
+The script is used to determine compliance with the SCS standard SCS-0214-v1 describing the distribution
+and availability of K8s nodes.
+
+The following return values are possible:
+    0 - Distribution according to the standard could be verified.
+    1 - The help output was delivered.
+    2 - No distribution according to the standard could be detected for the nodes available.
+
+The following arguments can be set:
+    -c/--config PATH/TO/CONFIG         - Path to the config file of the test script
+    -k/--kubeconfig PATH/TO/KUBECONFIG - Path to the kubeconfig of the server we want to check
+    -h                                 - Output help
+""")
+
+
+def parse_arguments(argv):
+    """Parse cli arguments from the script call"""
+    config = Config()
+
+    try:
+        # Long options that take an argument need a trailing "=" for getopt
+        opts, args = getopt.gnu_getopt(argv, "c:k:h", ["config=", "kubeconfig=", "help"])
+    except getopt.GetoptError:
+        raise ConfigException
+
+    for opt in opts:
+        if opt[0] == "-h" or opt[0] == "--help":
+            raise HelpException
+        if opt[0] == "-c" or opt[0] == "--config":
+            config.config_path = opt[1]
+        if opt[0] == "-k" or opt[0] == "--kubeconfig":
+            config.kubeconfig = opt[1]
+
+    return config
+
+
+def setup_logging(config_log):
+    """Configure logging and set the level of all known loggers that don't have one yet"""
+
+    logging.config.dictConfig(config_log)
+    loggers = [
+        logging.getLogger(name)
+        for name in logging.root.manager.loggerDict
+        if not logging.getLogger(name).level
+    ]
+
+    for log in loggers:
+        log.setLevel(config_log['level'])
+
+
+def initialize_config(config):
+    """Initialize the configuration for the test script"""
+
+    try:
+        with open(config.config_path, "r") as f:
+            config.logging = yaml.safe_load(f)['logging']
+    except OSError:
+        logger.warning(f"The config file under {config.config_path} couldn't be found, "
+                       f"falling back to the default config.")
+    finally:
+        # Set up logging with the config file loaded above if it was available;
+        # otherwise, fall back to the logging_config literal included in this script
+        setup_logging(config.logging or logging_config)
+
+    if config.kubeconfig is None:
+        raise ConfigException("A kubeconfig needs to be set in order to test the node distribution of a k8s cluster.")
+
+    return config
+
+
+async def get_k8s_cluster_labelled_nodes(kubeconfig, interesting_labels):
+    """Get the labels per node of the k8s cluster under test, ignoring labels we don't need."""
+    await kubernetes_asyncio.config.load_kube_config(kubeconfig)
+
+    nodes = list()
+
+    async with kubernetes_asyncio.client.ApiClient() as api:
+        core_api = kubernetes_asyncio.client.CoreV1Api(api)
+        result = await core_api.list_node()
+        for node in result.items:
+            filtered_labels = {
+                label: value
+                for label, value in node.metadata.labels.items()
+                if label in interesting_labels
+            }
+            nodes.append(filtered_labels)
+
+    return nodes
+
+
+def compare_labels(node_list, labels, node_type="master"):
+    """Compare the label values of the given nodes and report the first label that shows a distribution."""
+
+    label_data = {key: list() for key in labels}
+
+    for node in node_list:
+        for key in labels:
+            try:
+                label_data[key].append(node[key])
+            except KeyError:
+                logger.warning(f"The label for {key.split('/')[1]}s doesn't seem to be set for all nodes.")
+
+    for label in labels:
+        if len(label_data[label]) < len(node_list):
+            logger.warning(f"The label for {label.split('/')[1]}s doesn't seem to be set for all nodes.")
+        if len(set(label_data[label])) <= 1:
+            logger.warning(f"There seems to be no distribution across multiple {label.split('/')[1]}s "
+                           "or labels aren't set correctly across nodes.")
+        else:
+            logger.info(f"The nodes are distributed across {str(len(set(label_data[label])))} {label.split('/')[1]}s.")
+            return
+
+    if node_type == "master":
+        raise DistributionException("The distribution of nodes described in the standard couldn't be detected.")
+    elif node_type == "worker":
+        logger.warning("No node distribution could be detected for the worker nodes. "
+                       "This produces only a warning, since it is just a recommendation.")
+        return
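+
+
+# Illustrative sketch (hypothetical label values, not part of the check itself): for three
+# control-plane nodes spread over two zones of a single region, compare_labels() collects
+# label data roughly like
+#     {"topology.kubernetes.io/region": ["region-1", "region-1", "region-1"],
+#      "topology.kubernetes.io/zone": ["zone-a", "zone-a", "zone-b"],
+#      "kubernetes.io/hostname": ["node-1", "node-2", "node-3"]}
+# so no distribution is found across regions, but the check succeeds at the zone level.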
" + "This produces only a warning, since it is just a recommendation.") + return + + +async def main(argv): + try: + config = initialize_config(parse_arguments(argv)) + except (OSError, ConfigException, HelpException) as e: + if hasattr(e, 'message'): + logger.error(e.message) + print_usage() + return 1 + + # It is important to note, that the order of these labels matters for this test. + # Since we want to check if nodes are distributed, we want to do this from bigger + # infrastructure parts to smaller ones. So we first look if nodes are distributed + # across regions, then zones and then hosts. If one of these requirements is fulfilled, + # we don't need to check anymore, since a distribution was already detected. + labels = ( + "topology.kubernetes.io/region", + "topology.kubernetes.io/zone", + "kubernetes.io/hostname", + ) + + nodes = await get_k8s_cluster_labelled_nodes(config.kubeconfig, labels + ("node-role.kubernetes.io/control-plane", )) + + if len(nodes) < 2: + logger.error("The tested cluster only contains a single node, which can't comply with the standard.") + return 2 + + labelled_master_nodes = [node for node in nodes if "node-role.kubernetes.io/control-plane" in node] + try: + if len(labelled_master_nodes) >= 1: + worker_nodes = [node for node in nodes if "node-role.kubernetes.io/control-plane" not in node] + # Compare the labels of both types, since we have enough of them with labels + compare_labels(labelled_master_nodes, labels, "master") + compare_labels(worker_nodes, labels, "worker") + else: + compare_labels(nodes, labels) + except DistributionException as e: + logger.error(str(e)) + return 2 + + return 0 + + +if __name__ == "__main__": + return_code = asyncio.run(main(sys.argv[1:])) + sys.exit(return_code)