diff --git a/modules/python/clusterloader2/cri/config/config.yaml b/modules/python/clusterloader2/cri/config/config.yaml new file mode 100644 index 000000000..aebf52aac --- /dev/null +++ b/modules/python/clusterloader2/cri/config/config.yaml @@ -0,0 +1,92 @@ +name: resource-consumer + +{{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 10}} +{{$memory := DefaultParam .CL2_RESOURCE_CONSUME_MEMORY 100}} +{{$cpu := DefaultParam .CL2_RESOURCE_CONSUME_CPU 100}} +{{$repeats := DefaultParam .CL2_REPEATS 1}} +{{$nodePools := DefaultParam .CL2_NODEPOOL 1}} +{{$agentPoolPrefix := DefaultParam .CL2_AGENTPOOL_PREFIX "userpool"}} +{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}} + +namespace: + number: 1 + prefix: resource-consumer + deleteStaleNamespaces: true + deleteAutomanagedNamespaces: true + enableExistingNamespaces: false + +tuningSets: +- name: Uniform1qps + qpsLoad: + qps: 1 + +steps: + - name: Start measurements + measurements: + - Identifier: ResourceUsageSummary + Method: ResourceUsageSummary + Params: + action: start + labelSelector: group = resource-consumer + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: start + checkIfPodsAreUpdated: true + apiVersion: apps/v1 + kind: Deployment + labelSelector: group = resource-consumer + operationTimeout: {{$operationTimeout}} + +{{range $i := Loop $nodePools}} + {{range $j := Loop $repeats}} + - name: Create deployment {{$j}} + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 1 + tuningSet: Uniform1qps + objectBundle: + - basename: resource-consumer + objectTemplatePath: deployment_template.yaml + templateFillMap: + Replicas: {{$deploymentSize}} + Group: resource-consumer + Memory: {{$memory}}K + CPU: --millicores={{$cpu}} + AgentPool: {{$agentPoolPrefix}}{{$i}} + + - name: Waiting for latency pods to be running + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + + - name: Deleting deployments + phases: + - namespaceRange: + min: 1 + max: 1 + replicasPerNamespace: 0 + tuningSet: Uniform1qps + objectBundle: + - basename: resource-consumer + objectTemplatePath: deployment_template.yaml + + - name: Waiting for latency pods to be deleted + measurements: + - Identifier: WaitForRunningLatencyDeployments + Method: WaitForControlledPodsRunning + Params: + action: gather + {{end}} +{{end}} + + - name: Collect measurements + measurements: + - Identifier: ResourceUsageSummary + Method: ResourceUsageSummary + Params: + action: gather diff --git a/modules/python/clusterloader2/cri/config/deployment_template.yaml b/modules/python/clusterloader2/cri/config/deployment_template.yaml new file mode 100644 index 000000000..732a8de6e --- /dev/null +++ b/modules/python/clusterloader2/cri/config/deployment_template.yaml @@ -0,0 +1,55 @@ +{{$Memory := DefaultParam .Memory "1000M"}} +{{$CPU := DefaultParam .CPU "--millicores=100"}} +{{$AgentPool := DefaultParam .AgentPool "userpool1"}} + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + group: {{.Group}} +spec: + replicas: {{.Replicas}} + selector: + matchLabels: + name: {{.Name}} + template: + metadata: + labels: + name: {{.Name}} + group: {{.Group}} + spec: + nodeSelector: + agentpool: {{$AgentPool}} + containers: + - name: resource-consumer-memory + image: registry.k8s.io/e2e-test-images/resource-consumer:1.9 + command: + - stress + args: + - --vm + - "1" + - --vm-bytes + - {{$Memory}} + - --vm-hang + - "0" + - 
--timeout
+        - "3600"
+        resources:
+          requests:
+            memory: "10Mi"
+      - name: resource-consumer-cpu
+        image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
+        command:
+        - ./consume-cpu/consume-cpu
+        args:
+        - --duration-sec=3600
+        - {{$CPU}}
+        resources:
+          requests:
+            cpu: "10m"
+      tolerations:
+      - key: "cri-resource-consume"
+        operator: "Equal"
+        value: "true"
+        effect: "NoSchedule"
diff --git a/modules/python/clusterloader2/cri/cri.py b/modules/python/clusterloader2/cri/cri.py
new file mode 100644
index 000000000..9b25fb282
--- /dev/null
+++ b/modules/python/clusterloader2/cri/cri.py
@@ -0,0 +1,149 @@
+import json
+import os
+import argparse
+import re
+
+from datetime import datetime, timezone
+from utils import parse_xml_to_json, run_cl2_command, get_measurement
+from kubernetes_client import KubernetesClient
+
+DAEMONSETS_PER_NODE_MAP = {
+    "aws": 3,
+    "aks": 6
+}
+
+def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, provider, override_file):
+    client = KubernetesClient(os.path.expanduser("~/.kube/config"))
+    nodes = client.get_nodes(label_selector="cri-resource-consume=true")
+    if len(nodes) == 0:
+        raise Exception("No nodes found with the label cri-resource-consume=true")
+
+    node = nodes[0]
+    allocatable_cpu = node.status.allocatable["cpu"]
+    allocatable_memory = node.status.allocatable["memory"]
+    print(f"Node {node.metadata.name} has allocatable cpu of {allocatable_cpu} and allocatable memory of {allocatable_memory}")
+
+    cpu_value = int(allocatable_cpu.replace("m", ""))
+    memory_value = int(allocatable_memory.replace("Ki", ""))
+    print(f"Node {node.metadata.name} has cpu value of {cpu_value} and memory value of {memory_value}")
+
+    # Calculate the per-pod CPU (millicores) and memory (KB, converted from KiB) requests, leaving headroom below allocatable
+    pod_count = max_pods - DAEMONSETS_PER_NODE_MAP[provider]
+    replica = pod_count * node_count
+    cpu_request = cpu_value // pod_count - 20
+    memory_request = int(memory_value * 1.024 // pod_count - 20)
+    print(f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request}K, total replicas: {replica}")
+
+    with open(override_file, 'w') as file:
+        file.write(f"CL2_DEPLOYMENT_SIZE: {replica}\n")
+        file.write(f"CL2_RESOURCE_CONSUME_MEMORY: {memory_request}\n")
+        file.write(f"CL2_RESOURCE_CONSUME_CPU: {cpu_request}\n")
+        file.write(f"CL2_REPEATS: {repeats}\n")
+        file.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n")
+        file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n")
+
+    file.close()
+
+def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider):
+    run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, overrides=True, enable_prometheus=True)
+
+def collect_clusterloader2(
+    node_count,
+    max_pods,
+    repeats,
+    cl2_report_dir,
+    cloud_info,
+    run_id,
+    run_url,
+    result_file
+):
+    details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2)
+    json_data = json.loads(details)
+    testsuites = json_data["testsuites"]
+
+    if testsuites:
+        status = "success" if testsuites[0]["failures"] == 0 else "failure"
+    else:
+        raise Exception(f"No testsuites found in the report! Raw data: {details}")
Raw data: {details}") + + template = { + "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + "node_count": node_count, + "max_pods": max_pods, + "churn_rate": repeats, + "status": status, + "group": None, + "measurement": None, + "percentile": None, + "data": None, + "test_details": details, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url + } + + content = "" + for f in os.listdir(cl2_report_dir): + file_path = os.path.join(cl2_report_dir, f) + with open(file_path, 'r') as f: + measurement, group_name = get_measurement(file_path) + if not measurement: + continue + print(measurement, group_name) + data = json.loads(f.read()) + + for percentile, items in data.items(): + template["measurement"] = measurement + template["group"] = group_name + template["percentile"] = percentile + for item in items: + template["data"] = item + content += json.dumps(template) + "\n" + + os.makedirs(os.path.dirname(result_file), exist_ok=True) + with open(result_file, 'w') as f: + f.write(content) + +def main(): + parser = argparse.ArgumentParser(description="CRI Kubernetes resources.") + subparsers = parser.add_subparsers(dest="command") + + # Sub-command for override_config_clusterloader2 + parser_override = subparsers.add_parser("override", help="Override CL2 config file") + parser_override.add_argument("node_count", type=int, help="Number of nodes") + parser_override.add_argument("max_pods", type=int, help="Number of maximum pods per node") + parser_override.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment") + parser_override.add_argument("operation_timeout", type=str, default="2m", help="Operation timeout") + parser_override.add_argument("provider", type=str, help="Cloud provider name") + parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file") + + # Sub-command for execute_clusterloader2 + parser_execute = subparsers.add_parser("execute", help="Execute scale up operation") + parser_execute.add_argument("cl2_image", type=str, help="Name of the CL2 image") + parser_execute.add_argument("cl2_config_dir", type=str, help="Path to the CL2 config directory") + parser_execute.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") + parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file") + parser_execute.add_argument("provider", type=str, help="Cloud provider name") + + # Sub-command for collect_clusterloader2 + parser_collect = subparsers.add_parser("collect", help="Collect scale up data") + parser_collect.add_argument("node_count", type=int, help="Number of nodes") + parser_collect.add_argument("max_pods", type=int, help="Number of maximum pods per node") + parser_collect.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment") + parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") + parser_collect.add_argument("cloud_info", type=str, help="Cloud information") + parser_collect.add_argument("run_id", type=str, help="Run ID") + parser_collect.add_argument("run_url", type=str, help="Run URL") + parser_collect.add_argument("result_file", type=str, help="Path to the result file") + + args = parser.parse_args() + + if args.command == "override": + override_config_clusterloader2(args.node_count, args.max_pods, args.repeats, args.operation_timeout, args.provider, args.cl2_override_file) + elif args.command == "execute": + 
+        execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider)
+    elif args.command == "collect":
+        collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/modules/python/clusterloader2/utils.py b/modules/python/clusterloader2/utils.py
index cfb01aa98..4039722eb 100644
--- a/modules/python/clusterloader2/utils.py
+++ b/modules/python/clusterloader2/utils.py
@@ -12,6 +12,7 @@
 }
 NETWORK_METRIC_PREFIXES = ["APIResponsivenessPrometheus", "InClusterNetworkLatency", "NetworkProgrammingLatency"]
 PROM_QUERY_PREFIX = "GenericPrometheusQuery"
+RESOURCE_USAGE_SUMMARY_PREFIX = "ResourceUsageSummary"
 
 def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file="config.yaml", overrides=False, enable_prometheus=False, enable_exec_service=False):
     docker_client = DockerClient()
@@ -59,6 +60,9 @@ def get_measurement(file_path):
         group_name = file_name.split("_")[1]
         measurement_name = file_name.split("_")[0][len(PROM_QUERY_PREFIX)+1:]
         return measurement_name, group_name
+    if file_name.startswith(RESOURCE_USAGE_SUMMARY_PREFIX):
+        group_name = file_name.split("_")[1]
+        return RESOURCE_USAGE_SUMMARY_PREFIX, group_name
     return None, None
 
 def parse_xml_to_json(file_path, indent = 0):
diff --git a/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml b/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml
new file mode 100644
index 000000000..dcb2ec25a
--- /dev/null
+++ b/pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml
@@ -0,0 +1,59 @@
+trigger: none
+schedules:
+  - cron: "0 */12 * * *"
+    displayName: "Every 12 Hours"
+    branches:
+      include:
+        - main
+    always: true
+
+variables:
+  SCENARIO_TYPE: perf-eval
+  SCENARIO_NAME: cri-resource-consume
+  SCENARIO_VERSION: main
+
+stages:
+  - stage: azure_westeurope
+    dependsOn: []
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: azure
+          regions:
+            - westeurope
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20241016"
+          topology: cri-resource-consume
+          matrix:
+            n3-p330:
+              node_count: 3
+              max_pods: 110
+              repeats: 1
+              operation_timeout: 5m
+          max_parallel: 1
+          timeout_in_minutes: 120
+          credential_type: service_connection
+          ssh_key_enabled: false
+  - stage: aws_uswest1
+    dependsOn: []
+    jobs:
+      - template: /jobs/competitive-test.yml
+        parameters:
+          cloud: aws
+          regions:
+            - us-west-1
+          engine: clusterloader2
+          engine_input:
+            image: "ghcr.io/azure/clusterloader2:v20241016"
+          topology: cri-resource-consume
+          matrix:
+            n3-p330:
+              node_count: 3
+              max_pods: 110
+              repeats: 1
+              operation_timeout: 5m
+          max_parallel: 1
+          timeout_in_minutes: 120
+          credential_type: service_connection
+          ssh_key_enabled: false
diff --git a/scenarios/perf-eval/cri-resource-consume/terraform-inputs/aws.tfvars b/scenarios/perf-eval/cri-resource-consume/terraform-inputs/aws.tfvars
new file mode 100644
index 000000000..6ee92941a
--- /dev/null
+++ b/scenarios/perf-eval/cri-resource-consume/terraform-inputs/aws.tfvars
@@ -0,0 +1,96 @@
+scenario_type = "perf-eval"
+scenario_name = "cri-resource-consume"
+deletion_delay = "120h"
+owner = "aks"
+
+network_config_list = [
+  {
+    role = "client"
+    vpc_name = "client-vpc"
+    vpc_cidr_block = "10.0.0.0/16"
+    subnet = [
+      {
+        name = "client-subnet"
+        cidr_block = "10.0.0.0/24"
+        zone_suffix = "a"
+        map_public_ip_on_launch = true
+      },
+      {
+ name = "client-subnet-2" + cidr_block = "10.0.1.0/24" + zone_suffix = "b" + map_public_ip_on_launch = true + } + ] + security_group_name = "client-sg" + route_tables = [ + { + name = "internet-rt" + cidr_block = "0.0.0.0/0" + } + ], + route_table_associations = [ + { + name = "client-subnet-rt-assoc" + subnet_name = "client-subnet" + route_table_name = "internet-rt" + }, + { + name = "client-subnet-rt-assoc-2" + subnet_name = "client-subnet-2" + route_table_name = "internet-rt" + } + ] + sg_rules = { + ingress = [] + egress = [ + { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_block = "0.0.0.0/0" + } + ] + } + } +] + +eks_config_list = [{ + role = "client" + eks_name = "cri-resource-consume" + vpc_name = "client-vpc" + policy_arns = ["AmazonEKSClusterPolicy", "AmazonEKSVPCResourceController", "AmazonEKSWorkerNodePolicy", "AmazonEKS_CNI_Policy", "AmazonEC2ContainerRegistryReadOnly"] + eks_managed_node_groups = [ + { + name = "default" + ami_type = "AL2_x86_64" + instance_types = ["m5.4xlarge"] + min_size = 3 + max_size = 3 + desired_size = 3 + capacity_type = "ON_DEMAND" + }, + { + name = "userpool1" + ami_type = "AL2_x86_64" + instance_types = ["m5.4xlarge"] + min_size = 3 + max_size = 3 + desired_size = 3 + capacity_type = "ON_DEMAND" + taints = [ + { + key = "cri-resource-consume" + value = "true" + effect = "NO_SCHEDULE" + } + ] + } + ] + + eks_addons = [ + { + name = "coredns" + } + ] +}] diff --git a/scenarios/perf-eval/cri-resource-consume/terraform-inputs/azure.tfvars b/scenarios/perf-eval/cri-resource-consume/terraform-inputs/azure.tfvars new file mode 100644 index 000000000..a145373e8 --- /dev/null +++ b/scenarios/perf-eval/cri-resource-consume/terraform-inputs/azure.tfvars @@ -0,0 +1,35 @@ +scenario_type = "perf-eval" +scenario_name = "cri-resource-consume" +deletion_delay = "240h" +owner = "aks" + +aks_config_list = [ + { + role = "client" + aks_name = "cri-resource-consume" + dns_prefix = "cl2" + subnet_name = "aks-network" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + } + default_node_pool = { + name = "default" + node_count = 3 + vm_size = "Standard_D16s_v3" + os_disk_type = "Managed" + only_critical_addons_enabled = true + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "userpool1" + node_count = 3 + vm_size = "Standard_D16s_v3" + node_taints = ["cri-resource-consume=true:NoSchedule"] + } + ] + kubernetes_version = "1.30" + } +] diff --git a/scenarios/perf-eval/cri-resource-consume/terraform-test-inputs/aws.json b/scenarios/perf-eval/cri-resource-consume/terraform-test-inputs/aws.json new file mode 100644 index 000000000..cb30052b1 --- /dev/null +++ b/scenarios/perf-eval/cri-resource-consume/terraform-test-inputs/aws.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "us-east-2" +} diff --git a/scenarios/perf-eval/cri-resource-consume/terraform-test-inputs/azure.json b/scenarios/perf-eval/cri-resource-consume/terraform-test-inputs/azure.json new file mode 100644 index 000000000..ea27a572c --- /dev/null +++ b/scenarios/perf-eval/cri-resource-consume/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} diff --git a/steps/engine/clusterloader2/cri/collect.yml b/steps/engine/clusterloader2/cri/collect.yml new file mode 100644 index 000000000..2f404fea5 --- /dev/null +++ b/steps/engine/clusterloader2/cri/collect.yml @@ -0,0 +1,27 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + 
type: object + default: {} +- name: region + type: string + +steps: +- template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml + parameters: + region: ${{ parameters.region }} +- script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ + $NODE_COUNT $MAX_PODS $REPEATS \ + $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $TEST_RESULTS_FILE + workingDirectory: modules/python/clusterloader2 + env: + CLOUD: ${{ parameters.cloud }} + RUN_URL: $(RUN_URL) + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/cri/cri.py + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/cri/results + displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/cri/execute.yml b/steps/engine/clusterloader2/cri/execute.yml new file mode 100644 index 000000000..54e1ce8c6 --- /dev/null +++ b/steps/engine/clusterloader2/cri/execute.yml @@ -0,0 +1,30 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: {} + - name: region + type: string + +steps: + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE override \ + $NODE_COUNT $MAX_PODS $REPEATS $OPERATION_TIMEOUT $CLOUD ${CL2_CONFIG_DIR}/overrides.yaml + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ + ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR ${HOME}/.kube/config $CLOUD + workingDirectory: modules/python/clusterloader2 + env: + ${{ if eq(parameters.cloud, 'azure') }}: + CLOUD: aks + ${{ else }}: + CLOUD: ${{ parameters.cloud }} + REGION: ${{ parameters.region }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/cri/cri.py + CL2_IMAGE: ${{ parameters.engine_input.image }} + CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/cri/config + CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/cri/results + displayName: "Run Benchmark" diff --git a/steps/topology/cri-resource-consume/collect-clusterloader2.yml b/steps/topology/cri-resource-consume/collect-clusterloader2.yml new file mode 100644 index 000000000..ee0c8a1bb --- /dev/null +++ b/steps/topology/cri-resource-consume/collect-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/cri/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/cri-resource-consume/execute-clusterloader2.yml b/steps/topology/cri-resource-consume/execute-clusterloader2.yml new file mode 100644 index 000000000..fcdab04db --- /dev/null +++ b/steps/topology/cri-resource-consume/execute-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/cri/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/cri-resource-consume/validate-resources.yml b/steps/topology/cri-resource-consume/validate-resources.yml new file mode 100644 index 000000000..2afe6fafe --- /dev/null +++ b/steps/topology/cri-resource-consume/validate-resources.yml @@ -0,0 +1,13 @@ +parameters: +- 
name: cloud + type: string +- name: engine + type: string +- name: regions + type: object + +steps: +- template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: client + region: ${{ parameters.regions[0] }}
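
For reference, a minimal sketch (not part of the patch) of the per-pod sizing math that override_config_clusterloader2 performs before writing the CL2 overrides. The node_count, max_pods, and "aks" daemonset count come from the pipeline matrix and DAEMONSETS_PER_NODE_MAP above; the allocatable CPU and memory values are hypothetical, assumed here only to make the arithmetic concrete.

# Illustrative sketch of the override step's sizing math (assumed allocatable values, not real node data).
node_count = 3                   # from the pipeline matrix
max_pods = 110                   # from the pipeline matrix
daemonsets_per_node = 6          # DAEMONSETS_PER_NODE_MAP["aks"]
cpu_value = 15820                # assumed allocatable CPU in millicores
memory_value = 63960940          # assumed allocatable memory in KiB

pod_count = max_pods - daemonsets_per_node              # 104 resource-consumer pods per node
replica = pod_count * node_count                        # 312 replicas in total
cpu_request = cpu_value // pod_count - 20               # 132m, keeping ~20m headroom per pod
memory_request = int(memory_value * 1.024 // pod_count - 20)  # KiB -> KB, minus ~20KB headroom

print(cpu_request, memory_request, replica)             # 132 629749 312 with these assumed inputs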