Skip to content

Commit

Permalink
add CRI resource consume test
Browse files Browse the repository at this point in the history
  • Loading branch information
anson627 committed Dec 27, 2024
1 parent ff8ea16 commit 79a5d23
Show file tree
Hide file tree
Showing 15 changed files with 629 additions and 2 deletions.
92 changes: 92 additions & 0 deletions modules/python/clusterloader2/cri/config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# ClusterLoader2 test config for the CRI resource-consume scenario.
# Flow: start measurements, then for every node pool and every repeat create one
# resource-consumer Deployment, gather the pod-running measurement, delete the
# Deployment, gather again, and finally gather ResourceUsageSummary.
name: resource-consumer

# Tunables. Each one falls back to the literal default below when the matching
# CL2_* override is not supplied (cri.py writes several of them to the overrides file).
{{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 10}}
{{$memory := DefaultParam .CL2_RESOURCE_CONSUME_MEMORY 100}}
{{$cpu := DefaultParam .CL2_RESOURCE_CONSUME_CPU 100}}
{{$repeats := DefaultParam .CL2_REPEATS 1}}
{{$nodePools := DefaultParam .CL2_NODEPOOL 1}}
{{$agentPoolPrefix := DefaultParam .CL2_AGENTPOOL_PREFIX "userpool"}}
{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}}

# A single namespace is used; every phase below addresses it via namespaceRange 1..1.
namespace:
number: 1
prefix: resource-consumer
deleteStaleNamespaces: true
deleteAutomanagedNamespaces: true
enableExistingNamespaces: false

# Object operations in all phases are paced by this tuning set (qps: 1).
tuningSets:
- name: Uniform1qps
qpsLoad:
qps: 1

steps:
- name: Start measurements
measurements:
# Both measurements select objects by the "group" label that the deployment
# template sets from the Group parameter.
- Identifier: ResourceUsageSummary
Method: ResourceUsageSummary
Params:
action: start
labelSelector: group = resource-consumer
- Identifier: WaitForRunningLatencyDeployments
Method: WaitForControlledPodsRunning
Params:
action: start
checkIfPodsAreUpdated: true
apiVersion: apps/v1
kind: Deployment
labelSelector: group = resource-consumer
operationTimeout: {{$operationTimeout}}

# Outer loop: one pass per node pool ($i becomes the agent pool name suffix below).
# Inner loop: $repeats create/wait/delete cycles per node pool.
{{range $i := Loop $nodePools}}
{{range $j := Loop $repeats}}
- name: Create deployment {{$j}}
phases:
- namespaceRange:
min: 1
max: 1
replicasPerNamespace: 1
tuningSet: Uniform1qps
objectBundle:
- basename: resource-consumer
objectTemplatePath: deployment_template.yaml
templateFillMap:
Replicas: {{$deploymentSize}}
Group: resource-consumer
# The override supplies a bare number; the "K" unit suffix is appended here.
Memory: {{$memory}}K
# Passed through verbatim as an argument of the CPU consumer container.
CPU: --millicores={{$cpu}}
AgentPool: {{$agentPoolPrefix}}{{$i}}

- name: Waiting for latency pods to be running
measurements:
- Identifier: WaitForRunningLatencyDeployments
Method: WaitForControlledPodsRunning
Params:
action: gather

# Same bundle as the create step but with replicasPerNamespace: 0; per the step
# name this removes the Deployment created above.
- name: Deleting deployments
phases:
- namespaceRange:
min: 1
max: 1
replicasPerNamespace: 0
tuningSet: Uniform1qps
objectBundle:
- basename: resource-consumer
objectTemplatePath: deployment_template.yaml

- name: Waiting for latency pods to be deleted
measurements:
- Identifier: WaitForRunningLatencyDeployments
Method: WaitForControlledPodsRunning
Params:
action: gather
{{end}}
{{end}}

# Runs once, after all node pools and repeats have completed.
- name: Collect measurements
measurements:
- Identifier: ResourceUsageSummary
Method: ResourceUsageSummary
Params:
action: gather
55 changes: 55 additions & 0 deletions modules/python/clusterloader2/cri/config/deployment_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Deployment template for the resource-consumer pods.
# Each pod runs two containers from the same image: one that allocates memory
# with `stress` and one that burns CPU with `consume-cpu`.
# Defaults below apply when the caller's templateFillMap omits the parameter.
{{$Memory := DefaultParam .Memory "1000M"}}
{{$CPU := DefaultParam .CPU "--millicores=100"}}
{{$AgentPool := DefaultParam .AgentPool "userpool1"}}

apiVersion: apps/v1
kind: Deployment
metadata:
name: {{.Name}}
labels:
# The test config's measurements select on this label.
group: {{.Group}}
spec:
replicas: {{.Replicas}}
selector:
matchLabels:
name: {{.Name}}
template:
metadata:
labels:
name: {{.Name}}
group: {{.Group}}
spec:
# Pods are pinned to nodes carrying this agentpool label value.
nodeSelector:
agentpool: {{$AgentPool}}
containers:
- name: resource-consumer-memory
image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
command:
- stress
args:
- --vm
- "1"
- --vm-bytes
- {{$Memory}}
- --vm-hang
- "0"
- --timeout
- "3600"
# The request is fixed at 10Mi regardless of the --vm-bytes value passed above.
resources:
requests:
memory: "10Mi"
- name: resource-consumer-cpu
image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
command:
- ./consume-cpu/consume-cpu
args:
- --duration-sec=3600
# Expected to be a complete flag, e.g. "--millicores=100" (see the default above).
- {{$CPU}}
# The request is fixed at 10m regardless of the millicores value passed above.
resources:
requests:
cpu: "10m"
# Tolerates the cri-resource-consume=true:NoSchedule taint.
tolerations:
- key: "cri-resource-consume"
operator: "Equal"
value: "true"
effect: "NoSchedule"
152 changes: 152 additions & 0 deletions modules/python/clusterloader2/cri/cri.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import json
import os
import argparse
import re

from datetime import datetime, timezone
from utils import parse_xml_to_json, run_cl2_command, get_measurement
from kubernetes_client import KubernetesClient

# Per-provider pod count that override_config_clusterloader2 subtracts from
# max_pods to get the number of resource-consumer pods placed on each node.
# Presumably the number of daemonset pods each provider runs per node — confirm
# the values against the target clusters.
DAEMONSETS_PER_NODE_MAP = {
"aws": 3,
"aks": 6
}

def _cpu_to_millicores(quantity):
    """Convert a Kubernetes CPU quantity string (e.g. "3860m" or "4") to millicores."""
    if quantity.endswith("m"):
        return int(quantity[:-1])
    # Without the "m" suffix the quantity is expressed in cores, not millicores.
    return round(float(quantity) * 1000)


def _memory_to_kibibytes(quantity):
    """Convert a Kubernetes memory quantity string (e.g. "16374624Ki") to KiB."""
    binary_suffixes = (("Ki", 1), ("Mi", 1024), ("Gi", 1024 ** 2), ("Ti", 1024 ** 3))
    for suffix, factor in binary_suffixes:
        if quantity.endswith(suffix):
            return int(quantity[:-len(suffix)]) * factor
    # No binary suffix: treat the value as plain bytes.
    return int(quantity) // 1024


def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, provider, override_file):
    """Write the ClusterLoader2 overrides file for the resource-consume test.

    The per-pod CPU and memory figures are derived from the allocatable
    resources of the first node labelled ``cri-resource-consume=true``, divided
    evenly across the pods that fit on a node once the provider's daemonset
    pods are subtracted from ``max_pods``.

    Args:
        node_count: Number of nodes the deployment is spread over.
        max_pods: Maximum number of pods per node.
        repeats: Value written as ``CL2_REPEATS``.
        operation_timeout: Value written as ``CL2_OPERATION_TIMEOUT``.
        provider: Key into ``DAEMONSETS_PER_NODE_MAP``.
        override_file: Path of the overrides file to (over)write.

    Raises:
        Exception: If no node carries the ``cri-resource-consume=true`` label.
        KeyError: If ``provider`` is not in ``DAEMONSETS_PER_NODE_MAP``.
        ValueError: If ``max_pods`` leaves no room for test pods, or an
            allocatable quantity cannot be parsed.
    """
    client = KubernetesClient(os.path.expanduser("~/.kube/config"))
    nodes = client.get_nodes(label_selector="cri-resource-consume=true")
    if not nodes:
        raise Exception("No nodes found with the label cri-resource-consume=true")

    # Only the first matching node is inspected; the rest are taken to be the same size.
    node = nodes[0]
    allocatable_cpu = node.status.allocatable["cpu"]
    allocatable_memory = node.status.allocatable["memory"]
    print(f"Node {node.metadata.name} has allocatable cpu of {allocatable_cpu} and allocatable memory of {allocatable_memory}")

    cpu_value = _cpu_to_millicores(allocatable_cpu)
    memory_value = _memory_to_kibibytes(allocatable_memory)
    print(f"Node {node.metadata.name} has cpu value of {cpu_value} and memory value of {memory_value}")

    # Calculate request cpu and memory for each pod
    daemonset_pods = DAEMONSETS_PER_NODE_MAP[provider]
    pod_count = max_pods - daemonset_pods
    if pod_count <= 0:
        raise ValueError(
            f"max_pods ({max_pods}) must be greater than the {daemonset_pods} "
            f"daemonset pods reserved per node for provider {provider}"
        )
    replica = pod_count * node_count
    # 20 is subtracted from each per-pod share as headroom.
    cpu_request = cpu_value // pod_count - 20
    # The 1.024 factor converts KiB to the decimal "K" unit appended by the CL2 config.
    memory_request = int(memory_value * 1.024 // pod_count - 20)
    print(f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request}K, total replica: {replica}")

    override_lines = [
        f"CL2_DEPLOYMENT_SIZE: {replica}\n",
        f"CL2_RESOURCE_CONSUME_MEMORY: {memory_request}\n",
        f"CL2_RESOURCE_CONSUME_CPU: {cpu_request}\n",
        f"CL2_REPEATS: {repeats}\n",
        f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n",
        "CL2_PROMETHEUS_TOLERATE_MASTER: true\n",
        "CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 30.0\n",
        "CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n",
        "CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n",
    ]
    # The context manager closes the file; no explicit close() is needed.
    with open(override_file, 'w') as file:
        file.writelines(override_lines)

def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider):
    """Run ClusterLoader2 for this test with overrides and Prometheus enabled.

    Args:
        cl2_image: Name of the ClusterLoader2 image.
        cl2_config_dir: Path to the ClusterLoader2 config directory.
        cl2_report_dir: Path to the ClusterLoader2 report directory.
        kubeconfig: Path to the kubeconfig file.
        provider: Cloud provider name.
    """
    # run_cl2_command takes kubeconfig first, unlike this function's own signature.
    run_cl2_command(
        kubeconfig,
        cl2_image,
        cl2_config_dir,
        cl2_report_dir,
        provider,
        overrides=True,
        enable_prometheus=True,
    )

def collect_clusterloader2(
    node_count,
    max_pods,
    repeats,
    cl2_report_dir,
    cloud_info,
    run_id,
    run_url,
    result_file
):
    """Flatten the ClusterLoader2 report directory into a JSON-lines result file.

    The overall status comes from ``junit.xml``. Every file in the report
    directory that ``get_measurement`` recognises is then loaded as JSON, and
    one line is written per item of each percentile entry.

    Args:
        node_count: Number of nodes, recorded in every result line.
        max_pods: Maximum pods per node, recorded in every result line.
        repeats: Number of repeats, recorded as ``churn_rate``.
        cl2_report_dir: Directory containing ``junit.xml`` and measurement files.
        cloud_info: Cloud information, recorded in every result line.
        run_id: Run ID, recorded in every result line.
        run_url: Run URL, recorded in every result line.
        result_file: Path of the result file to (over)write.

    Raises:
        Exception: If the junit report contains no test suites.
    """
    details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2)
    json_data = json.loads(details)
    testsuites = json_data["testsuites"]

    if not testsuites:
        raise Exception(f"No testsuites found in the report! Raw data: {details}")
    status = "success" if testsuites[0]["failures"] == 0 else "failure"

    template = {
        "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        "node_count": node_count,
        "max_pods": max_pods,
        "churn_rate": repeats,
        "status": status,
        "group": None,
        "measurement": None,
        "percentile": None,
        "data": None,
        "test_details": details,
        "cloud_info": cloud_info,
        "run_id": run_id,
        "run_url": run_url
    }

    result_lines = []
    for entry in os.listdir(cl2_report_dir):
        file_path = os.path.join(cl2_report_dir, entry)
        # Classify by name before opening, so sub-directories and unrelated
        # files in the report directory are skipped instead of being opened.
        measurement, group_name = get_measurement(file_path)
        if not measurement:
            continue
        print(measurement, group_name)
        with open(file_path, 'r') as report:
            data = json.load(report)

        for percentile, items in data.items():
            for item in items:
                # dict(mapping, **overrides) keeps the template's key order,
                # so the serialised lines match the template layout.
                record = dict(
                    template,
                    group=group_name,
                    measurement=measurement,
                    percentile=percentile,
                    data=item,
                )
                result_lines.append(json.dumps(record) + "\n")

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    result_dir = os.path.dirname(result_file)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    with open(result_file, 'w') as out:
        out.write("".join(result_lines))

def main():
    """Parse the command line and dispatch to the selected sub-command.

    Sub-commands:
        override: write the ClusterLoader2 overrides file.
        execute: run ClusterLoader2.
        collect: convert the ClusterLoader2 report into the result file.

    A sub-command is mandatory; argparse exits with a usage error when it is
    missing, instead of the script silently doing nothing.
    """
    parser = argparse.ArgumentParser(description="CRI Kubernetes resources.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Sub-command for override_config_clusterloader2
    parser_override = subparsers.add_parser("override", help="Override CL2 config file")
    parser_override.add_argument("node_count", type=int, help="Number of nodes")
    parser_override.add_argument("max_pods", type=int, help="Number of maximum pods per node")
    parser_override.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment")
    # No default here: this is a required positional, so argparse would never apply one.
    parser_override.add_argument("operation_timeout", type=str, help="Operation timeout")
    parser_override.add_argument("provider", type=str, help="Cloud provider name")
    parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file")

    # Sub-command for execute_clusterloader2
    parser_execute = subparsers.add_parser("execute", help="Execute scale up operation")
    parser_execute.add_argument("cl2_image", type=str, help="Name of the CL2 image")
    parser_execute.add_argument("cl2_config_dir", type=str, help="Path to the CL2 config directory")
    parser_execute.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory")
    parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file")
    parser_execute.add_argument("provider", type=str, help="Cloud provider name")

    # Sub-command for collect_clusterloader2
    parser_collect = subparsers.add_parser("collect", help="Collect scale up data")
    parser_collect.add_argument("node_count", type=int, help="Number of nodes")
    parser_collect.add_argument("max_pods", type=int, help="Number of maximum pods per node")
    parser_collect.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment")
    parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory")
    parser_collect.add_argument("cloud_info", type=str, help="Cloud information")
    parser_collect.add_argument("run_id", type=str, help="Run ID")
    parser_collect.add_argument("run_url", type=str, help="Run URL")
    parser_collect.add_argument("result_file", type=str, help="Path to the result file")

    args = parser.parse_args()

    if args.command == "override":
        override_config_clusterloader2(
            args.node_count,
            args.max_pods,
            args.repeats,
            args.operation_timeout,
            args.provider,
            args.cl2_override_file,
        )
    elif args.command == "execute":
        execute_clusterloader2(
            args.cl2_image,
            args.cl2_config_dir,
            args.cl2_report_dir,
            args.kubeconfig,
            args.provider,
        )
    elif args.command == "collect":
        collect_clusterloader2(
            args.node_count,
            args.max_pods,
            args.repeats,
            args.cl2_report_dir,
            args.cloud_info,
            args.run_id,
            args.run_url,
            args.result_file,
        )

if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions modules/python/clusterloader2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
}
NETWORK_METRIC_PREFIXES = ["APIResponsivenessPrometheus", "InClusterNetworkLatency", "NetworkProgrammingLatency"]
PROM_QUERY_PREFIX = "GenericPrometheusQuery"
RESOURCE_USAGE_SUMMARY_PREFIX = "ResourceUsageSummary"

def run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file="config.yaml", overrides=False, enable_prometheus=False, enable_exec_service=False):
docker_client = DockerClient()
Expand Down Expand Up @@ -59,6 +60,9 @@ def get_measurement(file_path):
group_name = file_name.split("_")[1]
measurement_name = file_name.split("_")[0][len(PROM_QUERY_PREFIX)+1:]
return measurement_name, group_name
if file_name.startswith(RESOURCE_USAGE_SUMMARY_PREFIX):
group_name = file_name.split("_")[1]
return RESOURCE_USAGE_SUMMARY_PREFIX, group_name
return None, None

def parse_xml_to_json(file_path, indent = 0):
Expand Down
4 changes: 2 additions & 2 deletions modules/terraform/azure/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Set environment variables for a specific test scenario. In this guide, we'll use
Run the following commands from the root of the repository:
```bash
SCENARIO_TYPE=perf-eval
SCENARIO_NAME=nap-c4n10p100
SCENARIO_NAME=cri-resource-consume
RUN_ID=$(date +%s)
CLOUD=azure
REGION=eastus2
Expand Down Expand Up @@ -53,7 +53,7 @@ export ARM_SUBSCRIPTION_ID=$(az account show --query id -o tsv)
Create Resource Group for testing

```bash
az group create --name $RUN_ID --location $REGION --tags "run_id=$RUN_ID" "scenario=${SCENARIO_TYPE}-${SCENARIO_NAME}" "owner=aks" "creation_date=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" "deletion_due_time=$(date -u -d '+2 hour' +'%Y-%m-%dT%H:%M:%SZ')"
az group create --name $RUN_ID --location $REGION --tags "run_id=$RUN_ID" "scenario=${SCENARIO_TYPE}-${SCENARIO_NAME}" "owner=aks" "creation_date=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" "deletion_due_time=$(date -u -v +2H +'%Y-%m-%dT%H:%M:%SZ')"
```

Set `INPUT_JSON` variable. This variable is not exhaustive and may vary depending on the scenario. For a full list of what can be set, look for `json_input` in file [`modules/terraform/azure/variables.tf`](../../../modules/terraform/azure/variables.tf) as the list will keep changing as we add more features.
Expand Down
59 changes: 59 additions & 0 deletions pipelines/perf-eval/CRI Benchmark/cri-resource-consume.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Scheduled pipeline for the cri-resource-consume perf-eval scenario.
# It is not triggered by commits (trigger: none); the cron schedule below runs it
# against the main branch.
trigger: none
schedules:
- cron: "0 */12 * * *"
displayName: "Every 12 Hour"
branches:
include:
- main
always: true

variables:
SCENARIO_TYPE: perf-eval
SCENARIO_NAME: cri-resource-consume
SCENARIO_VERSION: main

# Two independent stages (dependsOn: []), one per cloud, that pass the same
# engine input and matrix to the shared competitive-test job template.
stages:
- stage: azure_westeurope
dependsOn: []
jobs:
- template: /jobs/competitive-test.yml
parameters:
cloud: azure
regions:
- westeurope
engine: clusterloader2
engine_input:
image: "ghcr.io/azure/clusterloader2:v20241016"
topology: cri-resource-consume
matrix:
# Presumably "n3" = node_count 3 and "p330" = 3 x 110 pods in total — confirm.
n3-p330:
node_count: 3
max_pods: 110
repeats: 1
operation_timeout: 5m
max_parallel: 1
timeout_in_minutes: 120
credential_type: service_connection
ssh_key_enabled: false
# NOTE(review): this stage is named aws_westeurope but its region is us-west-1 —
# confirm which of the two is intended.
- stage: aws_westeurope
dependsOn: []
jobs:
- template: /jobs/competitive-test.yml
parameters:
cloud: aws
regions:
- us-west-1
engine: clusterloader2
engine_input:
image: "ghcr.io/azure/clusterloader2:v20241016"
topology: cri-resource-consume
matrix:
n3-p330:
node_count: 3
max_pods: 110
repeats: 1
operation_timeout: 5m
max_parallel: 1
timeout_in_minutes: 120
credential_type: service_connection
ssh_key_enabled: false
Loading

0 comments on commit 79a5d23

Please sign in to comment.