
Commit

Merge branch 'main' of https://github.com/Azure/telescope into network-churn

agrawaliti committed Dec 30, 2024
2 parents d69ce5d + 34f11b2 commit 5024f5c
Showing 82 changed files with 2,121 additions and 106 deletions.
Binary file added .DS_Store
6 changes: 3 additions & 3 deletions .github/workflows/terraform-validation.yml
@@ -23,7 +23,7 @@ jobs:
       - name: Setup Terraform
         uses: hashicorp/setup-terraform@v2
         with:
-          terraform_version: 1.6.0
+          terraform_version: 1.10.0
 
       - name: Terraform Format Check
         if: always()
@@ -60,7 +60,7 @@ jobs:
       - name: Terraform Lint Check
         run: tflint --recursive --config "$GITHUB_WORKSPACE/.tflint.hcl" --minimum-failure-severity=warning
 
-      - name: Terraform Test
+      - name: Terraform AWS Test
         working-directory: ${{ env.TERRAFORM_AWS_MODULES_DIR }}
         run: terraform test
         env:
@@ -80,7 +80,7 @@ jobs:
         id: setup-matrix-scenarios
         run: |
           set -eux
-          matrix=$(find $GITHUB_WORKSPACE/scenarios/ -name "*.tfvars" | awk -F'/' '{split($11, file_name, "."); split(file_name[1], cloud_region, "-");region= (length(cloud_region) > 1) ? substr($11, index($11, "-") + 1) : ""; cloud=cloud_region[1]; gsub(".json", "", region); print "{\"cloud\": \"" cloud "\", \"file_name\": \"" file_name[1] "\", " (region != "" ? "\"region\": \"" region "\", " : "") "\"scenario_type\": \"" $8 "\", \"scenario_name\": \"" $9 "\"},"}' | sort | uniq | sed 's/,$/,/')
+          matrix=$(find $GITHUB_WORKSPACE/scenarios/ -name "*.tfvars" | grep -E '(aws|azure)' | awk -F'/' '{split($11, file_name, "."); split(file_name[1], cloud_region, "-");region= (length(cloud_region) > 1) ? substr($11, index($11, "-") + 1) : ""; cloud=cloud_region[1]; gsub(".json", "", region); print "{\"cloud\": \"" cloud "\", \"file_name\": \"" file_name[1] "\", " (region != "" ? "\"region\": \"" region "\", " : "") "\"scenario_type\": \"" $8 "\", \"scenario_name\": \"" $9 "\"},"}' | sort | uniq | sed 's/,$/,/')
           matrix_array="[${matrix%,}]"
           file_changes=$(git diff --name-only -r origin/main HEAD)
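The matrix expression above packs the path parsing into a single awk program keyed on absolute-path field positions. A minimal Python sketch of the same per-path logic, assuming the repository layout `scenarios/<scenario_type>/<scenario_name>/terraform-inputs/<cloud>[-<region>].tfvars` (the helper name `parse_scenario_path` is illustrative, not part of the workflow):

```python
from pathlib import PurePosixPath

def parse_scenario_path(path: str) -> dict:
    """Derive cloud/region/scenario fields from a .tfvars path, mirroring the awk logic."""
    parts = PurePosixPath(path).parts
    idx = parts.index("scenarios")
    scenario_type, scenario_name = parts[idx + 1], parts[idx + 2]
    file_name = PurePosixPath(path).stem        # e.g. "azure-eastus"
    cloud, _, region = file_name.partition("-") # region is "" when there is no dash
    entry = {"cloud": cloud, "file_name": file_name}
    if region:
        entry["region"] = region
    entry["scenario_type"] = scenario_type
    entry["scenario_name"] = scenario_name
    return entry
```

Unlike awk's fixed `$8`/`$9`/`$11` fields, anchoring on the `scenarios` component keeps the parse independent of how deep `$GITHUB_WORKSPACE` is nested.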
44 changes: 15 additions & 29 deletions README.md
@@ -2,13 +2,19 @@
 
 # Telescope
 
-Telescope is a framework built to test and compare cloud products and services, focusing on evaluating scalability and performance. It enables users to make informed, data-driven decisions for their multi-cloud strategies. Currently, Telescope supports Azure and AWS, with plans to include GCP in the near future.
+Telescope is a framework built to test and compare cloud products and services, focusing on evaluating scalability and performance. It enables users to make informed, data-driven decisions for their multi-cloud strategies on Azure, AWS, and GCP.
 
-The currently available test scenarios are:
-1. Kubernetes API server benchmark using [kperf](https://github.com/Azure/kperf/pkgs/container/kperf)
-2. Kubernetes Autoscaling benchmark using [clusterloader2](https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/)
-
-with more coming soon.
+The currently supported Kubernetes test scenarios are:
+1. [API Server Benchmark](pipelines/perf-eval/API%20Server%20Benchmark)
+2. [Autoscale Benchmark](pipelines/perf-eval/Autoscale%20Benchmark)
+3. [Container Networking Benchmark](pipelines/perf-eval/CNI%20Benchmark)
+4. [Container Storage Benchmark](pipelines/perf-eval/CSI%20Benchmark/)
+5. Container Runtime Benchmark (incoming)
+
+The currently integrated test tools are:
+1. [kperf](https://github.com/Azure/kperf/pkgs/container/kperf)
+2. [clusterloader2](https://github.com/kubernetes/perf-tests/blob/master/clusterloader2/)
+3. [resource-consumer](https://github.com/kubernetes/kubernetes/blob/master/test/images/resource-consumer/README.md) (incoming)
 
 ## Design
 ![design](./docs/imgs/design.jpeg)
@@ -24,30 +30,10 @@ Telescope offers three primary reusable components:
 
 1. **Terraform modules** to manage test resource setup and provide reproducibility.
 2. **Python modules** for seamless integration with testing and measurement tools.
-3. **Azure services** including Pipeline, Blob Storage, Event Hub, and Data Explorer for continuous monitoring.
-
-## Quick Start
-1. Setup test framework by running commands as follows:
-```bash
-az login
-aws configure
-
-export AZDO_PERSONAL_ACCESS_TOKEN=<Azure DevOps Personal Access Token>
-export AZDO_ORG_SERVICE_URL=https://dev.azure.com/<Azure DevOps Org Name>
-export AZDO_GITHUB_SERVICE_CONNECTION_PAT=<GitHub Personal Access Token>
-export TF_VAR_resource_group_name=<Resource Group Name>
-export TF_VAR_storage_account_name=<Storage Account Name>
-export TF_VAR_kusto_cluster_name=<Kusto Cluster Name>
-
-cd modules/terraform/setup
-make all
-```
-
-2. Run pipeline or wait for scheduled run on Azure DevOps
-![pipeline](./docs/imgs/pipeline.jpeg)
-
-3. Import [dashboard](./dashboards/example.json) and check test results on Azure Data Explorer
-![results](./docs/imgs/results.jpeg)
+3. **Data Analytics** including Blob Storage, Event Hub, and Data Explorer for continuous monitoring.
 
 ## Setup
 [Read more](modules/terraform/setup/README.md)
 
 ## Contributing
Binary file added modules/.DS_Store
92 changes: 92 additions & 0 deletions modules/python/clusterloader2/cri/config/config.yaml
@@ -0,0 +1,92 @@
name: resource-consumer

{{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 10}}
{{$memory := DefaultParam .CL2_RESOURCE_CONSUME_MEMORY 100}}
{{$cpu := DefaultParam .CL2_RESOURCE_CONSUME_CPU 100}}
{{$repeats := DefaultParam .CL2_REPEATS 1}}
{{$nodePools := DefaultParam .CL2_NODEPOOL 1}}
{{$agentPoolPrefix := DefaultParam .CL2_AGENTPOOL_PREFIX "userpool"}}
{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "5m"}}

namespace:
  number: 1
  prefix: resource-consumer
  deleteStaleNamespaces: true
  deleteAutomanagedNamespaces: true
  enableExistingNamespaces: false

tuningSets:
- name: Uniform1qps
  qpsLoad:
    qps: 1

steps:
- name: Start measurements
  measurements:
  - Identifier: ResourceUsageSummary
    Method: ResourceUsageSummary
    Params:
      action: start
      labelSelector: group = resource-consumer
  - Identifier: WaitForRunningLatencyDeployments
    Method: WaitForControlledPodsRunning
    Params:
      action: start
      checkIfPodsAreUpdated: true
      apiVersion: apps/v1
      kind: Deployment
      labelSelector: group = resource-consumer
      operationTimeout: {{$operationTimeout}}

{{range $i := Loop $nodePools}}
{{range $j := Loop $repeats}}
- name: Create deployment {{$j}}
  phases:
  - namespaceRange:
      min: 1
      max: 1
    replicasPerNamespace: 1
    tuningSet: Uniform1qps
    objectBundle:
    - basename: resource-consumer
      objectTemplatePath: deployment_template.yaml
      templateFillMap:
        Replicas: {{$deploymentSize}}
        Group: resource-consumer
        Memory: {{$memory}}K
        CPU: --millicores={{$cpu}}
        AgentPool: {{$agentPoolPrefix}}{{$i}}

- name: Waiting for latency pods to be running
  measurements:
  - Identifier: WaitForRunningLatencyDeployments
    Method: WaitForControlledPodsRunning
    Params:
      action: gather

- name: Deleting deployments
  phases:
  - namespaceRange:
      min: 1
      max: 1
    replicasPerNamespace: 0
    tuningSet: Uniform1qps
    objectBundle:
    - basename: resource-consumer
      objectTemplatePath: deployment_template.yaml

- name: Waiting for latency pods to be deleted
  measurements:
  - Identifier: WaitForRunningLatencyDeployments
    Method: WaitForControlledPodsRunning
    Params:
      action: gather
{{end}}
{{end}}

- name: Collect measurements
  measurements:
  - Identifier: ResourceUsageSummary
    Method: ResourceUsageSummary
    Params:
      action: gather
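The nested `{{range}}` loops above expand into one create/wait/delete/wait cycle per node pool per repeat, bracketed by the start and collect measurement steps. A minimal Python sketch of that expansion (the helper name `expand_steps` is hypothetical, for illustration only):

```python
def expand_steps(node_pools: int, repeats: int, agent_pool_prefix: str = "userpool") -> list:
    """Enumerate the step sequence the nested {{range}} loops expand to."""
    steps = ["Start measurements"]
    for i in range(node_pools):      # {{range $i := Loop $nodePools}}
        for j in range(repeats):     # {{range $j := Loop $repeats}}
            steps += [
                f"Create deployment {j} on {agent_pool_prefix}{i}",
                "Waiting for latency pods to be running",
                "Deleting deployments",
                "Waiting for latency pods to be deleted",
            ]
    steps.append("Collect measurements")
    return steps
```

So `CL2_NODEPOOL: 2` with `CL2_REPEATS: 3` yields 2 + 2 × 3 × 4 = 26 steps, which is what "churn rate" measures here: how many times the full deployment is created and torn down per node pool.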
55 changes: 55 additions & 0 deletions modules/python/clusterloader2/cri/config/deployment_template.yaml
@@ -0,0 +1,55 @@
{{$Memory := DefaultParam .Memory "1000M"}}
{{$CPU := DefaultParam .CPU "--millicores=100"}}
{{$AgentPool := DefaultParam .AgentPool "userpool1"}}

apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{.Name}}
  labels:
    group: {{.Group}}
spec:
  replicas: {{.Replicas}}
  selector:
    matchLabels:
      name: {{.Name}}
  template:
    metadata:
      labels:
        name: {{.Name}}
        group: {{.Group}}
    spec:
      nodeSelector:
        agentpool: {{$AgentPool}}
      containers:
      - name: resource-consumer-memory
        image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
        command:
        - stress
        args:
        - --vm
        - "1"
        - --vm-bytes
        - {{$Memory}}
        - --vm-hang
        - "0"
        - --timeout
        - "3600"
        resources:
          requests:
            memory: "10Mi"
      - name: resource-consumer-cpu
        image: registry.k8s.io/e2e-test-images/resource-consumer:1.9
        command:
        - ./consume-cpu/consume-cpu
        args:
        - --duration-sec=3600
        - {{$CPU}}
        resources:
          requests:
            cpu: "10m"
      tolerations:
      - key: "cri-resource-consume"
        operator: "Equal"
        value: "true"
        effect: "NoSchedule"
151 changes: 151 additions & 0 deletions modules/python/clusterloader2/cri/cri.py
@@ -0,0 +1,151 @@
import json
import os
import argparse

from datetime import datetime, timezone
from utils import parse_xml_to_json, run_cl2_command, get_measurement
from kubernetes_client import KubernetesClient

DAEMONSETS_PER_NODE_MAP = {
    "aws": 3,
    "aks": 6
}

def override_config_clusterloader2(node_count, max_pods, repeats, operation_timeout, provider, override_file):
    client = KubernetesClient(os.path.expanduser("~/.kube/config"))
    nodes = client.get_nodes(label_selector="cri-resource-consume=true")
    if len(nodes) == 0:
        raise Exception("No nodes found with the label cri-resource-consume=true")

    node = nodes[0]
    allocatable_cpu = node.status.allocatable["cpu"]
    allocatable_memory = node.status.allocatable["memory"]
    print(f"Node {node.metadata.name} has allocatable cpu of {allocatable_cpu} and allocatable memory of {allocatable_memory}")

    cpu_value = int(allocatable_cpu.replace("m", ""))
    memory_value = int(allocatable_memory.replace("Ki", ""))
    print(f"Node {node.metadata.name} has cpu value of {cpu_value} and memory value of {memory_value}")

    # Calculate the request CPU and memory for each pod, leaving room for
    # per-node daemonsets and a small safety margin of 20 units each
    pod_count = max_pods - DAEMONSETS_PER_NODE_MAP[provider]
    replica = pod_count * node_count
    cpu_request = cpu_value // pod_count - 20
    memory_request = int(memory_value * 1.024 // pod_count - 20)
    print(f"CPU request for each pod: {cpu_request}m, memory request for each pod: {memory_request}K, total replica: {replica}")

    with open(override_file, 'w') as file:
        file.write(f"CL2_DEPLOYMENT_SIZE: {replica}\n")
        file.write(f"CL2_RESOURCE_CONSUME_MEMORY: {memory_request}\n")
        file.write(f"CL2_RESOURCE_CONSUME_CPU: {cpu_request}\n")
        file.write(f"CL2_REPEATS: {repeats}\n")
        file.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n")
        file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n")
        file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 30.0\n")
        file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n")
        file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n")

def execute_clusterloader2(cl2_image, cl2_config_dir, cl2_report_dir, kubeconfig, provider):
    run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, overrides=True, enable_prometheus=True)

def collect_clusterloader2(
    node_count,
    max_pods,
    repeats,
    cl2_report_dir,
    cloud_info,
    run_id,
    run_url,
    result_file
):
    details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent=2)
    json_data = json.loads(details)
    testsuites = json_data["testsuites"]

    if testsuites:
        status = "success" if testsuites[0]["failures"] == 0 else "failure"
    else:
        raise Exception(f"No testsuites found in the report! Raw data: {details}")

    template = {
        "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        "node_count": node_count,
        "max_pods": max_pods,
        "churn_rate": repeats,
        "status": status,
        "group": None,
        "measurement": None,
        "percentile": None,
        "data": None,
        "cloud_info": cloud_info,
        "run_id": run_id,
        "run_url": run_url
    }

    content = ""
    for file_name in os.listdir(cl2_report_dir):
        file_path = os.path.join(cl2_report_dir, file_name)
        with open(file_path, 'r') as report_file:
            measurement, group_name = get_measurement(file_path)
            if not measurement:
                continue
            print(measurement, group_name)
            data = json.loads(report_file.read())

            for percentile, items in data.items():
                template["measurement"] = measurement
                template["group"] = group_name
                template["percentile"] = percentile
                for item in items:
                    template["data"] = item
                    content += json.dumps(template) + "\n"

    os.makedirs(os.path.dirname(result_file), exist_ok=True)
    with open(result_file, 'w') as f:
        f.write(content)

def main():
    parser = argparse.ArgumentParser(description="CRI Kubernetes resources.")
    subparsers = parser.add_subparsers(dest="command")

    # Sub-command for override_config_clusterloader2
    parser_override = subparsers.add_parser("override", help="Override CL2 config file")
    parser_override.add_argument("node_count", type=int, help="Number of nodes")
    parser_override.add_argument("max_pods", type=int, help="Number of maximum pods per node")
    parser_override.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment")
    parser_override.add_argument("operation_timeout", type=str, default="2m", help="Operation timeout")
    parser_override.add_argument("provider", type=str, help="Cloud provider name")
    parser_override.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file")

    # Sub-command for execute_clusterloader2
    parser_execute = subparsers.add_parser("execute", help="Execute scale up operation")
    parser_execute.add_argument("cl2_image", type=str, help="Name of the CL2 image")
    parser_execute.add_argument("cl2_config_dir", type=str, help="Path to the CL2 config directory")
    parser_execute.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory")
    parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file")
    parser_execute.add_argument("provider", type=str, help="Cloud provider name")

    # Sub-command for collect_clusterloader2
    parser_collect = subparsers.add_parser("collect", help="Collect scale up data")
    parser_collect.add_argument("node_count", type=int, help="Number of nodes")
    parser_collect.add_argument("max_pods", type=int, help="Number of maximum pods per node")
    parser_collect.add_argument("repeats", type=int, help="Number of times to repeat the resource consumer deployment")
    parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory")
    parser_collect.add_argument("cloud_info", type=str, help="Cloud information")
    parser_collect.add_argument("run_id", type=str, help="Run ID")
    parser_collect.add_argument("run_url", type=str, help="Run URL")
    parser_collect.add_argument("result_file", type=str, help="Path to the result file")

    args = parser.parse_args()

    if args.command == "override":
        override_config_clusterloader2(args.node_count, args.max_pods, args.repeats, args.operation_timeout, args.provider, args.cl2_override_file)
    elif args.command == "execute":
        execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.kubeconfig, args.provider)
    elif args.command == "collect":
        collect_clusterloader2(args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, args.result_file)

if __name__ == "__main__":
    main()
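The sizing arithmetic in `override_config_clusterloader2` above can be isolated for a quick sanity check. A sketch with illustrative inputs (the 7820m CPU and 16000000Ki memory values below are made-up examples, not measured defaults; the helper name `pod_requests` is hypothetical):

```python
DAEMONSETS_PER_NODE = {"aws": 3, "aks": 6}

def pod_requests(node_count, max_pods, provider, allocatable_cpu_m, allocatable_memory_ki):
    """Reproduce the sizing arithmetic from override_config_clusterloader2.

    Reserves room for per-node daemonsets, splits the node's allocatable
    CPU/memory across the remaining pods, and subtracts a safety margin of
    20 units each. Memory is converted from Ki to K (1 Ki = 1.024 K) because
    the stress container's --vm-bytes flag takes a K suffix.
    """
    pod_count = max_pods - DAEMONSETS_PER_NODE[provider]
    replicas = pod_count * node_count
    cpu_request_m = allocatable_cpu_m // pod_count - 20
    memory_request_k = int(allocatable_memory_ki * 1.024 // pod_count - 20)
    return replicas, cpu_request_m, memory_request_k
```

For example, `pod_requests(10, 30, "aks", 7820, 16000000)` gives 24 consumable pods per node, so 240 replicas total, with each pod requesting 305m CPU (7820 // 24 - 20) and 682646K memory.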
