draft #515 (Draft)

Wants to merge 37 commits into base: main.

Commits (37)
1497e79 init commit: comparative testing (jshr-w, Dec 13, 2024)
9ede0d2 switch to 1 repeat (jshr-w, Dec 13, 2024)
b5c014f run 3 in parallel (jshr-w, Dec 13, 2024)
182a705 try update run_id (jshr-w, Dec 13, 2024)
7b7d295 attempt 2 (jshr-w, Dec 13, 2024)
dc706ae attempt3 (jshr-w, Dec 13, 2024)
19f9ab6 attempt4 (jshr-w, Dec 13, 2024)
f991bc8 add name, output name (jshr-w, Dec 13, 2024)
2833974 python... (jshr-w, Dec 13, 2024)
3f7db7b 1k50k (jshr-w, Dec 13, 2024)
a1f6b89 add watch measurements, up to 100k pods (jshr-w, Dec 13, 2024)
681d221 tiny test (jshr-w, Dec 13, 2024)
7ea524a fix (jshr-w, Dec 13, 2024)
9f3e762 up to 100k (jshr-w, Dec 13, 2024)
ce3557c fix (jshr-w, Dec 13, 2024)
e6839a6 fix (jshr-w, Dec 13, 2024)
57ff7b5 fix (jshr-w, Dec 13, 2024)
8b1abbf let's try 100k for real (jshr-w, Dec 14, 2024)
d00f7f7 schedule... 🙏 (jshr-w, Dec 14, 2024)
4132291 scale test 1kn25kp (jshr-w, Dec 16, 2024)
a325a16 temp (jshr-w, Dec 16, 2024)
f24ecaf hmm (jshr-w, Dec 16, 2024)
a9ee3a0 revert (jshr-w, Dec 17, 2024)
3c328ea new clusters (jshr-w, Dec 17, 2024)
345dd82 temporarily disable svc (jshr-w, Dec 18, 2024)
350a6df overwrite (jshr-w, Dec 18, 2024)
85e9854 fix (jshr-w, Dec 18, 2024)
ccc7b0e schedule (jshr-w, Dec 18, 2024)
c46e015 trying a different cluster (jshr-w, Dec 20, 2024)
b3a944c re-enable svcs (jshr-w, Jan 3, 2025)
b1c1f21 add flowcontrol metrics (init) (jshr-w, Jan 7, 2025)
5ebdb57 new clusters, standard service churn test (jshr-w, Jan 8, 2025)
7af17c4 temp repeat 1 (jshr-w, Jan 8, 2025)
13c7d71 10 repeats (jshr-w, Jan 8, 2025)
72903b7 run every 12 hours (jshr-w, Jan 14, 2025)
67f2b17 RG as argument (jshr-w, Jan 28, 2025)
185904a remove ces stage (jshr-w, Jan 30, 2025)
4 changes: 3 additions & 1 deletion jobs/competitive-test.yml
@@ -48,6 +48,8 @@ parameters:
- name: ssh_key_enabled
type: boolean
default: true
- name: rg
type: string

jobs:
- job: ${{ parameters.cloud }}
@@ -61,7 +63,7 @@ jobs:
parameters:
cloud: ${{ parameters.cloud }}
region: ${{ parameters.regions[0] }}
run_id: ${{ parameters.run_id }}
run_id: ${{ parameters.rg }}
test_modules_dir: ${{ parameters.test_modules_dir }}
retry_attempt_count: ${{ parameters.retry_attempt_count }}
credential_type: ${{ parameters.credential_type }}
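The template gains a required rg string parameter and forwards it as run_id to the downstream steps, so the run identifier can point at a pre-provisioned resource group instead of the pipeline-generated run id. A minimal caller sketch showing only the new parameter (all other template parameters elided; the variable name mirrors the pipeline file further below):

# Hypothetical caller excerpt; only the new parameter is shown.
- template: /jobs/competitive-test.yml
  parameters:
    cloud: azure
    rg: $(DISABLECRD_RG)   # resource group name reused as run_id inside the template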
@@ -151,4 +151,4 @@ steps:
path: /modules/measurements.yaml
params:
action: gather
group: {{$groupName}}
group: {{$groupName}}
109 changes: 108 additions & 1 deletion modules/python/clusterloader2/slo/config/modules/measurements.yaml
@@ -9,6 +9,7 @@
{{$NETWORK_LATENCY_THRESHOLD := DefaultParam .CL2_NETWORK_LATENCY_THRESHOLD "0s"}}
{{$PROBE_MEASUREMENTS_PING_SLEEP_DURATION := DefaultParam .CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION "1s"}}
{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}}
{{$ENABLE_TERMINATED_WATCHES_MEASUREMENT := DefaultParam .CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT false}}

# Probe measurements shared parameter
{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}}
@@ -105,4 +105,110 @@ steps:
- name: Perc90
query: quantile(0.90, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
- name: Perc50
query: quantile(0.5, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
query: quantile(0.5, max_over_time(process_resident_memory_bytes{endpoint="apiserver"}[%v:]) / 1024 / 1024)
{{if $ENABLE_TERMINATED_WATCHES_MEASUREMENT}}
- Identifier: TerminatedWatchesMetrics
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Terminated Watches
metricVersion: v1
unit: count
dimensions:
- resource
queries:
- name: Terminated watches
query: sum(increase(apiserver_terminated_watchers_total[%v:])) by (resource)
- Identifier: WatchCacheInitializations
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Watch Cache Initializations
metricVersion: v1
unit: count
dimensions:
- resource
queries:
- name: Watch cache reinitializations
query: sum(increase(apiserver_watch_cache_initializations_total[%v:])) by (resource)
{{end}}
- Identifier: ApiserverFlowcontrolCurrentExecutingRequests
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Apiserver Flowcontrol Current Executing Requests
metricVersion: v1
unit: count
enableViolations: true
dimensions:
- instance
- priority_level
queries:
- name: CurrentExecutingByPriorityLevel
query: sum(increase(apiserver_flowcontrol_current_executing_requests[%v:])) by (priority_level)
- name: CurrentExecutingByInstancePriorityLevel
query: sum(increase(apiserver_flowcontrol_current_executing_requests[%v:])) by (instance, priority_level)
- Identifier: ApiserverFlowcontrolRequestWaitDuration
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Apiserver Flowcontrol Request Wait Duration
metricVersion: v1
unit: s
enableViolations: true
dimensions:
- instance
- priority_level
queries:
- name: RequestWaitDurationByPriorityLevel
query: histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket[%v:])) by (le, priority_level))
- name: RequestWaitDurationByInstancePriorityLevel
query: histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_wait_duration_seconds_bucket[%v:])) by (le, instance, priority_level))
- Identifier: ApiserverFlowcontrolRequestExecution
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Apiserver Flowcontrol Request Execution
metricVersion: v1
unit: s
enableViolations: true
dimensions:
- instance
- priority_level
queries:
- name: RequestExecutionByPriorityLevel
query: histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket[%v:])) by (le, priority_level))
- name: RequestExecutionByInstancePriorityLevel
query: histogram_quantile(0.99, sum(rate(apiserver_flowcontrol_request_execution_seconds_bucket[%v:])) by (le, instance, priority_level))
- Identifier: ApiserverFlowcontrolRequestsRatebyPriorityLevel
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Apiserver Flowcontrol Requests Rate by Priority Level
metricVersion: v1
unit: count
enableViolations: true
dimensions:
- instance
- priority_level
queries:
- name: RequestRateByPriorityLevel
query: sum(irate(apiserver_flowcontrol_dispatched_requests_total[%v:])) by (priority_level)
- name: RequestRateByInstancePriorityLevel
query: sum(irate(apiserver_flowcontrol_dispatched_requests_total[%v:])) by (instance, priority_level)
- Identifier: ApiserverFlowcontrolRequestsMaxInqueue
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: Apiserver Flowcontrol Max Inqueue
metricVersion: v1
unit: count
enableViolations: true
dimensions:
- instance
- priority_level
queries:
- name: MaxInqueueByPriorityLevel
query: max(apiserver_flowcontrol_current_inqueue_requests[%v:]) by (priority_level)
- name: MaxInqueueByInstancePriorityLevel
query: max(apiserver_flowcontrol_current_inqueue_requests[%v:]) by (instance, priority_level)
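The terminated-watches and watch-cache blocks above are gated behind CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT (default false), while the new flow-control measurements are always emitted. A minimal sketch of a CL2 override file that switches the gated block on; the file name is illustrative, and the values mirror what configure_clusterloader2 writes in slo.py below:

# overrides.yaml (illustrative name)
CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0
CL2_PROMETHEUS_NODE_SELECTOR: "prometheus: \"true\""
CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m
CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT: true   # enables TerminatedWatchesMetrics and WatchCacheInitializations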
13 changes: 11 additions & 2 deletions modules/python/clusterloader2/slo/slo.py
@@ -72,6 +72,7 @@ def configure_clusterloader2(
file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n")
file.write("CL2_PROMETHEUS_NODE_SELECTOR: \"prometheus: \\\"true\\\"\"\n")
file.write("CL2_POD_STARTUP_LATENCY_THRESHOLD: 3m\n")
file.write("CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT: true\n") # TODO: Consider gating

if cilium_enabled:
file.write("CL2_CILIUM_METRICS_ENABLED: true\n")
@@ -116,7 +117,9 @@ def collect_clusterloader2(
run_url,
service_test,
result_file,
test_type="default_config",
test_type,
start_timestamp,
name,
):
details = parse_xml_to_json(os.path.join(cl2_report_dir, "junit.xml"), indent = 2)
json_data = json.loads(details)
@@ -147,6 +150,8 @@
"run_id": run_id,
"run_url": run_url,
"test_type": test_type,
"start_timestamp": start_timestamp,
"name": name,
}
content = ""
for f in os.listdir(cl2_report_dir):
@@ -230,9 +235,12 @@ def main():
parser_collect.add_argument("result_file", type=str, help="Path to the result file")
parser_collect.add_argument("test_type", type=str, nargs='?', default="default-config",
help="Description of test type")
parser_collect.add_argument("start_timestamp", type=str, help="Test start timestamp")
parser_collect.add_argument("name", type=str, help="Test Name")

args = parser.parse_args()

startTimestamp = ""
if args.command == "configure":
configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods,
args.repeats, args.operation_timeout, args.provider, args.cilium_enabled,
@@ -245,7 +253,8 @@ def main():
elif args.command == "collect":
collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.repeats,
args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url,
args.service_test, args.result_file, args.test_type)
args.service_test, args.result_file, args.test_type, args.start_timestamp,
args.name)

if __name__ == "__main__":
main()
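With the two new positional arguments wired through, each record that collect_clusterloader2 writes to the result file carries the extra metadata next to the existing run fields. A rough sketch of the affected keys in one record (other fields elided, all values illustrative):

# Illustrative excerpt of one result record; only fields touched by this change are shown.
run_id: "<resource group passed through the rg template parameter>"
run_url: "<pipeline run URL>"
test_type: "default-config"
start_timestamp: "2025-01-30T00:00:00Z"   # captured by the new step in steps/execute-tests.yml
name: "azure_cilium_disablecrd"           # job/matrix name propagated from the pipeline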
106 changes: 106 additions & 0 deletions pipelines/perf-eval/CNI Benchmark/deploymentchurn-1kcomparison.yml
@@ -0,0 +1,106 @@
trigger: none
schedules:
- cron: "0 */12 * * *"
displayName: "Every 12 Hours Daily"
branches:
include:
- jshr/churn1k-comparison
always: true

variables:
SCENARIO_TYPE: perf-eval
SCENARIO_NAME: deploymentchurn-1kcomparison
SCENARIO_VERSION: main

stages:
# - stage: azure_cilium_ces
# dependsOn: []
# jobs:
# - template: /jobs/competitive-test.yml
# parameters:
# cloud: azure
# regions:
# - $(LOCATION)
# engine: clusterloader2
# engine_input:
# image: "ghcr.io/azure/clusterloader2:v20241022"
# topology: cilium-usercluster
# matrix:
# azure_cilium_ces:
# cpu_per_node: 4
# node_count: 1000
# node_per_step: 1000
# max_pods: 110
# repeats: 10
# scale_timeout: "15m"
# cilium_enabled: True
# network_policy: cilium
# network_dataplane: cilium
# service_test: True
# cl2_config_file: load-config.yaml
# name: azure_cilium_ces
# max_parallel: 1
# timeout_in_minutes: 720
# credential_type: service_connection
# ssh_key_enabled: false
# rg: $(CES_RG)
- stage: azure_cilium_disablecrd
dependsOn: []
jobs:
- template: /jobs/competitive-test.yml
parameters:
cloud: azure
regions:
- $(LOCATION)
engine: clusterloader2
engine_input:
image: "ghcr.io/azure/clusterloader2:v20241022"
topology: cilium-usercluster
matrix:
azure_cilium_disablecrd:
cpu_per_node: 4
node_count: 1000
node_per_step: 1000
max_pods: 110
repeats: 10
scale_timeout: "15m"
cilium_enabled: True
network_policy: cilium
network_dataplane: cilium
service_test: True
cl2_config_file: load-config.yaml
name: azure_cilium_disablecrd
max_parallel: 1
timeout_in_minutes: 720
credential_type: service_connection
ssh_key_enabled: false
rg: $(DISABLECRD_RG)
- stage: azure_cni
dependsOn: []
jobs:
- template: /jobs/competitive-test.yml
parameters:
cloud: azure
regions:
- $(LOCATION)
engine: clusterloader2
engine_input:
image: "ghcr.io/azure/clusterloader2:v20241022"
topology: cilium-usercluster
matrix:
azure_cni:
cpu_per_node: 4
node_count: 1000
node_per_step: 1000
max_pods: 110
repeats: 10
scale_timeout: "15m"
cilium_enabled: False
service_test: True
cl2_config_file: load-config.yaml
name: azure_cni
max_parallel: 1
timeout_in_minutes: 720
credential_type: service_connection
ssh_key_enabled: false
rg: $(AZCNI_RG)
19 changes: 10 additions & 9 deletions steps/engine/clusterloader2/cilium/scale-cluster.yml
@@ -33,18 +33,19 @@ steps:
for np in $usernodepools; do
currentnodes=$(az aks nodepool show --cluster-name $aks_name --name $np --resource-group $aks_rg | jq '.count')

# TODO: Temporarily disable cluster scale-down/scale-up for faster iteration
# disable autoscaler before scaling nodepool to desire node count
az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --disable-cluster-autoscaler
if [ "$currentnodes" != "${{ parameters.nodes_per_nodepool }}" ]; then
az aks nodepool scale --cluster-name $aks_name --name $np --resource-group $aks_rg -c ${{ parameters.nodes_per_nodepool }}
fi
# az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --disable-cluster-autoscaler
# if [ "$currentnodes" != "${{ parameters.nodes_per_nodepool }}" ]; then
# az aks nodepool scale --cluster-name $aks_name --name $np --resource-group $aks_rg -c ${{ parameters.nodes_per_nodepool }}
# fi

# turn on autoscaler if test necessitates it
if [ "true" = "${{ parameters.enable_autoscale }}" ]; then
az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --enable-cluster-autoscaler --min-count 0 --max-count 500
fi
az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --node-taints "slo=true:NoSchedule" --labels slo=true
sleep 300
# if [ "true" = "${{ parameters.enable_autoscale }}" ]; then
# az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --enable-cluster-autoscaler --min-count 0 --max-count 500
# fi
# az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --node-taints "slo=true:NoSchedule" --labels slo=true
# sleep 300
done
env:
ROLE: ${{ parameters.role }}
2 changes: 1 addition & 1 deletion steps/engine/clusterloader2/slo/collect.yml
@@ -18,7 +18,7 @@ steps:
PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \
$CPU_PER_NODE $NODE_COUNT $MAX_PODS $REPEATS \
$CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST $TEST_RESULTS_FILE \
$TEST_TYPE
$TEST_TYPE $START $NAME
workingDirectory: modules/python/clusterloader2
env:
CLOUD: ${{ parameters.cloud }}
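The collect script now reads $START and $NAME from the environment, so this step's env block (truncated in the diff) needs matching entries. A hedged sketch of what those additions could look like; the exact wiring and parameter names are assumptions:

# Assumed additions to the step's env block (not visible in the truncated diff):
env:
  START: $(START)                # pipeline variable set by the timestamp step in steps/execute-tests.yml
  NAME: ${{ parameters.name }}   # hypothetical mapping of the matrix "name" value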
7 changes: 7 additions & 0 deletions steps/execute-tests.yml
@@ -13,6 +13,13 @@ parameters:
default: {}

steps:
- script: |
echo "Set the start time for test execution"
startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
echo "Start: $startTimestamp"
echo "##vso[task.setvariable variable=START]$startTimestamp"
displayName: set up timestamp variable

- template: /steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self
parameters:
cloud: ${{ parameters.cloud }}