Skip to content

Commit

Permalink
Preemptible checker (#1430)
Browse files Browse the repository at this point in the history
* initial version of the preemption checker

* added the cancel API call before triggering the retry

* added the support for ondemand and spot mutation based on pod labels

* bugfix for metadata key

* added project_id instead of project_name

* added Dockerfile and requirement.txt and some verbose logs

* updated ds.yaml

* bug fix
  • Loading branch information
emamihe authored Oct 19, 2023
1 parent d8ded0c commit c62ecb3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 1 deletion.
10 changes: 9 additions & 1 deletion admissioncontroller/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@ def deployment_webhook_mutate():
request_info = request.get_json()
namespace=request_info["request"]["namespace"]
hasNodeSelector="nodeSelector" in request_info["request"]["object"]["spec"].keys()
if namespace.startswith("zombie-") and not hasNodeSelector:
labels=request_info["request"]["object"]["metadata"]["labels"] if "labels" in request_info["request"]["object"]["metadata"].keys() else {}
if namespace.startswith("zombie-") and "x-infra-instance" in labels.keys():
if labels["x-infra-instance"]=="ondemand":
return admission_response_patch(True, "Adding allow label", json_patch = jsonpatch.JsonPatch([{"op": "add", "path": "/spec/tolerations", "value": [{"effect":"NoExecute", "key":"workload-type",
"operator":"Equal", "value":"large-testnet"}]}, {"op":"add", "path":"/spec/nodeSelector", "value": {"nodetype":"large-network"}}]))
else:
return admission_response_patch(True, "Adding allow label", json_patch = jsonpatch.JsonPatch([{"op": "add", "path": "/spec/tolerations", "value": [{"operator":"Exists"}]}, {"op":"add", "path":"/spec/affinity", "value": {"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"weight":100,"preference":{"matchExpressions":[{"key":"nodetype","operator":"In","values":["xlarge-network"]}]}},{"weight":50,"preference":{"matchExpressions":[{"key":"nodetype","operator":"In","values":["large-network"]}]}}]}} }]))

elif namespace.startswith("zombie-") and not hasNodeSelector:
return admission_response_patch(True, "Adding allow label", json_patch = jsonpatch.JsonPatch([{"op": "add", "path": "/spec/tolerations", "value": [{"effect":"NoExecute", "key":"workload-type",
"operator":"Equal", "value":"large-testnet"}]}, {"op":"add", "path":"/spec/nodeSelector", "value": {"nodetype":"large-network"}}]))
else:
Expand Down
6 changes: 6 additions & 0 deletions scripts/preemptible_checker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Container image for the preemptible checker script (main.py).
FROM python:alpine3.15

# The script reports progress with print(); without a TTY Python block-buffers
# stdout, so messages could be lost when the node is terminated. Disable
# buffering so every line reaches the container log immediately.
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Install dependencies before copying the sources so this layer stays cached
# when only the code changes; --no-cache-dir keeps pip's download cache out of
# the image.
COPY requirement.txt .
RUN pip3 install --no-cache-dir -r requirement.txt

COPY . /app

# Run as an unprivileged user.
USER 1000:1000
CMD ["python3", "main.py"]
32 changes: 32 additions & 0 deletions scripts/preemptible_checker/ds.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# DaemonSet that runs one preemptible-checker pod on every node carrying the
# cloud.google.com/gke-spot=true label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: preemptible-checker
  namespace: preemptible-checker
spec:
  selector:
    matchLabels:
      name: preemptible-checker
  template:
    metadata:
      labels:
        name: preemptible-checker
    spec:
      containers:
        - name: checker
          image: emamihe/preemptible_checker:1.0
          env:
            # Token the checker sends as PRIVATE-TOKEN to the GitLab API.
            - name: GITLAB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: gitlab-token
                  key: token
            # Name of the node this pod runs on; the checker uses it to list
            # the pods scheduled on the same node.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
      # A toleration with only "operator: Exists" matches every taint.
      tolerations:
        - operator: "Exists"
      nodeSelector:
        cloud.google.com/gke-spot: "true"
      serviceAccountName: admin
53 changes: 53 additions & 0 deletions scripts/preemptible_checker/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
from kubernetes import config, client
import requests
import time

# Configure the Kubernetes client from the in-cluster configuration and create
# the core API handle used for pod and namespace lookups.
config.load_incluster_config()
v1 = client.CoreV1Api()

# Runtime settings come from the environment; both are None when unset.
node_name = os.environ.get("NODE_NAME")
gitlab_token = os.environ.get("GITLAB_TOKEN")

def is_node_being_preempted():
    """Return True when the metadata server reports this instance is terminating.

    Reads the instance's ``maintenance-event`` key from the GCE metadata
    server and compares it with ``TERMINATE_ON_HOST_MAINTENANCE``.

    Returns:
        bool: True only when the metadata value equals
        ``TERMINATE_ON_HOST_MAINTENANCE``. Any request failure, including a
        timeout, is reported as False so the caller simply polls again.
    """
    try:
        response = requests.get(
            "http://metadata.google.internal/computeMetadata/v1/instance/maintenance-event",
            headers={"Metadata-Flavor": "Google"},
            # Without a timeout a stalled metadata server would block the
            # polling loop forever and the preemption would go unnoticed.
            timeout=5,
        )
    except requests.RequestException:
        return False
    # Ignore surrounding whitespace so a trailing newline cannot hide the event.
    return response.text.strip() == "TERMINATE_ON_HOST_MAINTENANCE"

# Poll once per second until the node is reported as being preempted, then
# cancel and retry the GitLab job behind every "zombie-" namespace that has
# pods on this node.
while True:
    time.sleep(1)
    if not is_node_being_preempted():
        continue

    pods = v1.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}").items

    # dict.fromkeys de-duplicates while keeping first-seen order.
    evicted_namespaces = list(dict.fromkeys(
        pod.metadata.namespace
        for pod in pods
        if pod.metadata.namespace.startswith("zombie-")
    ))

    # flush=True: the node is about to be killed, so buffered output would be lost.
    print(f"found {len(evicted_namespaces)} namespace that needed to be evicted", flush=True)
    for evicted_namespace in evicted_namespaces:
        try:
            namespace = v1.read_namespace(name=evicted_namespace)
        except client.rest.ApiException as err:
            # One unreadable namespace must not stop the others from being handled.
            print(f"could not read namespace {evicted_namespace}: {err}", flush=True)
            continue

        # metadata.labels is None for a namespace without labels.
        labels = namespace.metadata.labels or {}
        job_id = labels.get("jobId")
        project_id = labels.get("projectId")
        if not (job_id and project_id):
            continue

        headers = {
            "PRIVATE-TOKEN": gitlab_token
        }
        job_cancel_url = f"https://gitlab.parity.io/api/v4/projects/{project_id}/jobs/{job_id}/cancel"
        job_retry_url = f"https://gitlab.parity.io/api/v4/projects/{project_id}/jobs/{job_id}/retry"
        try:
            # Cancel first, then retry. The timeouts keep one stalled request
            # from blocking the remaining namespaces.
            cancel_response = requests.post(job_cancel_url, headers=headers, timeout=5)
            retry_response = requests.post(job_retry_url, headers=headers, timeout=5)
        except requests.RequestException as err:
            print(f"job id {job_id} in project id {project_id} belongs to namespace {evicted_namespace} could not be retried: {err}", flush=True)
            continue

        if retry_response.ok:
            print(f"job id {job_id} in project id {project_id} belongs to namespace {evicted_namespace} retried", flush=True)
        else:
            # Report the real outcome instead of claiming success unconditionally.
            print(
                f"job id {job_id} in project id {project_id} belongs to namespace {evicted_namespace} "
                f"was not retried (cancel status {cancel_response.status_code}, "
                f"retry status {retry_response.status_code})",
                flush=True,
            )

    print("waiting that node kills", flush=True)
    # Nothing left to do: idle until the node is terminated, so the jobs are
    # not cancelled and retried a second time.
    while True:
        time.sleep(1)
2 changes: 2 additions & 0 deletions scripts/preemptible_checker/requirement.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
requests
kubernetes

0 comments on commit c62ecb3

Please sign in to comment.