diff --git a/admissioncontroller/main.py b/admissioncontroller/main.py
index 27b4125c8..39c2b616a 100644
--- a/admissioncontroller/main.py
+++ b/admissioncontroller/main.py
@@ -12,7 +12,17 @@ def deployment_webhook_mutate():
     request_info = request.get_json()
    namespace=request_info["request"]["namespace"]
    hasNodeSelector="nodeSelector" in request_info["request"]["object"]["spec"].keys()
-    if namespace.startswith("zombie-") and not hasNodeSelector:
+    # metadata.labels may be absent entirely on the incoming object; default to {}.
+    labels=request_info["request"]["object"]["metadata"]["labels"] if "labels" in request_info["request"]["object"]["metadata"].keys() else {}
+    if namespace.startswith("zombie-") and "x-infra-instance" in labels.keys():
+        if labels["x-infra-instance"]=="ondemand":
+            return admission_response_patch(True, "Adding allow label", json_patch = jsonpatch.JsonPatch([{"op": "add", "path": "/spec/tolerations", "value": [{"effect":"NoExecute", "key":"workload-type",
+            "operator":"Equal", "value":"large-testnet"}]}, {"op":"add", "path":"/spec/nodeSelector", "value": {"nodetype":"large-network"}}]))
+        else:
+            # Non-ondemand (spot) workloads: tolerate everything, prefer xlarge-network nodes over large-network ones.
+            return admission_response_patch(True, "Adding allow label", json_patch = jsonpatch.JsonPatch([{"op": "add", "path": "/spec/tolerations", "value": [{"operator":"Exists"}]}, {"op":"add", "path":"/spec/affinity", "value": {"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"weight":100,"preference":{"matchExpressions":[{"key":"nodetype","operator":"In","values":["xlarge-network"]}]}},{"weight":50,"preference":{"matchExpressions":[{"key":"nodetype","operator":"In","values":["large-network"]}]}}]}}}]))
+
+    elif namespace.startswith("zombie-") and not hasNodeSelector:
         return admission_response_patch(True, "Adding allow label", json_patch = jsonpatch.JsonPatch([{"op": "add", "path": "/spec/tolerations", "value": [{"effect":"NoExecute", "key":"workload-type",
         "operator":"Equal", "value":"large-testnet"}]}, {"op":"add", "path":"/spec/nodeSelector", "value": {"nodetype":"large-network"}}]))
     else:
diff --git a/scripts/preemptible_checker/Dockerfile b/scripts/preemptible_checker/Dockerfile
new file mode 100644
index 000000000..c18a90156
--- /dev/null
+++ b/scripts/preemptible_checker/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:alpine3.15
+COPY . /app
+WORKDIR /app
+RUN pip3 install -r requirement.txt
+USER 1000:1000
+CMD ["python3", "main.py"]
diff --git a/scripts/preemptible_checker/ds.yaml b/scripts/preemptible_checker/ds.yaml
new file mode 100644
index 000000000..9f3dafced
--- /dev/null
+++ b/scripts/preemptible_checker/ds.yaml
@@ -0,0 +1,32 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: preemptible-checker
+  namespace: preemptible-checker
+spec:
+  selector:
+    matchLabels:
+      name: preemptible-checker
+  template:
+    metadata:
+      labels:
+        name: preemptible-checker
+    spec:
+      containers:
+      - name: checker
+        image: emamihe/preemptible_checker:1.0
+        env:
+        - name: GITLAB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: gitlab-token
+              key: token
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+      tolerations:
+      - operator: "Exists"
+      nodeSelector:
+        cloud.google.com/gke-spot: "true"
+      serviceAccountName: admin
diff --git a/scripts/preemptible_checker/main.py b/scripts/preemptible_checker/main.py
new file mode 100644
index 000000000..9d6aa926a
--- /dev/null
+++ b/scripts/preemptible_checker/main.py
@@ -0,0 +1,61 @@
+# Preemption watcher: runs as a DaemonSet pod on spot/preemptible GKE nodes
+# and, when the node is about to be reclaimed, cancels + retries the GitLab CI
+# jobs that own the zombie-* namespaces scheduled on this node.
+import os
+from kubernetes import config, client
+import requests
+import time
+
+config.load_incluster_config()
+v1 = client.CoreV1Api()
+
+node_name = os.getenv("NODE_NAME")
+gitlab_token=os.getenv("GITLAB_TOKEN")
+
+def is_node_being_preempted():
+    # GCE exposes an instance metadata endpoint that reports
+    # TERMINATE_ON_HOST_MAINTENANCE shortly before the VM is reclaimed.
+    try:
+        response = requests.get(f"http://metadata.google.internal/computeMetadata/v1/instance/maintenance-event", headers={"Metadata-Flavor":"Google"})
+        if response.text == "TERMINATE_ON_HOST_MAINTENANCE":
+            return True
+        return False
+    except requests.RequestException:
+        # Best-effort: treat metadata-server errors as "not preempted".
+        return False
+
+while True:
+    time.sleep(1)
+    if not is_node_being_preempted():
+        continue
+
+    pods = v1.list_pod_for_all_namespaces(field_selector=f"spec.nodeName={node_name}").items
+
+    zombie_pods = [pod for pod in pods if pod.metadata.namespace.startswith('zombie-')]
+
+    # Deduplicate: several pods on this node may belong to the same namespace.
+    evicted_namespaces=[]
+    for pod in zombie_pods:
+        ns = pod.metadata.namespace
+        if ns not in evicted_namespaces:
+            evicted_namespaces+=[ns]
+
+    print(f"found {len(evicted_namespaces)} namespace(s) that need to be evicted")
+    for evicted_namespace in evicted_namespaces:
+        namespace = v1.read_namespace(name=evicted_namespace)
+        job_id = namespace.metadata.labels.get('jobId', None)
+        project_id = namespace.metadata.labels.get('projectId', None)
+        if job_id and project_id:
+            headers = {
+                "PRIVATE-TOKEN": gitlab_token
+            }
+            job_cancel_url = f"https://gitlab.parity.io/api/v4/projects/{project_id}/jobs/{job_id}/cancel"
+            job_retry_url = f"https://gitlab.parity.io/api/v4/projects/{project_id}/jobs/{job_id}/retry"
+            cancel_response = requests.post(job_cancel_url, headers=headers)
+            retry_response = requests.post(job_retry_url, headers=headers)
+            print(f"job id {job_id} in project id {project_id} belongs to namespace {evicted_namespace} retried")
+
+    # Block until the node is actually terminated so the same preemption event
+    # does not trigger another round of cancel/retry calls.
+    print("waiting that node kills")
+    while True:
+        time.sleep(1)
diff --git a/scripts/preemptible_checker/requirement.txt b/scripts/preemptible_checker/requirement.txt
new file mode 100644
index 000000000..9c0bb23e3
--- /dev/null
+++ b/scripts/preemptible_checker/requirement.txt
@@ -0,0 +1,2 @@
+requests
+kubernetes