diff --git a/cmd/sidecars/htcondor/handles.py b/cmd/sidecars/htcondor/handles.py
index 3571f6f5..0a862cfe 100644
--- a/cmd/sidecars/htcondor/handles.py
+++ b/cmd/sidecars/htcondor/handles.py
@@ -26,7 +26,7 @@
 parser.add_argument(
     "--dummy-job",
     action="store_true",
-    help="Whether the job should be a real job or a dummy sleep job for debugging purposes",
+    help="Whether the job should be a real job or a dummy sleep job",
 )
 parser.add_argument("--port", help="Server port", type=int, default=8000)
@@ -118,12 +118,12 @@ def prepare_mounts(pod, container_standalone):
             if "configMap" in vol.keys():
                 config_maps_paths = mountConfigMaps(
                     pod, container_standalone)
-                print("bind as configmap", mount_var["name"], vol["name"])
+                # print("bind as configmap", mount_var["name"], vol["name"])
                 for i, path in enumerate(config_maps_paths):
                     mount_data.append(path)
             elif "secret" in vol.keys():
                 secrets_paths = mountSecrets(pod, container_standalone)
-                print("bind as secret", mount_var["name"], vol["name"])
+                # print("bind as secret", mount_var["name"], vol["name"])
                 for i, path in enumerate(secrets_paths):
                     mount_data.append(path)
             elif "emptyDir" in vol.keys():
@@ -217,16 +217,12 @@ def mountSecrets(pod, container_standalone):
     cmd = ["-rf", os.path.join(wd, data_root_folder, "secrets")]
     subprocess.run(["rm"] + cmd, check=True)
     for mountSpec in container["volumeMounts"]:
-        print(mountSpec["name"])
         for vol in pod["spec"]["volumes"]:
             if vol["name"] != mountSpec["name"]:
                 continue
             if "secret" in vol.keys():
                 secrets = container_standalone["secrets"]
                 for secret in secrets:
-                    print(
-                        secret["metadata"]["name"], ":", vol["secret"]["secretName"]
-                    )
                     if secret["metadata"]["name"] != vol["secret"]["secretName"]:
                         continue
                     pod_secret_dir = os.path.join(
@@ -302,14 +298,22 @@ def parse_string_with_suffix(value_str):
 def produce_htcondor_singularity_script(containers, metadata, commands, input_files):
     executable_path = f"./{InterLinkConfigInst['DataRootFolder']}/{metadata['name']}.sh"
     sub_path = f"./{InterLinkConfigInst['DataRootFolder']}/{metadata['name']}.jdl"
-    requested_cpus = sum([int(c["resources"]["requests"]["cpu"])
-                          for c in containers])
-    requested_memory = sum(
-        [
-            parse_string_with_suffix(c["resources"]["requests"]["memory"])
-            for c in containers
-        ]
-    )
+
+    requested_cpus = 0
+    requested_memory = 0
+    for c in containers:
+        if "resources" in c.keys():
+            if "requests" in c["resources"].keys():
+                if "cpu" in c["resources"]["requests"].keys():
+                    requested_cpus += int(c["resources"]["requests"]["cpu"])
+                if "memory" in c["resources"]["requests"].keys():
+                    requested_memory += parse_string_with_suffix(
+                        c["resources"]["requests"]["memory"])
+    if requested_cpus == 0:
+        requested_cpus = 1
+    if requested_memory == 0:
+        requested_memory = 1
+
     prefix_ = f"\n{InterLinkConfigInst['CommandPrefix']}"
     try:
         with open(executable_path, "w") as f:
@@ -438,7 +442,6 @@ def handle_jid(jid, pod):
         )
     else:
         logging.info("Job submission failed, couldn't retrieve JID")
-        # return "Job submission failed, couldn't retrieve JID", 500
 
 
 def SubmitHandler():
@@ -454,9 +457,9 @@ def SubmitHandler():
     # ELABORATE RESPONSE ###########
     pod = req.get("pod", {})
-    print(pod)
+    # print(pod)
     containers_standalone = req.get("container", {})
-    print("Requested pod metadata name is: ", pod["metadata"]["name"])
+    # print("Requested pod metadata name is: ", pod["metadata"]["name"])
     metadata = pod.get("metadata", {})
     containers = pod.get("spec", {}).get("containers", [])
     singularity_commands = []
@@ -470,14 +473,15 @@ def SubmitHandler():
         commstr1 = ["singularity", "exec"]
["singularity", "exec"] envs = prepare_envs(container) image = "" + mounts = [""] if containers_standalone is not None: for c in containers_standalone: if c["name"] == container["name"]: container_standalone = c - mounts = prepare_mounts(pod, container_standalone) + mounts = prepare_mounts(pod, container_standalone) else: mounts = [""] - if container["image"].startswith("/"): + if container["image"].startswith("/") or ".io" in container["image"]: image_uri = metadata.get("Annotations", {}).get( "htcondor-job.knoc.io/image-root", None ) @@ -504,6 +508,8 @@ def SubmitHandler(): + mount.split(":")[1] + "," ) + if local_mounts[-1] == "": + local_mounts = [""] if "command" in container.keys() and "args" in container.keys(): singularity_command = ( @@ -585,9 +591,9 @@ def StatusHandler(): # ELABORATE RESPONSE ################# resp = [ { - "Name": [], - "Namespace": [], - "Status": [], + "name": [], + "namespace": [], + "containers": [] } ] try: @@ -599,29 +605,67 @@ def StatusHandler(): jid_job = f.read() podname = req["metadata"]["name"] podnamespace = req["metadata"]["namespace"] - resp[0]["Name"] = podname - resp[0]["Namespace"] = podnamespace - ok = True + resp[0]["name"] = podname + resp[0]["namespace"] = podnamespace process = os.popen(f"condor_q {jid_job} --json") preprocessed = process.read() process.close() job_ = json.loads(preprocessed) status = job_[0]["JobStatus"] - if status != 2 and status != 1: - ok = False - if ok: - resp[0]["Status"] = 0 + if status == 1: + state = {"waiting": { + } + } + readiness = False + elif status == 2: + state = {"running": { + "startedAt": "2006-01-02T15:04:05Z", + } + } + readiness = True else: - resp[0]["Status"] = 1 + state = {"terminated": { + "startedAt": "2006-01-02T15:04:05Z", + "finishedAt": "2006-01-02T15:04:05Z", + } + } + readiness = False + for c in req["spec"]["containers"]: + resp[0]["containers"].append({ + "name": c["name"], + "state": state, + "lastState": {}, + "ready": readiness, + "restartCount": 0, + "image": "NOT IMPLEMENTED", + "imageID": "NOT IMPLEMENTED" + }) return json.dumps(resp), 200 except Exception as e: return f"Something went wrong when retrieving pod status: {e}", 500 +def LogsHandler(): + logging.info("HTCondor Sidecar: received GetLogs call") + request_data_string = request.data.decode("utf-8") + # print(request_data_string) + req = json.loads(request_data_string) + if req is None or not isinstance(req, dict): + print("Invalid logs request body is: ", req) + logging.error("Invalid request data") + return "Invalid request data for getting logs", 400 + + resp = "NOT IMPLEMENTED" + + return json.dumps(resp), 200 + + app = Flask(__name__) app.add_url_rule("/create", view_func=SubmitHandler, methods=["POST"]) app.add_url_rule("/delete", view_func=StopHandler, methods=["POST"]) app.add_url_rule("/status", view_func=StatusHandler, methods=["GET"]) +app.add_url_rule("/getLogs", view_func=LogsHandler, methods=["POST"]) if __name__ == "__main__": app.run(port=args.port, host="0.0.0.0", debug=True) + diff --git a/docker/htcondor/Dockerfile b/docker/htcondor/Dockerfile new file mode 100644 index 00000000..d9401fce --- /dev/null +++ b/docker/htcondor/Dockerfile @@ -0,0 +1,18 @@ +# Deploy the application binary into a lean image +#FROM ubuntu:latest AS build-release-stage +FROM htcondor/mini:9.0.17-el7 + +ENV INTERLINKCONFIGPATH=/root/InterLinkConfig.yaml + +RUN yum update -y && \ + yum install -y epel-release && \ + yum update -y && \ + yum install -y apptainer + +RUN mkdir -p /cvmfs/grid.cern.ch/etc/grid-security + +RUN pip3 
+
+WORKDIR /utils
+
+CMD ["/bin/sh", "-c", "/start.sh & python3 handles.py"]
diff --git a/examples/interlink-htcondor/README.md b/examples/interlink-htcondor/README.md
new file mode 100644
index 00000000..21c8c4de
--- /dev/null
+++ b/examples/interlink-htcondor/README.md
@@ -0,0 +1,126 @@
+# HTCondor DEMO
+
+## Deploy interlink+HTCondor demo locally
+
+__N.B.__ in this demo the oauth2 proxy authN/Z is disabled. DO NOT USE THIS IN PRODUCTION unless you know what you are doing.
+
+### Requirements
+
+- Docker
+- Minikube (kubernetes-version 1.24.3)
+- A clone of the interLink repo:
+
+```bash
+git clone https://github.com/interTwin-eu/interLink.git
+```
+
+Move to the example location:
+
+```bash
+cd interLink/examples/interlink-htcondor
+```
+
+### Bootstrap a minikube cluster
+
+```bash
+minikube start --kubernetes-version=1.24.3
+```
+
+### Configure interLink
+
+First of all, you need a valid kubeconfig file to be passed to the interLink docker compose. With minikube you will
+need the following script (check that the `PATH_TO_KUBECONFIG` env variable points to the correct kubeconfig first);
+otherwise you can simply copy your own there.
+
+__N.B.__ the kubeconfig file should be a stand-alone one, so the certificate data should be embedded as strings, not referenced as paths.
+
+```bash
+export PATH_TO_KUBECONFIG=$HOME/.kube/config
+export CA_DATA=$(cat $HOME/.minikube/ca.crt | base64 -w0)
+export CERT_DATA=$(cat $HOME/.minikube/profiles/minikube/client.crt | base64 -w0)
+export KEY_DATA=$(cat $HOME/.minikube/profiles/minikube/client.key | base64 -w0)
+
+mkdir -p interlink/config
+
+sed 's/certificate-authority:.*/certificate-authority-data: '$CA_DATA'/g' $PATH_TO_KUBECONFIG | sed 's/client-certificate:.*/client-certificate-data: '$CERT_DATA'/g' - | sed 's/client-key:.*/client-key-data: '$KEY_DATA'/g' - > interlink/config/kubeconfig.yaml
+```
+
+Then you need to provide the interLink IP address that should be reachable from the kubernetes pods.
+In case of this demo setup, that address __is the address of your machine__.
+
+```bash
+INTERLINK_IP_ADDRESS=XXX.XX.X.XXX
+
+sed -i 's/InterlinkURL:.*/InterlinkURL: "http:\/\/'$INTERLINK_IP_ADDRESS'"/g' interlink/config/InterLinkConfig.yaml | sed -i 's/SidecarURL:.*/SidecarURL: "http:\/\/'$INTERLINK_IP_ADDRESS'"/g' interlink/config/InterLinkConfig.yaml
+
+sed -i 's/InterlinkURL:.*/InterlinkURL: "http:\/\/'$INTERLINK_IP_ADDRESS'"/g' vk/InterLinkConfig.yaml | sed -i 's/SidecarURL:.*/SidecarURL: "http:\/\/'$INTERLINK_IP_ADDRESS'"/g' vk/InterLinkConfig.yaml
+```
+
+### Deploy virtualKubelet
+
+Create the `vk` namespace:
+
+```bash
+kubectl create ns vk
+```
+
+Deploy the vk resources on the cluster with:
+
+```bash
+kubectl apply -n vk -k vk/
+```
+
+Check that both the pods and the node are in ready status:
+
+```bash
+kubectl get pod -n vk
+
+kubectl get node
+```
+
+### Deploy interLink via docker compose
+
+```bash
+cd interlink
+
+docker compose up -d
+```
+
+Check the logs for both the interLink APIs and the HTCondor sidecar:
+
+```bash
+docker logs interlink-interlink-1
+
+docker logs interlink-docker-sidecar-1
+```
+
+### Deploy a sample application
+
+```bash
+kubectl apply -f ../test_pod.yaml
+```
+
+Then observe the application running and eventually succeeding via:
+
+```bash
+kubectl get pod -n vk --watch
+```
+
+When finished, interrupt the watch with `Ctrl+C` and retrieve the logs with:
+
+```bash
+kubectl logs -n vk test-pod-cfg-cowsay-dciangot
+```
+
+You can also see the jobs appearing on the `interlink-docker-sidecar-1` container with `condor_q`:
+
+```bash
+docker exec interlink-docker-sidecar-1 condor_q
+```
+
+Or, if you need to debug further, you can log into the sidecar and look for your POD_UID folder in `.local/interlink/jobs`:
+
+```bash
+docker exec -ti interlink-docker-sidecar-1 bash
+
+ls -altrh .local/interlink/jobs
+```
diff --git a/examples/interlink-htcondor/interlink/docker-compose-T2.yaml b/examples/interlink-htcondor/interlink/docker-compose-T2.yaml
new file mode 100644
index 00000000..05cd1477
--- /dev/null
+++ b/examples/interlink-htcondor/interlink/docker-compose-T2.yaml
@@ -0,0 +1,52 @@
+version: '3.7'
+services:
+  interlink:
+    build:
+      context: ../../../
+      dockerfile: docker/Dockerfile.interlink
+    restart: always
+    network_mode: "host"
+    volumes:
+      - type: bind
+        source: ../../../cmd/sidecars/htcondor/
+        target: /etc/interlink
+    environment:
+      - INTERLINKCONFIGPATH=/etc/interlink/InterLinkConfig.yaml
+      - KUBECONFIG=/etc/interlink/kubeconfig.yaml
+    # healthcheck:
+    #   test: ["CMD", "/check.sh"]
+    #   interval: 10s
+    #   timeout: 10s
+    #   retries: 3
+    #   start_period: 5s
+  docker-sidecar:
+    build:
+      context: ../../../
+      dockerfile: docker/htcondor/Dockerfile
+    #command: /bin/sh -c "python3 handles.py --condor-config /utils/condor_config --schedd-host t2-cce-02.lnl.infn.it --collector-host t2-cce-02.lnl.infn.it:9619 --auth-method GSI --debug D_FULLDEBUG,D_SECURITY --proxy /utils/proxy_daniele_new_new_new"
+    command: /bin/sh -c "python3 handles.py --condor-config /utils/condor_config --schedd-host ce-01.recas.ba.infn.it --collector-host ce-01.recas.ba.infn.it:9619 --auth-method GSI --debug D_FULLDEBUG,D_SECURITY --proxy /utils/proxy_daniele_new_new_new"
+    restart: always
+    privileged: true
+    cap_add:
+      - SYS_ADMIN
+    network_mode: "host"
+    volumes:
+      - type: bind
+        source: ../../../cmd/sidecars/htcondor/
+        target: /utils
+      - type: bind
+        source: ./certificates/
+        target: /etc/grid-security/certificates
+    #environment:
+    #- X509_USER_PROXY=/utils/proxy_daniele_new_new_new
+    #- CONDOR_CONFIG=/utils/condor_config
+    #- _condor_COLLECTOR_HOST=ce-01.recas.ba.infn.it:9619
+    #- _condor_SCHEDD_HOST=ce-01.recas.ba.infn.it
+    #- _condor_TOOL_DEBUG=D_FULLDEBUG,D_SECURITY
+    #- _condor_SEC_DEFAULT_AUTHENTICATION_METHODS=GSI
+    # healthcheck:
+    #   test: ["CMD", "/check.sh"]
+    #   interval: 10s
+    #   timeout: 10s
+    #   retries: 3
+    #   start_period: 5s
diff --git a/examples/interlink-htcondor/interlink/docker-compose.yaml b/examples/interlink-htcondor/interlink/docker-compose.yaml
new file mode 100644
index 00000000..a4d50406
--- /dev/null
+++ b/examples/interlink-htcondor/interlink/docker-compose.yaml
@@ -0,0 +1,40 @@
+version: '3.7'
+services:
+  interlink:
+    build:
+      context: ../../../
+      dockerfile: docker/Dockerfile.interlink
+    restart: always
+    network_mode: "host"
+    volumes:
+      - type: bind
+        source: ../../../cmd/sidecars/htcondor/
+        target: /etc/interlink
+    environment:
+      - INTERLINKCONFIGPATH=/etc/interlink/InterLinkConfig.yaml
+      - KUBECONFIG=/etc/interlink/kubeconfig.yaml
+    # healthcheck:
+    #   test: ["CMD", "/check.sh"]
+    #   interval: 10s
+    #   timeout: 10s
+    #   retries: 3
+    #   start_period: 5s
+  docker-sidecar:
+    build:
+      context: ../../../
+      dockerfile: docker/htcondor/Dockerfile
+    restart: always
+    privileged: true
+    cap_add:
+      - SYS_ADMIN
+    network_mode: "host"
+    volumes:
+      - type: bind
+        source: ../../../cmd/sidecars/htcondor/
+        target: /utils
+    # healthcheck:
+    #   test: ["CMD", "/check.sh"]
+    #   interval: 10s
+    #   timeout: 10s
+    #   retries: 3
+    #   start_period: 5s
diff --git a/examples/interlink-htcondor/test_pod.yaml b/examples/interlink-htcondor/test_pod.yaml
new file mode 100644
index 00000000..63960bd2
--- /dev/null
+++ b/examples/interlink-htcondor/test_pod.yaml
@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: test-pod-cfg-cowsay-dciangot
+  namespace: vk
+  annotations:
+    slurm-job.knoc.io/flags: "--job-name=test-pod-cfg -t 2800 --ntasks=8 --nodes=1 --mem-per-cpu=2000"
+spec:
+  restartPolicy: Never
+  containers:
+  - image: docker://ghcr.io/grycap/cowsay
+    command: ["/bin/sh"]
+    args: ["-c", "\"touch /tmp/test.txt && sleep 60 && echo \\\"hello muu\\\" | /usr/games/cowsay \" " ]
+    imagePullPolicy: Always
+    name: cowsayo
+  dnsPolicy: ClusterFirst
+  nodeSelector:
+    kubernetes.io/hostname: test-vk
+  tolerations:
+  - key: virtual-node.interlink/no-schedule
+    operator: Exists
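
Note on the reworked `StatusHandler` above: the old numeric `Status` field is replaced by Kubernetes-style per-container statuses. The sketch below is not part of the PR; it only isolates the mapping the handler now applies from HTCondor `JobStatus` codes (1 = Idle, 2 = Running, anything else treated as finished) to container states, keeping the same placeholder timestamps. The function name and the example values are hypothetical.

```python
# Hypothetical sketch, not part of the PR: the JobStatus -> container-state
# mapping used by the new StatusHandler. 1 (Idle) maps to "waiting",
# 2 (Running) to "running", anything else to "terminated"; the timestamps
# are the same placeholders that appear in the diff.
def condor_status_to_container_state(job_status: int):
    """Return (state dict, ready flag) for an HTCondor JobStatus code."""
    if job_status == 1:
        return {"waiting": {}}, False
    if job_status == 2:
        return {"running": {"startedAt": "2006-01-02T15:04:05Z"}}, True
    return {
        "terminated": {
            "startedAt": "2006-01-02T15:04:05Z",
            "finishedAt": "2006-01-02T15:04:05Z",
        }
    }, False


# Example: one container whose HTCondor job is still idle in the queue.
state, ready = condor_status_to_container_state(1)
print({"name": "cowsayo", "state": state, "ready": ready, "restartCount": 0})
```

A natural follow-up would be to fill the placeholder timestamps from the job ClassAd already returned by `condor_q --json` instead of hard-coding them.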
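
The new `/getLogs` route is wired up but still returns a stub. A minimal way to probe it once the sidecar is running locally (hypothetical client, not part of the PR; it assumes the default `--port 8000` and relies on the handler only checking that the body is a JSON object):

```python
# Hypothetical probe for the new /getLogs route, not part of the PR.
# Assumes the sidecar runs locally on the default port 8000; the handler
# currently accepts any JSON object and answers with "NOT IMPLEMENTED".
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8000/getLogs",
    data=json.dumps({}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    # Expected output: 200 "NOT IMPLEMENTED"
    print(resp.status, resp.read().decode("utf-8"))
```

The same kind of probe works for `/status`, except that route is registered with `methods=["GET"]` and the handler expects the full pod manifest (it reads `metadata` and `spec.containers`) in the request body.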