
Commit

update sidecar and add examples
ttedeschi committed Dec 4, 2023
1 parent 7c6e62b commit 41bc568
Showing 6 changed files with 332 additions and 31 deletions.
106 changes: 75 additions & 31 deletions cmd/sidecars/htcondor/handles.py
@@ -26,7 +26,7 @@
parser.add_argument(
"--dummy-job",
action="store_true",
help="Whether the job should be a real job or a dummy sleep job for debugging purposes",
help="Whether the job should be a real job or a dummy sleep job",
)
parser.add_argument("--port", help="Server port", type=int, default=8000)

@@ -118,12 +118,12 @@ def prepare_mounts(pod, container_standalone):
if "configMap" in vol.keys():
config_maps_paths = mountConfigMaps(
pod, container_standalone)
print("bind as configmap", mount_var["name"], vol["name"])
# print("bind as configmap", mount_var["name"], vol["name"])
for i, path in enumerate(config_maps_paths):
mount_data.append(path)
elif "secret" in vol.keys():
secrets_paths = mountSecrets(pod, container_standalone)
print("bind as secret", mount_var["name"], vol["name"])
# print("bind as secret", mount_var["name"], vol["name"])
for i, path in enumerate(secrets_paths):
mount_data.append(path)
elif "emptyDir" in vol.keys():
@@ -217,16 +217,12 @@ def mountSecrets(pod, container_standalone):
cmd = ["-rf", os.path.join(wd, data_root_folder, "secrets")]
subprocess.run(["rm"] + cmd, check=True)
for mountSpec in container["volumeMounts"]:
print(mountSpec["name"])
for vol in pod["spec"]["volumes"]:
if vol["name"] != mountSpec["name"]:
continue
if "secret" in vol.keys():
secrets = container_standalone["secrets"]
for secret in secrets:
print(
secret["metadata"]["name"], ":", vol["secret"]["secretName"]
)
if secret["metadata"]["name"] != vol["secret"]["secretName"]:
continue
pod_secret_dir = os.path.join(
@@ -302,14 +298,22 @@ def parse_string_with_suffix(value_str):
def produce_htcondor_singularity_script(containers, metadata, commands, input_files):
executable_path = f"./{InterLinkConfigInst['DataRootFolder']}/{metadata['name']}.sh"
sub_path = f"./{InterLinkConfigInst['DataRootFolder']}/{metadata['name']}.jdl"
requested_cpus = sum([int(c["resources"]["requests"]["cpu"])
for c in containers])
requested_memory = sum(
[
parse_string_with_suffix(c["resources"]["requests"]["memory"])
for c in containers
]
)

requested_cpus = 0
requested_memory = 0
for c in containers:
if "resources" in c.keys():
if "requests" in c["resources"].keys():
if "cpu" in c["resources"]["requests"].keys():
requested_cpus += int(c["resources"]["requests"]["cpu"])
if "memory" in c["resources"]["requests"].keys():
requested_memory += parse_string_with_suffix(
c["resources"]["requests"]["memory"])
if requested_cpus == 0:
requested_cpus = 1
if requested_memory == 0:
requested_memory = 1

prefix_ = f"\n{InterLinkConfigInst['CommandPrefix']}"
try:
with open(executable_path, "w") as f:
@@ -438,7 +442,6 @@ def handle_jid(jid, pod):
)
else:
logging.info("Job submission failed, couldn't retrieve JID")
# return "Job submission failed, couldn't retrieve JID", 500


def SubmitHandler():
@@ -454,9 +457,9 @@ def SubmitHandler():

# ELABORATE RESPONSE ###########
pod = req.get("pod", {})
print(pod)
# print(pod)
containers_standalone = req.get("container", {})
print("Requested pod metadata name is: ", pod["metadata"]["name"])
# print("Requested pod metadata name is: ", pod["metadata"]["name"])
metadata = pod.get("metadata", {})
containers = pod.get("spec", {}).get("containers", [])
singularity_commands = []
@@ -470,14 +473,15 @@
commstr1 = ["singularity", "exec"]
envs = prepare_envs(container)
image = ""
mounts = [""]
if containers_standalone is not None:
for c in containers_standalone:
if c["name"] == container["name"]:
container_standalone = c
mounts = prepare_mounts(pod, container_standalone)
mounts = prepare_mounts(pod, container_standalone)
else:
mounts = [""]
if container["image"].startswith("/"):
if container["image"].startswith("/") or ".io" in container["image"]:
image_uri = metadata.get("Annotations", {}).get(
"htcondor-job.knoc.io/image-root", None
)
@@ -504,6 +508,8 @@
+ mount.split(":")[1]
+ ","
)
if local_mounts[-1] == "":
local_mounts = [""]

if "command" in container.keys() and "args" in container.keys():
singularity_command = (
@@ -585,9 +591,9 @@ def StatusHandler():
# ELABORATE RESPONSE #################
resp = [
{
"Name": [],
"Namespace": [],
"Status": [],
"name": [],
"namespace": [],
"containers": []
}
]
try:
@@ -599,29 +605,67 @@
jid_job = f.read()
podname = req["metadata"]["name"]
podnamespace = req["metadata"]["namespace"]
resp[0]["Name"] = podname
resp[0]["Namespace"] = podnamespace
ok = True
resp[0]["name"] = podname
resp[0]["namespace"] = podnamespace
process = os.popen(f"condor_q {jid_job} --json")
preprocessed = process.read()
process.close()
job_ = json.loads(preprocessed)
status = job_[0]["JobStatus"]
if status != 2 and status != 1:
ok = False
if ok:
resp[0]["Status"] = 0
if status == 1:
state = {"waiting": {
}
}
readiness = False
elif status == 2:
state = {"running": {
"startedAt": "2006-01-02T15:04:05Z",
}
}
readiness = True
else:
resp[0]["Status"] = 1
state = {"terminated": {
"startedAt": "2006-01-02T15:04:05Z",
"finishedAt": "2006-01-02T15:04:05Z",
}
}
readiness = False
for c in req["spec"]["containers"]:
resp[0]["containers"].append({
"name": c["name"],
"state": state,
"lastState": {},
"ready": readiness,
"restartCount": 0,
"image": "NOT IMPLEMENTED",
"imageID": "NOT IMPLEMENTED"
})
return json.dumps(resp), 200
except Exception as e:
return f"Something went wrong when retrieving pod status: {e}", 500


def LogsHandler():
logging.info("HTCondor Sidecar: received GetLogs call")
request_data_string = request.data.decode("utf-8")
# print(request_data_string)
req = json.loads(request_data_string)
if req is None or not isinstance(req, dict):
print("Invalid logs request body is: ", req)
logging.error("Invalid request data")
return "Invalid request data for getting logs", 400

resp = "NOT IMPLEMENTED"

return json.dumps(resp), 200


app = Flask(__name__)
app.add_url_rule("/create", view_func=SubmitHandler, methods=["POST"])
app.add_url_rule("/delete", view_func=StopHandler, methods=["POST"])
app.add_url_rule("/status", view_func=StatusHandler, methods=["GET"])
app.add_url_rule("/getLogs", view_func=LogsHandler, methods=["POST"])

if __name__ == "__main__":
app.run(port=args.port, host="0.0.0.0", debug=True)
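
Once the sidecar is running, the newly registered `/getLogs` route can be exercised with a plain HTTP call. The sketch below assumes the default port 8000 from the `--port` argument above; the payload fields are placeholders, since the exact request body interLink sends is not shown in this diff (the handler only checks that the body is a JSON object and currently replies "NOT IMPLEMENTED").

```bash
curl -s -X POST http://localhost:8000/getLogs \
  -H 'Content-Type: application/json' \
  -d '{"metadata": {"name": "test-pod", "namespace": "vk"}}'
```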

18 changes: 18 additions & 0 deletions docker/htcondor/Dockerfile
@@ -0,0 +1,18 @@
# Deploy the application binary into a lean image
#FROM ubuntu:latest AS build-release-stage
FROM htcondor/mini:9.0.17-el7

ENV INTERLINKCONFIGPATH=/root/InterLinkConfig.yaml

RUN yum update -y && \
yum install -y epel-release && \
yum update -y && \
yum install -y apptainer

RUN mkdir -p /cvmfs/grid.cern.ch/etc/grid-security

RUN pip3 install pyyaml

WORKDIR /utils

CMD ["/bin/sh", "-c", "/start.sh & python3 handles.py"]
126 changes: 126 additions & 0 deletions examples/interlink-htcondor/README.md
@@ -0,0 +1,126 @@
# HTCondor DEMO

## Deploy interlink+HTCondor demo locally

__N.B.__ in this demo the OAuth2 proxy authN/Z is disabled. DO NOT USE THIS IN PRODUCTION unless you know what you are doing.

### Requirements

- Docker
- Minikube (kubernetes-version 1.24.3)
- Clone interlink repo

```bash
git clone https://github.com/interTwin-eu/interLink.git
```

Move to the example location:

```bash
cd interLink/examples/interlink-htcondor
```

### Bootstrap a minikube cluster

```bash
minikube start --kubernetes-version=1.24.3
```

### Configure interLink

First of all, you need a valid kubeconfig file to pass to the interLink docker compose setup. With minikube you can generate it with the following script (check first that the `PATH_TO_KUBECONFIG` env variable points to the correct kubeconfig); otherwise you can simply copy your own kubeconfig to `interlink/config/kubeconfig.yaml`.

__N.B.__ the kubeconfig file should be a stand-alone one, i.e. the certificate data should be embedded as base64-encoded strings rather than referenced as file paths.

```bash
export PATH_TO_KUBECONFIG=$HOME/.kube/config
export CA_DATA=$(cat $HOME/.minikube/ca.crt | base64 -w0)
export CERT_DATA=$(cat $HOME/.minikube/profiles/minikube/client.crt | base64 -w0)
export KEY_DATA=$(cat $HOME/.minikube/profiles/minikube/client.key | base64 -w0)

mkdir -p interlink/config

sed 's/certificate-authority:.*/certificate-authority-data: '$CA_DATA'/g' $PATH_TO_KUBECONFIG | sed 's/client-certificate:.*/client-certificate-data: '$CERT_DATA'/g' - | sed 's/client-key:.*/client-key-data: '$KEY_DATA'/g' - > interlink/config/kubeconfig.yaml
```
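
As an optional sanity check (not part of the original instructions), you can verify that the generated stand-alone kubeconfig works against the cluster:

```bash
kubectl --kubeconfig interlink/config/kubeconfig.yaml get nodes
```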

Then you need to provide the interLink IP address that must be reachable from the Kubernetes pods. In this demo setup, that address __is the address of your machine__:

```bash
INTERLINK_ADDR=XXX.XX.X.XXX

sed -i 's/InterlinkURL:.*/InterlinkURL: "http:\/\/'$INTERLINK_ADDR'"/g; s/SidecarURL:.*/SidecarURL: "http:\/\/'$INTERLINK_ADDR'"/g' interlink/config/InterLinkConfig.yaml

sed -i 's/InterlinkURL:.*/InterlinkURL: "http:\/\/'$INTERLINK_ADDR'"/g; s/SidecarURL:.*/SidecarURL: "http:\/\/'$INTERLINK_ADDR'"/g' vk/InterLinkConfig.yaml
```
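
If you are unsure which address to use, one common way to look up your machine's primary IP on Linux (a convenience suggestion, not part of the original instructions) is:

```bash
hostname -I | awk '{print $1}'
```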

### Deploy virtualKubelet

Create the `vk` namespace:

```bash
kubectl create ns vk
```

Deploy the vk resources on the cluster with:

```bash
kubectl apply -n vk -k vk/
```

Check that both the pods and the node are in the Ready status:

```bash
kubectl get pod -n vk

kubectl get node
```

### Deploy interLink via docker compose

```bash
cd interlink

docker compose up -d
```

Check the logs for both the interLink API and the HTCondor sidecar:

```bash
docker logs interlink-interlink-1

docker logs interlink-docker-sidecar-1
```

### Deploy a sample application

```bash
kubectl apply -f ../test_pod.yaml
```
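
You can also verify which node the pod has been assigned to, to confirm it is being handled by the interLink virtual node (an extra check, not part of the original walkthrough):

```bash
kubectl get pod -n vk -o wide
```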

Then observe the application running and eventually succeeding via:

```bash
kubectl get pod -n vk --watch
```

When finished, interrupt the watch with `Ctrl+C` and retrieve the logs with:

```bash
kubectl logs -n vk test-pod-cfg-cowsay-dciangot
```

You can also see the jobs appearing on the `interlink-docker-sidecar-1` container with `condor_q`:

```bash
docker exec interlink-docker-sidecar-1 condor_q
```

Or, if you need to debug further, you can log into the sidecar and look for your POD_UID folder in `.local/interlink/jobs`:

```bash
docker exec -ti interlink-docker-sidecar-1 bash

ls -altrh .local/interlink/jobs
```
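
When you are done with the demo, a possible cleanup (a suggestion, not part of the original walkthrough) is to bring the compose stack down and delete the minikube cluster:

```bash
(cd interlink && docker compose down)

minikube delete
```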
