diff --git a/configschema.json b/configschema.json
index d1a61da..96a19b5 100644
--- a/configschema.json
+++ b/configschema.json
@@ -75,6 +75,11 @@
         "description": "Minimum memory allocation pod is allowed to have as limit or request. In megabytes (Mi)",
         "default": 256
     },
+    "maximum_pod_gpu_val": {
+        "type": "integer",
+        "description": "Maximum GPU allocation pod is allowed to have in resources.gpus.",
+        "default": 1
+    },
     "spawner_host_id": {
         "type": "integer",
         "description": "Unique host_id for worker host. Each host should have at least one spawner and health check worker."
diff --git a/service/kubernetes_templates.py b/service/kubernetes_templates.py
index cb11262..735410e 100644
--- a/service/kubernetes_templates.py
+++ b/service/kubernetes_templates.py
@@ -232,10 +232,9 @@ def start_generic_pod(pod, image, revision: int):
         "mounts": [volumes, volume_mounts],
         "mem_request": pod.resources.get("mem_request"),
         "cpu_request": pod.resources.get("cpu_request"),
-        "gpu_request": pod.resources.get("gpu_request"),
         "mem_limit": pod.resources.get("mem_limit"),
         "cpu_limit": pod.resources.get("cpu_limit"),
-        "gpu_limit": pod.resources.get("gpu_limit"),
+        "gpus": pod.resources.get("gpus"),
         "user": None
     }
 
diff --git a/service/kubernetes_utils.py b/service/kubernetes_utils.py
index bbe413c..2201538 100644
--- a/service/kubernetes_utils.py
+++ b/service/kubernetes_utils.py
@@ -303,10 +303,9 @@ def create_pod(name: str,
                mounts: List = [],
                mem_request: str | None = None,
                cpu_request: str | None = None,
-               gpu_request: str | None = None,
                mem_limit: str | None = None,
                cpu_limit: str | None = None,
-               gpu_limit: str | None = None,
+               gpus: str | None = None,
                user: str | None = None,
                image_pull_policy: Literal["Always", "IfNotPresent", "Never"] = "Always"):
     """
@@ -382,21 +381,20 @@ def create_pod(name: str,
         resource_limits["memory"] = f"{mem_limit}Mi"
     if cpu_limit:
         resource_limits["cpu"] = f"{cpu_limit}m"
-    if gpu_limit:
-        resource_limits["nvidia.com/gpu"] = gpu_limit
     # Requests
     resource_requests = {}
     if mem_request:
         resource_requests["memory"] = f"{mem_request}Mi"
     if cpu_request:
         resource_requests["cpu"] = f"{cpu_request}m"
-    if gpu_request:
-        resource_requests["nvidia.com/gpu"] = gpu_request
+    # GPUs
+    if gpus:
+        resource_limits["nvidia.com/gpu"] = gpus
 
     # Define resource requirements if resource limits specified
     resources = client.V1ResourceRequirements(limits = resource_limits, requests = resource_requests)
     ## If GPU is requested.
-    if gpu_limit or gpu_request:
+    if gpus:
         node_selector = {"gpu": "v100"}
         toleration = client.V1Toleration(
             key="gpunode",
diff --git a/service/models_pods.py b/service/models_pods.py
index 779f570..386911f 100644
--- a/service/models_pods.py
+++ b/service/models_pods.py
@@ -104,8 +104,7 @@ class Resources(TapisModel):
     mem_request: int = Field(conf.default_pod_mem_request, description = "Memory allocation pod requests at startup. In megabytes (Mi)")
     mem_limit: int = Field(conf.default_pod_mem_limit, description = "Memory allocation pod is allowed to use. In megabytes (Mi)")
     # GPU
-    gpu_request: int = Field(0, description = "GPU allocation pod requests at startup. In integers of GPUs. (we only have 1 currently ;) )")
-    gpu_limit: int = Field(0, description = "GPU allocation pod is allowed to use. In integers of GPUs. (we only have 1 currently ;) )")
+    gpus: int = Field(0, description = "GPU allocation pod is allowed to use. In integers of GPUs. (we only have 1 currently ;) )")
 
     @validator('cpu_request', 'cpu_limit')
     def check_cpu_resources(cls, v):
@@ -120,16 +119,16 @@ def check_cpu_resources(cls, v):
     def check_mem_resources(cls, v):
         if conf.minimum_pod_mem_val > v or v > conf.maximum_pod_mem_val:
             raise ValueError(
-                f"resources.mem_x out of bounds. Received: {v}. Maximum: {conf.minimum_pod_mem_val}. Minimum: {conf.maximum_pod_mem_val}.",
+                f"resources.mem_x out of bounds. Received: {v}. Maximum: {conf.maximum_pod_mem_val}. Minimum: {conf.minimum_pod_mem_val}.",
                 " User requires extra role to break bounds. Contact admin."
             )
         return v
 
-    @validator('gpu_request', 'gpu_limit')
-    def check_gpu_resources(cls, v):
+    @validator('gpus')
+    def check_gpus(cls, v):
         if 0 > v or v > conf.maximum_pod_gpu_val:
             raise ValueError(
-                f"resources.gpu_x out of bounds. Received: {v}. Maximum: {conf.minimum_pod_gpu_val}. Minimum: 0.",
+                f"resources.gpus out of bounds. Received: {v}. Maximum: {conf.maximum_pod_gpu_val}. Minimum: 0.",
                 " User requires extra role to break bounds. Contact admin."
             )
         return v
@@ -140,8 +139,7 @@ def ensure_request_lessthan_limit(cls, values):
         cpu_limit = values.get("cpu_limit")
         mem_request = values.get("mem_request")
         mem_limit = values.get("mem_limit")
-        gpu_request = values.get("gpu_request")
-        gpu_limit = values.get("gpu_limit")
+        gpus = values.get("gpus") # There's no request/limit for gpus, just an int validated in check_gpus
 
         # Check cpu values
         if cpu_request and cpu_limit and cpu_request > cpu_limit:
@@ -151,14 +149,10 @@ def ensure_request_lessthan_limit(cls, values):
         if mem_request and mem_limit and mem_request > mem_limit:
             raise ValueError(f"resources.mem_x found mem_request({mem_request}) > mem_limit({mem_limit}). Request must be less than or equal to limit.")
 
-        # Check gpu values
-        if gpu_request and gpu_limit and gpu_request > gpu_limit:
-            raise ValueError(f"resources.gpu_x found gpu_request({gpu_request}) > gpu_limit({gpu_limit}). Request must be less than or equal to limit.")
-
         return values
 
 
-class VolumeMount(TapisModel):#
+class VolumeMount(TapisModel):
     type: str = Field("", description = "Type of volume to attach.")
     mount_path: str = Field("/tapis_volume_mount", description = "Path to mount volume to.")
     sub_path: str = Field("", description = "Path to mount volume to.")
diff --git a/service/templates/traefik-template.j2 b/service/templates/traefik-template.j2
index aa9fdf6..d273cc2 100644
--- a/service/templates/traefik-template.j2
+++ b/service/templates/traefik-template.j2
@@ -23,14 +23,6 @@ http:
       service: pods-service
       query: "/error-handler/{status}"
     fallback:
-      service: error-service
-
-  errors:
-    services:
-      error-service:
-        status:
-          - "500"
-        query: "/error-page"
 
   routers:
     dashboard:
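
Reviewer note: the separate `gpu_request`/`gpu_limit` pair collapses into a single `gpus` field because `nvidia.com/gpu` is a Kubernetes extended resource, and extended resources cannot be overcommitted: a pod may only set request == limit, and a limit given without a request implies the request. Below is a minimal sketch of the resulting resource handling, assuming the official `kubernetes` Python client; `build_resources` itself is a hypothetical helper whose parameter names mirror `create_pod`, not code from this PR.

```python
from kubernetes import client

def build_resources(mem_request: int | None = None,
                    cpu_request: int | None = None,
                    mem_limit: int | None = None,
                    cpu_limit: int | None = None,
                    gpus: int | None = None) -> client.V1ResourceRequirements:
    """Hypothetical helper mirroring create_pod's resource handling after this diff."""
    limits, requests = {}, {}
    if mem_limit:
        limits["memory"] = f"{mem_limit}Mi"
    if cpu_limit:
        limits["cpu"] = f"{cpu_limit}m"
    if mem_request:
        requests["memory"] = f"{mem_request}Mi"
    if cpu_request:
        requests["cpu"] = f"{cpu_request}m"
    if gpus:
        # Extended resources such as nvidia.com/gpu may only be scheduled with
        # request == limit; setting only the limit defaults the request to it,
        # so a single integer field is sufficient.
        limits["nvidia.com/gpu"] = gpus
    return client.V1ResourceRequirements(limits=limits, requests=requests)

# Example: build_resources(mem_limit=512, cpu_limit=1000, gpus=1) yields
# limits={'memory': '512Mi', 'cpu': '1000m', 'nvidia.com/gpu': 1} and
# requests={}, which Kubernetes treats as requesting exactly one GPU.
```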