Skip to content

Commit

Permalink
Simplifying gpu request to single variable
Browse files: browse the repository at this point in the history
  • Loading branch information
NotChristianGarcia committed Dec 15, 2023
1 parent d88dea1 commit e6fb469
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 30 deletions.
5 changes: 5 additions & 0 deletions configschema.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@
"description": "Minimum memory allocation pod is allowed to have as limit or request. In megabytes (Mi)",
"default": 256
},
"maximum_pod_gpu_val":{
"type": "integer",
"description": "Maximum GPU allocation pod is allowed to have in resources.gpus.",
"default": 1
},
"spawner_host_id": {
"type": "integer",
"description": "Unique host_id for worker host. Each host should have at least one spawner and health check worker."
Expand Down
3 changes: 1 addition & 2 deletions service/kubernetes_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,10 +232,9 @@ def start_generic_pod(pod, image, revision: int):
"mounts": [volumes, volume_mounts],
"mem_request": pod.resources.get("mem_request"),
"cpu_request": pod.resources.get("cpu_request"),
"gpu_request": pod.resources.get("gpu_request"),
"mem_limit": pod.resources.get("mem_limit"),
"cpu_limit": pod.resources.get("cpu_limit"),
"gpu_limit": pod.resources.get("gpu_limit"),
"gpus": pod.resources.get("gpus"),
"user": None
}

Expand Down
12 changes: 5 additions & 7 deletions service/kubernetes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,10 +303,9 @@ def create_pod(name: str,
mounts: List = [],
mem_request: str | None = None,
cpu_request: str | None = None,
gpu_request: str | None = None,
mem_limit: str | None = None,
cpu_limit: str | None = None,
gpu_limit: str | None = None,
gpus: str | None = None,
user: str | None = None,
image_pull_policy: Literal["Always", "IfNotPresent", "Never"] = "Always"):
"""
Expand Down Expand Up @@ -382,21 +381,20 @@ def create_pod(name: str,
resource_limits["memory"] = f"{mem_limit}Mi"
if cpu_limit:
resource_limits["cpu"] = f"{cpu_limit}m"
if gpu_limit:
resource_limits["nvidia.com/gpu"] = gpu_limit
# Requests
resource_requests = {}
if mem_request:
resource_requests["memory"] = f"{mem_request}Mi"
if cpu_request:
resource_requests["cpu"] = f"{cpu_request}m"
if gpu_request:
resource_requests["nvidia.com/gpu"] = gpu_request
# GPUs
if gpus:
resource_limits["nvidia.com/gpu"] = gpus
# Define resource requirements if resource limits specified
resources = client.V1ResourceRequirements(limits = resource_limits, requests = resource_requests)

## If GPU is requested.
if gpu_limit or gpu_request:
if gpus:
node_selector = {"gpu": "v100"}
toleration = client.V1Toleration(
key="gpunode",
Expand Down
20 changes: 7 additions & 13 deletions service/models_pods.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,7 @@ class Resources(TapisModel):
mem_request: int = Field(conf.default_pod_mem_request, description = "Memory allocation pod requests at startup. In megabytes (Mi)")
mem_limit: int = Field(conf.default_pod_mem_limit, description = "Memory allocation pod is allowed to use. In megabytes (Mi)")
# GPU
gpu_request: int = Field(0, description = "GPU allocation pod requests at startup. In integers of GPUs. (we only have 1 currently ;) )")
gpu_limit: int = Field(0, description = "GPU allocation pod is allowed to use. In integers of GPUs. (we only have 1 currently ;) )")
gpus: int = Field(0, description = "GPU allocation pod is allowed to use. In integers of GPUs. (we only have 1 currently ;) )")

@validator('cpu_request', 'cpu_limit')
def check_cpu_resources(cls, v):
Expand All @@ -120,16 +119,16 @@ def check_cpu_resources(cls, v):
def check_mem_resources(cls, v):
if conf.minimum_pod_mem_val > v or v > conf.maximum_pod_mem_val:
raise ValueError(
f"resources.mem_x out of bounds. Received: {v}. Maximum: {conf.minimum_pod_mem_val}. Minimum: {conf.maximum_pod_mem_val}.",
f"resources.mem_x out of bounds. Received: {v}. Maximum: {conf.maximum_pod_mem_val}. Minimum: {conf.minimum_pod_mem_val}.",
" User requires extra role to break bounds. Contact admin."
)
return v

@validator('gpu_request', 'gpu_limit')
def check_gpu_resources(cls, v):
@validator('gpus')
def check_gpus(cls, v):
if 0 > v or v > conf.maximum_pod_gpu_val:
raise ValueError(
f"resources.gpu_x out of bounds. Received: {v}. Maximum: {conf.minimum_pod_gpu_val}. Minimum: 0.",
f"resources.gpus out of bounds. Received: {v}. Maximum: {conf.maximum_pod_gpu_val}. Minimum: 0.",
" User requires extra role to break bounds. Contact admin."
)
return v
Expand All @@ -140,8 +139,7 @@ def ensure_request_lessthan_limit(cls, values):
cpu_limit = values.get("cpu_limit")
mem_request = values.get("mem_request")
mem_limit = values.get("mem_limit")
gpu_request = values.get("gpu_request")
gpu_limit = values.get("gpu_limit")
gpus = values.get("gpus") # There's no request/limit for gpus, just an int validated in check_gpus

# Check cpu values
if cpu_request and cpu_limit and cpu_request > cpu_limit:
Expand All @@ -151,14 +149,10 @@ def ensure_request_lessthan_limit(cls, values):
if mem_request and mem_limit and mem_request > mem_limit:
raise ValueError(f"resources.mem_x found mem_request({mem_request}) > mem_limit({mem_limit}). Request must be less than or equal to limit.")

# Check gpu values
if gpu_request and gpu_limit and gpu_request > gpu_limit:
raise ValueError(f"resources.gpu_x found gpu_request({gpu_request}) > gpu_limit({gpu_limit}). Request must be less than or equal to limit.")

return values


class VolumeMount(TapisModel):#
class VolumeMount(TapisModel):
type: str = Field("", description = "Type of volume to attach.")
mount_path: str = Field("/tapis_volume_mount", description = "Path to mount volume to.")
sub_path: str = Field("", description = "Path to mount volume to.")
Expand Down
8 changes: 0 additions & 8 deletions service/templates/traefik-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,6 @@ http:
service: pods-service
query: "/error-handler/{status}"
fallback:
service: error-service

errors:
services:
error-service:
status:
- "500"
query: "/error-page"

routers:
dashboard:
Expand Down

0 comments on commit e6fb469

Please sign in to comment.