Skip to content

Commit

Permalink
Handle exceptions in Celery worker initialization, set GPU memory limit based on environment variable and add gpu limits to deployment configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
devxpy committed Aug 12, 2024
1 parent b62238b commit a5e5fb1
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
13 changes: 11 additions & 2 deletions celeryconfig.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import traceback
import typing

from celery import Celery
from celery.exceptions import WorkerShutdown
from celery.signals import worker_init
from kombu import Queue

Expand Down Expand Up @@ -33,8 +35,15 @@ def setup_queues(
queue_prefix: str = os.environ.get("QUEUE_PREFIX", "gooey-gpu"),
):
def init(**kwargs):
for model_id in model_ids:
load_fn(model_id)
model_id = None
try:
for model_id in model_ids:
load_fn(model_id)
except:
# for some reason, celery seems to swallow exceptions in init
print(f"Error loading {model_id}:")
traceback.print_exc()
raise WorkerShutdown()

init_fns.append(init)

Expand Down
4 changes: 4 additions & 0 deletions chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ spec:
value: "{{ $value }}"
{{- end }}
{{- end }}
{{- range $name, $value := .limits }}
- name: "RESOURCE_LIMITS_{{ $name | upper }}"
value: "{{ $value }}"
{{- end }}
livenessProbe:
exec:
command: [ "bash", "-c", "celery inspect ping -d celery@$HOSTNAME" ]
Expand Down
11 changes: 11 additions & 0 deletions gooey_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,17 @@
or "/root/.cache/gooey-gpu/checkpoints"
)

try:
gpu_limit_gib = float(os.environ["RESOURCE_LIMITS_GPU"].removesuffix("Gi"))
except (KeyError, ValueError):
print("RESOURCE_LIMITS_GPU environment variable not set to a valid value.")
else:
total_mem_bytes = torch.cuda.mem_get_info()[1]
fraction = gpu_limit_gib * 1024**3 / total_mem_bytes
torch.cuda.set_per_process_memory_fraction(fraction)
print(f"GPU limit set to {gpu_limit_gib}Gi ({fraction:.2%})")


if SENTRY_DSN:
sentry_sdk.init(
dsn=SENTRY_DSN,
Expand Down

0 comments on commit a5e5fb1

Please sign in to comment.