Skip to content

Commit

Permalink
catch gpu task errors as user errors
Browse files (browse the repository at this point in the history)
  • Loading branch information
devxpy committed Feb 29, 2024
1 parent b94f601 commit eadda21
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 6 deletions.
1 change: 1 addition & 0 deletions celeryapp/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def gui_runner(
is_api_call: bool = False,
):
page = page_cls(request=SimpleNamespace(user=AppUser.objects.get(id=user_id)))
page.setup_sentry()
sr = page.run_doc_sr(run_id, uid)
sr.is_api_call = is_api_call

Expand Down
4 changes: 2 additions & 2 deletions daras_ai_v2/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def get_tab_url(self, tab: str) -> str:
tab_name=MenuTabs.paths[tab],
)

def setup_render(self):
def setup_sentry(self):
with sentry_sdk.configure_scope() as scope:
scope.set_extra("base_url", self.app_url())
scope.set_transaction_name(
Expand All @@ -194,7 +194,7 @@ def refresh_state(self):
st.session_state.update(output)

def render(self):
self.setup_render()
self.setup_sentry()

if self.get_run_state(st.session_state) == RecipeRunState.running:
self.refresh_state()
Expand Down
4 changes: 4 additions & 0 deletions daras_ai_v2/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ def __init__(self, message: str, sentry_level: str = "info"):
super().__init__(message)


class GPUError(UserError):
    """Error raised when a task dispatched to a GPU worker fails.

    Adds no behaviour of its own: it exists so that GPU task failures can be
    raised (and caught) as a distinct subtype of ``UserError``.
    """


FFMPEG_ERR_MSG = (
"Unsupported File Format\n\n"
"We encountered an issue processing your file as it appears to be in a format not supported by our system or may be corrupted. "
Expand Down
8 changes: 6 additions & 2 deletions daras_ai_v2/gpu_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from daras_ai.image_input import storage_blob_for
from daras_ai_v2 import settings
from daras_ai_v2.exceptions import raise_for_status
from daras_ai_v2.exceptions import raise_for_status, GPUError
from gooeysite.bg_db_conn import get_celery_result_db_safe


Expand Down Expand Up @@ -160,7 +160,11 @@ def call_celery_task(
task_name, kwargs=dict(pipeline=pipeline, inputs=inputs), queue=queue
)
s = time()
ret = get_celery_result_db_safe(result)
ret = get_celery_result_db_safe(result, propagate=False)
try:
result.maybe_throw()
except Exception as e:
raise GPUError(f"Error in GPU Task {queue}:{task_name} - {e}") from e
record_cost_auto(
model=queue, sku=ModelSku.gpu_ms, quantity=int((time() - s) * 1000)
)
Expand Down
6 changes: 4 additions & 2 deletions gooeysite/bg_db_conn.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,7 @@ def wrapper(*args, **kwargs):


@db_middleware
def get_celery_result_db_safe(result: "celery.result.AsyncResult") -> typing.Any:
return result.get(disable_sync_subtasks=False)
def get_celery_result_db_safe(
    result: "celery.result.AsyncResult", **kwargs
) -> typing.Any:
    """Wait for a celery result and return whatever ``result.get`` returns.

    Args:
        result: The async result handle to wait on.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``result.get`` (for example ``propagate=False``).

    Returns:
        The value produced by ``result.get``.

    Note:
        ``disable_sync_subtasks=False`` is always passed, so supplying that
        keyword in ``kwargs`` as well raises ``TypeError``.
    """
    # NOTE(review): the "db_safe" in the name presumably refers to the
    # decorator applied to this function in the file -- confirm there.
    value = result.get(disable_sync_subtasks=False, **kwargs)
    return value

0 comments on commit eadda21

Please sign in to comment.