From 290f964ea5192c8067f8a2f03e1e530f4a893821 Mon Sep 17 00:00:00 2001
From: Siddharth Venkatesan
Date: Wed, 22 Jan 2025 10:43:13 -0800
Subject: [PATCH] [patch][port] forward port of SERVING_HEALTH_CHECK_OVERRIDE from 0.28.0 (#2673)

---
 .../java/ai/djl/serving/models/ModelManager.java    | 12 ++++++++++++
 wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java | 12 +++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/serving/src/main/java/ai/djl/serving/models/ModelManager.java b/serving/src/main/java/ai/djl/serving/models/ModelManager.java
index 482bb10b2..515cb22bc 100644
--- a/serving/src/main/java/ai/djl/serving/models/ModelManager.java
+++ b/serving/src/main/java/ai/djl/serving/models/ModelManager.java
@@ -29,6 +29,7 @@
 import ai.djl.serving.wlm.WorkerPool;
 import ai.djl.serving.wlm.WorkerPoolConfig;
 import ai.djl.serving.workflow.Workflow;
+import ai.djl.util.Utils;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -416,6 +417,17 @@ public CompletableFuture<Map<String, StatusResponse>> workerStatus() {
                                         if (wlm.getWorkerPool(wpc).isFullyScaled()) {
                                             data.put(modelName, new StatusResponse("Healthy"));
                                         } else {
+                                            boolean sageMakerHealthCheckOverride =
+                                                    Boolean.parseBoolean(
+                                                            Utils.getEnvOrSystemProperty(
+                                                                    "SAGEMAKER_HEALTH_CHECK_OVERRIDE"));
+                                            if (sageMakerHealthCheckOverride) {
+                                                logger.info(
+                                                        "SAGEMAKER_HEALTH_CHECK_OVERRIDE is"
+                                                                + " enabled. Failing ping as"
+                                                                + " requested");
+                                                hasFailure = true;
+                                            }
                                             data.put(modelName, new StatusResponse("Unhealthy"));
                                         }
                                         break;
diff --git a/wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java b/wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java
index eba1e2e02..9e1e00e0a 100644
--- a/wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java
+++ b/wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java
@@ -397,7 +397,17 @@ public Status getStatus() {
                         // SIGKILL (9 + 128)
                         System.exit(137); // NOPMD
                     }
-
+                    boolean isHealthCheckOverrideEnabled =
+                            Boolean.parseBoolean(
+                                    Utils.getEnvOrSystemProperty("SERVING_HEALTH_CHECK_OVERRIDE"));
+                    if (isHealthCheckOverrideEnabled) {
+                        logger.info(
+                                "SERVING_HEALTH_CHECK_OVERRIDE is enabled. At least 1 model worker"
+                                        + " has exhausted all retries. Not marking model as failed."
+                                        + " Current failure count is {}",
+                                failures);
+                        return Status.READY;
+                    }
                     return Status.FAILED;
                 }
             }