From 3c678b0bfa6dfa369606e7193ab5b1ce4a47fa5d Mon Sep 17 00:00:00 2001 From: jvstme <36324149+jvstme@users.noreply.github.com> Date: Thu, 26 Dec 2024 08:59:09 +0000 Subject: [PATCH] Fix certbot process getting stuck in dstack-proxy (#2143) **Problem**: if obtaining a certificate times out, `subprocess.run` cancels the certbot command with SIGKILL. However, the command is run with sudo, so SIGKILL actually kills the sudo process, while its child certbot process is adopted by init and keeps running indefinitely. This breaks adding or updating services on the gateway, since certbot refuses to run if another certbot process exists. Attempts to implement graceful cancelling by sending SIGTERM to the sudo process were not successful - for some reason sudo ignores SIGTERM originating from its parent. The solution in this commit uses the `timeout` shell command to set the timeout on the actual certbot process instead of the sudo process. --- .../_internal/proxy/gateway/services/nginx.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/dstack/_internal/proxy/gateway/services/nginx.py b/src/dstack/_internal/proxy/gateway/services/nginx.py index 51b4e7dd0..469c8f5cd 100644 --- a/src/dstack/_internal/proxy/gateway/services/nginx.py +++ b/src/dstack/_internal/proxy/gateway/services/nginx.py @@ -14,7 +14,8 @@ from dstack._internal.utils.common import run_async from dstack._internal.utils.logging import get_logger -CERTBOT_TIMEOUT = 30 +CERTBOT_TIMEOUT = 40 +CERTBOT_2ND_TIMEOUT = 5 CONFIGS_DIR = Path("/etc/nginx/sites-enabled") logger = get_logger(__name__) @@ -108,7 +109,8 @@ def write_conf(self, conf: str, conf_name: str) -> None: def run_certbot(domain: str, acme: ACMESettings) -> None: logger.info("Running certbot for %s", domain) - cmd = ["sudo", "certbot", "certonly"] + cmd = ["sudo", "timeout", "--kill-after", str(CERTBOT_2ND_TIMEOUT), str(CERTBOT_TIMEOUT)] + cmd += ["certbot", "certonly"] cmd += ["--non-interactive", "--agree-tos", "--register-unsafely-without-email"] cmd += ["--keep", "--nginx", "--domain", domain] @@ -119,13 +121,16 @@ def run_certbot(domain: str, acme: ACMESettings) -> None: cmd += ["--eab-kid", acme.eab_kid] cmd += ["--eab-hmac-key", acme.eab_hmac_key] - try: - r = subprocess.run(cmd, capture_output=True, timeout=CERTBOT_TIMEOUT) - except subprocess.TimeoutExpired as e: + r = subprocess.run( + cmd, + capture_output=True, + timeout=CERTBOT_TIMEOUT + CERTBOT_2ND_TIMEOUT + 1, # shouldn't happen + ) + if r.returncode == 124: raise ProxyError( f"Could not obtain {domain} TLS certificate in {CERTBOT_TIMEOUT}s." " Make sure DNS records are configured for this gateway." - ) from e + ) if r.returncode != 0: raise ProxyError(f"Error obtaining {domain} TLS certificate:\n{r.stderr.decode()}")