From 40b6853693fc5140a8ff1d7ac85c97f38966f744 Mon Sep 17 00:00:00 2001 From: Shane A Date: Tue, 23 Apr 2024 13:40:40 -0700 Subject: [PATCH] Catch and ignore CommError during W&B cancel check --- olmo/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/olmo/train.py b/olmo/train.py index e9ddf95f9..ab4d871b2 100644 --- a/olmo/train.py +++ b/olmo/train.py @@ -914,6 +914,7 @@ def check_if_cancelled(self) -> Tuple[bool, int]: # Finally, check if someone canceled the run from W&B by adding the 'cancel' / 'canceled' tag.. # We won't see it in the run object. So we have to use the import/export API to check. from requests.exceptions import RequestException + from wandb.errors import CommError try: api = wandb.Api(api_key=api_key) @@ -924,8 +925,8 @@ def check_if_cancelled(self) -> Tuple[bool, int]: cancel_reason = "Weights & Biases tag" extra_steps = self.cfg.extra_steps_after_cancel break - except RequestException: - pass + except (RequestException, CommError): + log.info("Failed to check if W&B run is cancelled, continuing run.") run_canceled = synchronize_flag(should_cancel, self.device) if run_canceled: