-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor/improve error handling flow (#36)
* refactor: improve error handling
* fix: typo
* chore: add more debug logging
- Loading branch information
1 parent
7ed7b3d
commit 374aa06
Showing
3 changed files
with
72 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from loguru import logger | ||
from client.fed_ledger import FedLedger | ||
import sys | ||
|
||
|
||
def handle_os_error(e: OSError):
    """Terminate the process with an exit code chosen from the kind of OSError.

    A disk-full condition exits with code 101. Any other OSError exits with
    code 100, which the log message describes as leading to a restart.

    Args:
        e: The OSError caught by the caller.

    Raises:
        SystemExit: Always; this function never returns normally.
    """
    # Function-scope import so this block does not depend on a file-level
    # import being added elsewhere.
    import errno

    # Check the errno first: the human-readable message can vary with the
    # platform/locale or be missing entirely, while ENOSPC is stable. The
    # substring match is kept as a fallback for exceptions that only carry
    # the text (e.g. re-wrapped errors without an errno).
    is_disk_full = (
        getattr(e, "errno", None) == errno.ENOSPC
        or "No space left on device" in str(e)
    )

    if is_disk_full:
        logger.error("No more disk space, exiting with code 101")
        sys.exit(101)

    logger.error("Unknown OSError detected, exiting with code 100, will restart...")
    sys.exit(100)
|
||
|
||
def handle_runtime_error(e: RuntimeError, assignment_id: str, client: FedLedger):
    """React to a RuntimeError based on the text of its message.

    * A CUDA device-side assert exits the process with code 100.
    * An out-of-memory error marks the assignment as failed through
      ``client`` and then returns normally.
    * Anything else exits the process with code 100.

    Args:
        e: The RuntimeError caught by the caller.
        assignment_id: Identifier passed to ``client.mark_assignment_as_failed``.
        client: Ledger client used to report the failed assignment.

    Raises:
        SystemExit: For every case except the out-of-memory one.
    """
    details = str(e)

    # The device-side assert is checked first, so it wins even if the message
    # also mentions "out of memory".
    if "CUDA error: device-side assert triggered" in details:
        logger.error(
            "CUDA device-side assert triggered error detected, exiting with code 100, will restart..."
        )
        sys.exit(100)

    if "out of memory" not in details:
        logger.error(
            "Unknown RuntimeError detected, exiting with code 100, will restart..."
        )
        sys.exit(100)

    # Out-of-memory is the only path that does not exit: report and return.
    logger.error(
        "CUDA out of memory error detected, will mark the assignment as failed"
    )
    client.mark_assignment_as_failed(assignment_id)
|
||
|
||
def handle_value_error(e: ValueError, assignment_id: str, client: FedLedger):
    """Terminate the process with an exit code chosen from the ValueError text.

    An FP16 mixed-precision (AMP/APEX) error exits with code 101; every other
    ValueError exits with code 100.

    Args:
        e: The ValueError caught by the caller.
        assignment_id: Not used by this function.
        client: Not used by this function.

    Raises:
        SystemExit: Always; this function never returns normally.
    """
    # Pick the message and exit code first, then log and exit in one place.
    if "FP16 Mixed precision training with AMP or APEX" in str(e):
        exit_code = 101
        log_line = "FP16 Mixed precision training with AMP or APEX error detected, exiting with code 101"
    else:
        exit_code = 100
        log_line = "Unknown ValueError detected, exiting with code 100, will restart..."

    logger.error(log_line)
    sys.exit(exit_code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters