Skip to content

Commit

Permalink
cuda: fix launch cuda-checkpoint
Browse files Browse the repository at this point in the history
When the cuda-checkpoint tool is not installed, execvp() is expected to
fail and return -1. In this case, we need to call _exit() to terminate
the child process that was created earlier with fork().

Update log messages to show a debug message when cuda-checkpoint is not
available in $PATH, and a warning when the --action flag is not supported.

Signed-off-by: Radostin Stoyanov <[email protected]>
  • Loading branch information
rst0git committed Jul 20, 2024
1 parent 1f3bd4f commit 40f19dd
Showing 1 changed file with 33 additions and 12 deletions.
45 changes: 33 additions & 12 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,12 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)

close_fds(STDERR_FILENO + 1);

return execvp(args[0], (char **)args);
execvp(args[0], (char **)args);

/* We can't use pr_error() as log file fd is closed. */
fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno));

_exit(EXIT_FAILURE);
} else { // parent
close(fd[WRITE]);

Expand Down Expand Up @@ -151,27 +156,35 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
return -1;
}

if (WEXITSTATUS(status) != EXIT_SUCCESS)
pr_debug("%s\n", buf);

close(fd[READ]);

return WEXITSTATUS(status);
}
}

static bool cuda_checkpoint_supports_flag(const char *flag)
/**
* Checks if a given flag is supported by the cuda-checkpoint utility
*
* Returns:
* 1 if the flag is supported,
* 0 if the flag is not supported,
* -1 if there was an error launching the cuda-checkpoint utility.
*/
static int cuda_checkpoint_supports_flag(const char *flag)
{
char msg_buf[2048];
const char *args[] = { CUDA_CHECKPOINT, "-h", NULL };
int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf));
if (ret != 0) {
pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n");
return false;
}

if (strstr(msg_buf, flag) == NULL) {
return false;
}
if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0)
return -1;

return true;
if (strstr(msg_buf, flag) == NULL)
return 0;

return 1;
}

/* Retrieve the cuda restore thread TID from the root pid */
Expand Down Expand Up @@ -419,7 +432,15 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_

int cuda_plugin_init(int stage)
{
if (!cuda_checkpoint_supports_flag("--action")) {
int ret = cuda_checkpoint_supports_flag("--action");

if (ret == -1) {
pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT);
plugin_disabled = true;
return 0;
}

if (ret == 0) {
pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
plugin_disabled = true;
return 0;
Expand Down

0 comments on commit 40f19dd

Please sign in to comment.