Skip to content

Commit

Permalink
cuda: unlock on timeout error
Browse files Browse the repository at this point in the history
When attempting to checkpoint a container with CUDA processes,
CRIU could fail with the following error:

	Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 1
	Error (cuda_plugin.c:143): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call
	Error (cuda_plugin.c:384): cuda_plugin: PAUSE_DEVICES failed with

In this situation, the target process is locked, but CRIU fails due to
a timeout and exits with an error. We need to make sure that the target
PID is unlocked in such a case.

Signed-off-by: Radostin Stoyanov <[email protected]>
  • Loading branch information
rst0git committed Aug 16, 2024
1 parent 5ba1f84 commit 39d29f3
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "cr_options.h"
#include "pid.h"
#include "proc_parse.h"
#include "seize.h"

#include <common/list.h>
#include <compel/infect.h>
Expand Down Expand Up @@ -379,18 +380,23 @@ int cuda_plugin_pause_devices(int pid)
int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));
if (status) {
pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
if (alarm_timeouted())
goto unlock;
return -1;
}

if (add_pid_to_buf(&cuda_pids, pid)) {
pr_err("unable to track paused pid %d\n", pid);
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
if (status) {
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
}
return -1;
goto unlock;
}

return 0;
unlock:
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
if (status) {
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
}
return -1;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)

Expand Down

0 comments on commit 39d29f3

Please sign in to comment.