Skip to content

Commit

Permalink
criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_…
Browse files Browse the repository at this point in the history
…DEVICES to be used during pstree collection

PAUSE_DEVICES is called before a process is frozen and is used by the CUDA
plugin to quiesce any pending work and place the process in a state that's
ready to be checkpointed.

CHECKPOINT_DEVICES is called after all processes in the tree have been frozen
and paused via PAUSE_DEVICES, and it performs the actual checkpointing
operation for CUDA applications.

Signed-off-by: Jesus Ramos <[email protected]>
  • Loading branch information
jesus-ramos committed Jun 7, 2024
1 parent 1d383a7 commit c8755f1
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 0 deletions.
6 changes: 6 additions & 0 deletions criu/include/criu-plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ enum {

CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9,

CR_PLUGIN_HOOK__PAUSE_DEVICES = 10,

CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,

CR_PLUGIN_HOOK__MAX
};

Expand All @@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr,
const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);

enum {
CR_PLUGIN_STAGE__DUMP,
Expand Down
2 changes: 2 additions & 0 deletions criu/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma");
__assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map");
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");

#undef __assign_hook

Expand Down
18 changes: 18 additions & 0 deletions criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "pstree.h"
#include "criu-log.h"
#include <compel/ptrace.h>
#include "plugin.h"
#include "proc_parse.h"
#include "seccomp.h"
#include "seize.h"
Expand Down Expand Up @@ -637,6 +638,11 @@ static int collect_children(struct pstree_item *item)
goto free;
}

ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto free;
}

if (!opts.freeze_cgroup)
/* fails when meets a zombie */
__ignore_value(compel_interrupt_task(pid));
Expand Down Expand Up @@ -966,6 +972,7 @@ int collect_pstree(void)
pid_t pid = root_item->pid->real;
int ret = -1;
struct proc_status_creds creds;
struct pstree_item *iter;

timing_start(TIME_FREEZING);

Expand All @@ -984,6 +991,11 @@ int collect_pstree(void)
if (opts.freeze_cgroup && freeze_processes())
goto err;

ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto err;
}

if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
set_cr_errno(ESRCH);
goto err;
Expand Down Expand Up @@ -1017,6 +1029,12 @@ int collect_pstree(void)
goto err;
}

for_each_pstree_item(iter) {
ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real);
if (ret < 0 && ret != -ENOTSUP)
goto err;
}

ret = 0;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);
Expand Down

0 comments on commit c8755f1

Please sign in to comment.