From 1d383a7900ab2956474d829b96ce749e5bf98413 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Thu, 6 Jun 2024 11:12:39 -0700 Subject: [PATCH 1/3] criu: Restore rseq_cs state slightly earlier in the restore sequence and run the plugin finalizer later in the dump sequence Restore rseq_cs state before calling RESUME_DEVICES_LATE as the CUDA plugin will temporarily unfreeze a thread during the plugin hook to assist with device restore Run the plugin finalizer later in the dump sequence since the finalizer is used by the CUDA plugin to handle some process cleanup Signed-off-by: Jesus Ramos --- criu/cr-dump.c | 4 +++- criu/cr-restore.c | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 199ff2e322..2e7ef30f00 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2035,7 +2035,6 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -2089,6 +2088,9 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + pstree_switch_state(root_item, (ret || post_dump_ret) ? 
TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index deecb12946..4db2f4ecfc 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2224,6 +2224,11 @@ static int restore_root_task(struct pstree_item *init) } finalize_restore(); + + /* just before releasing threads we have to restore rseq_cs */ + if (restore_rseq_cs()) + pr_err("Unable to restore rseq_cs state\n"); + /* * Some external devices such as GPUs might need a very late * trigger to kick-off some events, memory notifiers and for @@ -2255,10 +2260,6 @@ static int restore_root_task(struct pstree_item *init) if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); - /* just before releasing threads we have to restore rseq_cs */ - if (restore_rseq_cs()) - pr_err("Unable to restore rseq_cs state\n"); - /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; From c8755f13a5f35cc0dba919512820aa5c3fc3a82e Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Thu, 6 Jun 2024 11:16:07 -0700 Subject: [PATCH 2/3] criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection PAUSE_DEVICES is called before a process is frozen and is used by the CUDA plugin to place the process in a state that's ready to be checkpointed and quiesce any pending work CHECKPOINT_DEVICES is called after all processes in the tree have been frozen and PAUSE'd and performs the actual checkpointing operation for CUDA applications Signed-off-by: Jesus Ramos --- criu/include/criu-plugin.h | 6 ++++++ criu/plugin.c | 2 ++ criu/seize.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 886832eaaa..392ea9f534 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -56,6 +56,10 @@ enum { CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, 
+ CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, + + CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__MAX }; @@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index f3fea28566..58b5ea5bfe 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); + __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); + __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); #undef __assign_hook diff --git a/criu/seize.c b/criu/seize.c index 91090ae1a7..d392259bc5 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -16,6 +16,7 @@ #include "pstree.h" #include "criu-log.h" #include +#include "plugin.h" #include "proc_parse.h" #include "seccomp.h" #include "seize.h" @@ -637,6 +638,11 @@ static int collect_children(struct pstree_item *item) goto free; } + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto free; + } + if (!opts.freeze_cgroup) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -966,6 +972,7 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret = -1; struct proc_status_creds creds; + struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -984,6 +991,11 @@ int collect_pstree(void) if (opts.freeze_cgroup && freeze_processes()) goto err; + ret = 
run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1017,6 +1029,12 @@ int collect_pstree(void) goto err; } + for_each_pstree_item(iter) { + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + ret = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); From 9bc90e4db34e83f919be6b9f1cc8c455f381286d Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Fri, 31 May 2024 13:38:54 -0700 Subject: [PATCH 3/3] criu/plugin: Add NVIDIA CUDA plugin Adding support for the NVIDIA cuda-checkpoint utility, requires the use of an r555 or higher driver along with the cuda-checkpoint binary. Signed-off-by: Jesus Ramos --- Makefile | 15 +- Makefile.install | 7 +- plugins/cuda/Makefile | 42 ++++ plugins/cuda/README.md | 59 +++++ plugins/cuda/cuda_plugin.c | 459 +++++++++++++++++++++++++++++++++++++ 5 files changed, 578 insertions(+), 4 deletions(-) create mode 100644 plugins/cuda/Makefile create mode 100644 plugins/cuda/README.md create mode 100644 plugins/cuda/cuda_plugin.c diff --git a/Makefile b/Makefile index 6a17a30b5a..172d4b5177 100644 --- a/Makefile +++ b/Makefile @@ -165,7 +165,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: flog criu lib crit +all: flog criu lib crit cuda_plugin .PHONY: all # @@ -308,15 +308,19 @@ clean-amdgpu_plugin: $(Q) $(MAKE) -C plugins/amdgpu clean .PHONY: clean-amdgpu_plugin +clean-cuda_plugin: + $(Q) $(MAKE) -C plugins/cuda clean +.PHONY: clean-cuda_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top clean-amdgpu_plugin +clean: clean-top clean-amdgpu_plugin clean-cuda_plugin -mrproper-top: clean-top clean-amdgpu_plugin +mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin $(Q) $(RM) 
$(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -348,6 +352,10 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +cuda_plugin: criu + $(Q) $(MAKE) -C plugins/cuda all +.PHONY: cuda_plugin + crit: lib $(Q) $(MAKE) -C crit .PHONY: crit @@ -434,6 +442,7 @@ help: @echo ' lint - Run code linters' @echo ' indent - Indent C code' @echo ' amdgpu_plugin - Make AMD GPU plugin' + @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help ruff: diff --git a/Makefile.install b/Makefile.install index 6f5b31924d..49233babd1 100644 --- a/Makefile.install +++ b/Makefile.install @@ -49,12 +49,16 @@ install-amdgpu_plugin: amdgpu_plugin $(Q) $(MAKE) -C plugins/amdgpu install .PHONY: install-amdgpu_plugin +install-cuda_plugin: cuda_plugin + $(Q) $(MAKE) -C plugins/cuda install +.PHONY: install-cuda_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; .PHONY: install uninstall: @@ -65,4 +69,5 @@ uninstall: $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C plugins/amdgpu $@ + $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile new file mode 100644 index 0000000000..2eabc0e314 --- /dev/null +++ b/plugins/cuda/Makefile @@ -0,0 +1,42 @@ +PLUGIN_NAME := cuda_plugin +PLUGIN_SOBJ := cuda_plugin.so + +DEPS_CUDA := $(PLUGIN_SOBJ) + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ + +COMPEL := ../../compel/compel-host + +CC := gcc +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC + 
+__nmk_dir ?= ../../scripts/nmk/scripts/ +include $(__nmk_dir)msg.mk + +all: $(DEPS_CUDA) + +cuda_plugin.so: cuda_plugin.c + $(call msg-gen, $@) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + +clean: + $(call msg-clean, $@) + $(Q) $(RM) $(PLUGIN_SOBJ) +.PHONY: clean + +mrproper: clean + +install: + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) + $(E) " INSTALL " $(PLUGIN_NAME) + $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) +.PHONY: install + +uninstall: + $(E) " UNINSTALL" $(PLUGIN_NAME) + $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) +.PHONY: uninstall + diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md new file mode 100644 index 0000000000..7b91f69989 --- /dev/null +++ b/plugins/cuda/README.md @@ -0,0 +1,59 @@ +Checkpoint and Restore for CUDA applications with CRIU +====================================================== + +# Requirements +The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555 +or higher GPU driver is required for CUDA CRIU integration support. + +## cuda-checkpoint +The cuda-checkpoint utility can be found at: +https://github.com/NVIDIA/cuda-checkpoint + +cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA +applications. Updating the cuda-checkpoint utility between driver releases +should not be necessary as the utility simply exposes some extra driver behavior +so driver updates are all that's needed to get access to newer features. + +# Checkpointing Procedure +cuda-checkpoint exposes 4 actions used in the checkpointing process: lock, +checkpoint, restore, unlock. 
+
+* lock - Used with the PAUSE_DEVICES hook while a process is still running to
+  quiesce the application into a state where it can be checkpointed
+* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been
+  seized/frozen to perform the actual checkpointing operation
+* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA
+  state and release the process back to its running state
+
+These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA
+plugin will re-wake when needed.
+
+# Known Limitations
+* Currently GPU memory contents are brought into main system memory and CRIU
+  then checkpoints that as part of the normal procedure. On systems with many
+  GPUs with high GPU memory usage this can cause memory thrashing. A future
+  CUDA release will add support for dumping the memory contents to files to
+  alleviate this as well as support in the CRIU plugin.
+* There's currently a small race between when a PAUSE_DEVICES hook is called on
+  a running process and a process calls cuInit() and finishes initializing CUDA
+  after the PAUSE is issued but before the process is frozen to checkpoint. This
+  will cause cuda-checkpoint to report that the process is in an illegal state
+  for checkpointing and it's recommended to just attempt the CRIU procedure
+  again; this should be very rare.
+* Applications that use NVML will leave some leftover device references as NVML
+  is not currently supported for checkpointing. There will be support for this
+  in later drivers. A possible temporary workaround is to have the
+  {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N}
+  remaining references for these applications as in most cases NVML is used to
+  get info such as gpu count and some capabilities and these values are never
+  accessed again and unlikely to change.
+* CUDA applications that fork() but don't call exec() but also don't issue any
+  CUDA API calls will have some leftover references to /dev/nvidia* and fail to
+  checkpoint as a result. This can be worked around in a similar fashion to the
+  NVML case where the leftover references can be ignored as CUDA is not fork()
+  safe anyway.
+* Restore currently requires that you restore on a system with similar GPUs and
+  same GPU count.
+* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process
+  Service) are currently not supported for checkpointing. Future CUDA releases
+  will add support for these.
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c
new file mode 100644
index 0000000000..b3f2fc8df7
--- /dev/null
+++ b/plugins/cuda/cuda_plugin.c
@@ -0,0 +1,539 @@
+#include "criu-log.h"
+#include "plugin.h"
+#include "util.h"
+#include "cr_options.h"
+#include "pid.h"
+#include "proc_parse.h"
+
+#include <errno.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+/* cuda-checkpoint binary should live in your PATH */
+#define CUDA_CHECKPOINT "cuda-checkpoint"
+
+/* cuda-checkpoint --action flags */
+#define ACTION_LOCK "lock"
+#define ACTION_CHECKPOINT "checkpoint"
+#define ACTION_RESTORE "restore"
+#define ACTION_UNLOCK "unlock"
+
+#define CUDA_CKPT_BUF_SIZE (128)
+
+#ifdef LOG_PREFIX
+#undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "cuda_plugin: "
+
+/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver
+ * version doesn't support --action flag
+ */
+static bool plugin_disabled = false;
+
+struct pid_info {
+	int pid;
+	char checkpointed;
+	struct list_head list;
+};
+
+/* Used to track which PID's we've paused CUDA operations on so far so we can
+ * release them after we're done with the DUMP
+ */
+static struct list_head cuda_pids;
+
+/* Unlink and free every entry of the tracked PID list */
+static void dealloc_pid_buffer(struct list_head *pid_buf)
+{
+	struct pid_info *info;
+	struct pid_info *n;
+
+	list_for_each_entry_safe(info, n, pid_buf, list) {
+		list_del(&info->list);
+		xfree(info);
+	}
+}
+
+/* Append pid to the tracked PID list, initially not checkpointed.
+ * Returns 0 on success, -1 if the entry could not be allocated
+ */
+static int add_pid_to_buf(struct list_head *pid_buf, int pid)
+{
+	struct pid_info *new = xmalloc(sizeof(*new));
+
+	if (new == NULL) {
+		return -1;
+	}
+
+	new->pid = pid;
+	new->checkpointed = 0;
+	list_add_tail(&new->list, pid_buf);
+
+	return 0;
+}
+
+/* Mark pid as checkpointed in the tracked PID list.
+ * Returns 0 on success, -1 if pid is not in the list
+ */
+static int update_checkpointed_pid(struct list_head *pid_buf, int pid)
+{
+	struct pid_info *info;
+
+	list_for_each_entry(info, pid_buf, list) {
+		if (info->pid == pid) {
+			info->checkpointed = 1;
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+/* Run the command in args (NULL terminated argv, looked up in $PATH) and
+ * capture the start of its combined stdout/stderr in buf as a NUL terminated
+ * string with one trailing newline dropped; output that does not fit is
+ * discarded.
+ *
+ * Returns the exit status of the command, or -1 if it could not be run or did
+ * not exit normally.
+ */
+static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
+{
+	enum { PIPE_READ = 0, PIPE_WRITE = 1 };
+	char scratch[64];
+	size_t used = 0;
+	size_t room;
+	bool read_failed = false;
+	pid_t child_pid;
+	pid_t waited;
+	int status;
+	int fd[2];
+
+	if (buf == NULL || buf_size <= 0) {
+		return -1;
+	}
+	buf[0] = '\0';
+	/* Keep one byte for the terminator */
+	room = (size_t)buf_size - 1;
+
+	if (pipe(fd) != 0) {
+		pr_err("Couldn't create pipes for reading cuda-checkpoint output\n");
+		return -1;
+	}
+
+	child_pid = fork();
+	if (child_pid == -1) {
+		pr_err("Failed to fork to exec cuda-checkpoint\n");
+		close(fd[PIPE_READ]);
+		close(fd[PIPE_WRITE]);
+		return -1;
+	}
+
+	if (child_pid == 0) {
+		/* The child must never return into the caller: that would leave a
+		 * second copy of CRIU running, so every failure ends in _exit()
+		 */
+		if (dup2(fd[PIPE_WRITE], STDOUT_FILENO) == -1 || dup2(fd[PIPE_WRITE], STDERR_FILENO) == -1) {
+			_exit(EXIT_FAILURE);
+		}
+		close(fd[PIPE_READ]);
+		if (fd[PIPE_WRITE] > STDERR_FILENO) {
+			close(fd[PIPE_WRITE]);
+		}
+		execvp(args[0], (char **)args);
+		_exit(EXIT_FAILURE);
+	}
+
+	close(fd[PIPE_WRITE]);
+
+	/* Fill buf first, then keep reading into a scratch buffer until EOF so
+	 * that the child never blocks on a full pipe
+	 */
+	for (;;) {
+		char *dst = used < room ? buf + used : scratch;
+		size_t len = used < room ? room - used : sizeof(scratch);
+		ssize_t bytes_read = read(fd[PIPE_READ], dst, len);
+
+		if (bytes_read == -1 && errno == EINTR) {
+			continue;
+		}
+		if (bytes_read == -1) {
+			pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n");
+			read_failed = true;
+			break;
+		}
+		if (bytes_read == 0) {
+			break;
+		}
+		if (used < room) {
+			used += (size_t)bytes_read;
+		}
+	}
+	close(fd[PIPE_READ]);
+
+	/* Drop one trailing newline so the output can be logged inline */
+	if (used > 0 && buf[used - 1] == '\n') {
+		used--;
+	}
+	buf[used] = '\0';
+
+	/* Always reap the child, even after a read error, to avoid a zombie */
+	do {
+		waited = waitpid(child_pid, &status, 0);
+	} while (waited == -1 && errno == EINTR);
+	if (waited == -1 || !WIFEXITED(status)) {
+		pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n");
+		return -1;
+	}
+
+	return read_failed ? -1 : WEXITSTATUS(status);
+}
+
+/* Check whether the cuda-checkpoint help output mentions flag */
+static bool cuda_checkpoint_supports_flag(const char *flag)
+{
+	char msg_buf[2048];
+	const char *args[] = { CUDA_CHECKPOINT, "-h", NULL };
+	int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf));
+
+	if (ret != 0) {
+		pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n");
+		return false;
+	}
+
+	return strstr(msg_buf, flag) != NULL;
+}
+
+/* Retrieve the cuda restore thread TID from the root pid.
+ * Returns -1 if cuda-checkpoint fails or does not print a valid TID
+ */
+static int get_cuda_restore_tid(int root_pid)
+{
+	char pid_buf[16];
+	char pid_out[CUDA_CKPT_BUF_SIZE];
+	char *end;
+	long tid;
+
+	snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid);
+
+	const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL };
+	int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out));
+	if (ret != 0) {
+		pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out);
+		return -1;
+	}
+
+	/* atoi() would silently turn unexpected output into TID 0 */
+	errno = 0;
+	tid = strtol(pid_out, &end, 10);
+	if (end == pid_out || errno == ERANGE || tid <= 0 || tid > INT_MAX) {
+		pr_err("Unexpected restore tid reported by cuda-checkpoint: %s\n", pid_out);
+		return -1;
+	}
+
+	return (int)tid;
+}
+
+/* Issue "cuda-checkpoint --action <action> --pid <pid>", adding --timeout
+ * when timeout is non-zero. The utility's output is stored in msg_buf.
+ * Returns the result of launch_cuda_checkpoint()
+ */
+static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf,
+					  int buf_size)
+{
+	char pid_buf[16];
+	char timeout_buf[16];
+
+	snprintf(pid_buf, sizeof(pid_buf), "%d", pid);
+
+	const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
+			       NULL /* timeout_val */, NULL };
+	if (timeout > 0) {
+		/* timeout is unsigned, %d would misprint large values */
+		snprintf(timeout_buf, sizeof(timeout_buf), "%u", timeout);
+		args[5] = "--timeout";
+		args[6] = timeout_buf;
+	}
+
+	return launch_cuda_checkpoint(args, msg_buf, buf_size);
+}
+
+/* Stop the restore thread again after it was resumed by
+ * resume_restore_thread() and put back the signal mask saved there
+ */
+static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset)
+{
+	/* Since we resumed a thread that CRIU previously already froze we need to
+	 * INTERRUPT it once again, task was already SEIZE'd so we don't need to do
+	 * a compel_interrupt_task()
+	 */
+	if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) {
+		pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n",
+		       restore_tid);
+		return -1;
+	}
+
+	struct proc_status_creds creds;
+	if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != COMPEL_TASK_ALIVE) {
+		pr_err("compel_wait_task failed after interrupt\n");
+		return -1;
+	}
+
+	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) {
+		pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid);
+		return -1;
+	}
+
+	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) {
+		pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Let the restore thread run while every signal except SIGTRAP is blocked.
+ * The previous signal mask is stored in save_sigset
+ */
+static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset)
+{
+	k_rtsigset_t block;
+
+	if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) {
+		pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid);
+		return -1;
+	}
+
+	ksigfillset(&block);
+	ksigdelset(&block, SIGTRAP);
+
+	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) {
+		pr_err("Failed to block signals on restore tid %d\n", restore_tid);
+		return -1;
+	}
+
+	// Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread
+	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) {
+		pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid);
+		return -1;
+	}
+
+	if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) {
+		pr_err("Could not resume cuda restore tid %d\n", restore_tid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* CHECKPOINT_DEVICES hook: checkpoint the CUDA state of a frozen pid.
+ * Returns 0 on success or when pid has no restore thread, -1 on failure
+ */
+int cuda_plugin_checkpoint_devices(int pid)
+{
+	int restore_tid;
+	char msg_buf[CUDA_CKPT_BUF_SIZE];
+	int int_ret;
+	int ret = 0;
+	k_rtsigset_t save_sigset;
+
+	if (plugin_disabled) {
+		return 0;
+	}
+
+	restore_tid = get_cuda_restore_tid(pid);
+
+	/* We can possibly hit a race with cuInit() where we are past the point of
+	 * locking the process but at lock time cuInit() hadn't completed in which
+	 * case cuda-checkpoint will report that we're in an invalid state to
+	 * checkpoint
+	 */
+	if (restore_tid == -1) {
+		pr_info("No need to checkpoint devices on pid %d\n", pid);
+		return 0;
+	}
+
+	pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);
+	/* We need to resume the checkpoint thread to prepare the mappings for
+	 * checkpointing
+	 */
+	if (resume_restore_thread(restore_tid, &save_sigset)) {
+		return -1;
+	}
+
+	/* launch_cuda_checkpoint() hands back the utility's exit status, which is
+	 * positive on failure, while collect_pstree() only treats negative hook
+	 * results as errors, so every failure below is reported as -1
+	 */
+	if (cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf))) {
+		pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf);
+		ret = -1;
+		goto interrupt;
+	}
+	if (update_checkpointed_pid(&cuda_pids, pid)) {
+		pr_err("Failed to track checkpointed pid %d\n", pid);
+		/* Stay failed even if the rollback below succeeds */
+		ret = -1;
+		if (cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf))) {
+			pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
+		}
+	}
+interrupt:
+	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
+
+	return ret != 0 ? ret : int_ret;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices)
+
+/* PAUSE_DEVICES hook: lock the CUDA state of a still running pid and track it
+ * so that it can be released when the dump finishes
+ */
+int cuda_plugin_pause_devices(int pid)
+{
+	int restore_tid;
+	char msg_buf[CUDA_CKPT_BUF_SIZE];
+
+	if (plugin_disabled) {
+		return 0;
+	}
+
+	restore_tid = get_cuda_restore_tid(pid);
+
+	if (restore_tid == -1) {
+		pr_info("no need to pause devices on pid %d\n", pid);
+		return 0;
+	}
+
+	pr_info("pausing devices on pid %d\n", pid);
+	int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));
+	if (status) {
+		pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
+		return -1;
+	}
+	if (add_pid_to_buf(&cuda_pids, pid)) {
+		pr_err("unable to track paused pid %d\n", pid);
+		status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
+		if (status) {
+			pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
+		}
+		return -1;
+	}
+
+	return 0;
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)
+
+/* Restore (only if checkpointed is non-zero) and unlock the CUDA state of pid.
+ * Returns 0 on success or when pid has no restore thread, -1 on failure
+ */
+static int resume_device(int pid, int checkpointed)
+{
+	char msg_buf[CUDA_CKPT_BUF_SIZE];
+	int status;
+	int ret = 0;
+	int int_ret;
+	k_rtsigset_t save_sigset;
+
+	int restore_tid = get_cuda_restore_tid(pid);
+	if (restore_tid == -1) {
+		pr_info("No need to resume devices on pid %d\n", pid);
+		return 0;
+	}
+
+	pr_info("resuming devices on pid %d\n", pid);
+	/* The resuming process has to stay frozen during this time otherwise
+	 * attempting to access a UVM pointer will crash if we haven't restored the
+	 * underlying mappings yet
+	 */
+	pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid);
+	/* wakeup the restore thread so we can handle the restore for this pid,
+	 * rseq_cs has to be restored before execution
+	 */
+	if (resume_restore_thread(restore_tid, &save_sigset)) {
+		return -1;
+	}
+
+	if (checkpointed) {
+		status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf));
+		if (status) {
+			pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf);
+			ret = -1;
+			goto interrupt;
+		}
+	}
+
+	status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
+	if (status) {
+		pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf);
+		ret = -1;
+	}
+
+interrupt:
+	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
+
+	return ret != 0 ? ret : int_ret;
+}
+
+/* RESUME_DEVICES_LATE hook: restore and unlock the CUDA state of pid */
+int cuda_plugin_resume_devices_late(int pid)
+{
+	if (plugin_disabled) {
+		return 0;
+	}
+
+	return resume_device(pid, 1);
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
+
+/* Plugin init: disable the plugin when cuda-checkpoint lacks --action support */
+int cuda_plugin_init(int stage)
+{
+	if (!cuda_checkpoint_supports_flag("--action")) {
+		pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
+		plugin_disabled = true;
+		return 0;
+	}
+
+	pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage);
+
+	/* In the DUMP stage track all the PID's we've paused CUDA operations on to
+	 * release them when we're done if the user requested the leave-running option
+	 */
+	if (stage == CR_PLUGIN_STAGE__DUMP) {
+		INIT_LIST_HEAD(&cuda_pids);
+	}
+
+	return 0;
+}
+
+/* Plugin fini: release the paused PIDs if needed and free the tracking list */
+void cuda_plugin_fini(int stage, int ret)
+{
+	if (plugin_disabled) {
+		return;
+	}
+
+	pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret);
+
+	/* Release all the paused PID's at the end of the DUMP stage in case the
+	 * user provides the -R (leave-running) flag or an error occurred
+	 */
+	if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) {
+		struct pid_info *info;
+		list_for_each_entry(info, &cuda_pids, list) {
+			resume_device(info->pid, info->checkpointed);
+		}
+	}
+	if (stage == CR_PLUGIN_STAGE__DUMP) {
+		dealloc_pid_buffer(&cuda_pids);
+	}
+}
+CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini)