Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

criu/plugin: Add NVIDIA CUDA plugin #2416

Merged
merged 3 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/
export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS

# Default target
all: flog criu lib crit
all: flog criu lib crit cuda_plugin
.PHONY: all

#
Expand Down Expand Up @@ -308,15 +308,19 @@ clean-amdgpu_plugin:
$(Q) $(MAKE) -C plugins/amdgpu clean
.PHONY: clean-amdgpu_plugin

clean-cuda_plugin:
$(Q) $(MAKE) -C plugins/cuda clean
.PHONY: clean-cuda_plugin

clean-top:
$(Q) $(MAKE) -C Documentation clean
$(Q) $(MAKE) $(build)=test/compel clean
$(Q) $(RM) .gitid
.PHONY: clean-top

clean: clean-top clean-amdgpu_plugin
clean: clean-top clean-amdgpu_plugin clean-cuda_plugin

mrproper-top: clean-top clean-amdgpu_plugin
mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin
$(Q) $(RM) $(CONFIG_HEADER)
$(Q) $(RM) $(VERSION_HEADER)
$(Q) $(RM) $(COMPEL_VERSION_HEADER)
Expand Down Expand Up @@ -348,6 +352,10 @@ amdgpu_plugin: criu
$(Q) $(MAKE) -C plugins/amdgpu all
.PHONY: amdgpu_plugin

cuda_plugin: criu
jesus-ramos marked this conversation as resolved.
Show resolved Hide resolved
$(Q) $(MAKE) -C plugins/cuda all
.PHONY: cuda_plugin

crit: lib
$(Q) $(MAKE) -C crit
.PHONY: crit
Expand Down Expand Up @@ -434,6 +442,7 @@ help:
@echo ' lint - Run code linters'
@echo ' indent - Indent C code'
@echo ' amdgpu_plugin - Make AMD GPU plugin'
@echo ' cuda_plugin - Make NVIDIA CUDA plugin'
.PHONY: help

ruff:
Expand Down
7 changes: 6 additions & 1 deletion Makefile.install
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,16 @@ install-amdgpu_plugin: amdgpu_plugin
$(Q) $(MAKE) -C plugins/amdgpu install
.PHONY: install-amdgpu_plugin

install-cuda_plugin: cuda_plugin
$(Q) $(MAKE) -C plugins/cuda install
.PHONY: install-cuda_plugin

install-compel: $(compel-install-targets)
$(Q) $(MAKE) $(build)=compel install
$(Q) $(MAKE) $(build)=compel/plugins install
.PHONY: install-compel

install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ;
install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ;
.PHONY: install

uninstall:
Expand All @@ -65,4 +69,5 @@ uninstall:
$(Q) $(MAKE) $(build)=compel $@
$(Q) $(MAKE) $(build)=compel/plugins $@
$(Q) $(MAKE) -C plugins/amdgpu $@
$(Q) $(MAKE) -C plugins/cuda $@
.PHONY: uninstall
4 changes: 3 additions & 1 deletion criu/cr-dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -2035,7 +2035,6 @@ static int cr_dump_finish(int ret)
if (bfd_flush_images())
ret = -1;

cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);
cgp_fini();

if (!ret) {
Expand Down Expand Up @@ -2089,6 +2088,9 @@ static int cr_dump_finish(int ret)

if (arch_set_thread_regs(root_item, true) < 0)
return -1;

cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);

pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state);
timing_stop(TIME_FROZEN);
free_pstree(root_item);
Expand Down
9 changes: 5 additions & 4 deletions criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -2224,6 +2224,11 @@ static int restore_root_task(struct pstree_item *init)
}

finalize_restore();

/* just before releasing threads we have to restore rseq_cs */
if (restore_rseq_cs())
pr_err("Unable to restore rseq_cs state\n");

/*
* Some external devices such as GPUs might need a very late
* trigger to kick-off some events, memory notifiers and for
Expand Down Expand Up @@ -2255,10 +2260,6 @@ static int restore_root_task(struct pstree_item *init)
if (restore_freezer_state())
pr_err("Unable to restore freezer state\n");

/* just before releasing threads we have to restore rseq_cs */
if (restore_rseq_cs())
pr_err("Unable to restore rseq_cs state\n");

/* Detaches from processes and they continue run through sigreturn. */
if (finalize_restore_detach())
goto out_kill_network_unlocked;
Expand Down
6 changes: 6 additions & 0 deletions criu/include/criu-plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ enum {

CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9,

CR_PLUGIN_HOOK__PAUSE_DEVICES = 10,

CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,

CR_PLUGIN_HOOK__MAX
};

Expand All @@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr,
const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);

enum {
CR_PLUGIN_STAGE__DUMP,
Expand Down
2 changes: 2 additions & 0 deletions criu/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma");
__assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map");
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");

#undef __assign_hook

Expand Down
18 changes: 18 additions & 0 deletions criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "pstree.h"
#include "criu-log.h"
#include <compel/ptrace.h>
#include "plugin.h"
#include "proc_parse.h"
#include "seccomp.h"
#include "seize.h"
Expand Down Expand Up @@ -637,6 +638,11 @@ static int collect_children(struct pstree_item *item)
goto free;
}

ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto free;
}

if (!opts.freeze_cgroup)
/* fails when meets a zombie */
__ignore_value(compel_interrupt_task(pid));
Expand Down Expand Up @@ -966,6 +972,7 @@ int collect_pstree(void)
pid_t pid = root_item->pid->real;
int ret = -1;
struct proc_status_creds creds;
struct pstree_item *iter;

timing_start(TIME_FREEZING);

Expand All @@ -984,6 +991,11 @@ int collect_pstree(void)
if (opts.freeze_cgroup && freeze_processes())
goto err;

ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto err;
}

if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
set_cr_errno(ESRCH);
goto err;
Expand Down Expand Up @@ -1017,6 +1029,12 @@ int collect_pstree(void)
goto err;
}

for_each_pstree_item(iter) {
ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real);
if (ret < 0 && ret != -ENOTSUP)
goto err;
}

ret = 0;
timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN);
Expand Down
42 changes: 42 additions & 0 deletions plugins/cuda/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Build, install and clean rules for the CRIU NVIDIA CUDA plugin.
PLUGIN_NAME := cuda_plugin
PLUGIN_SOBJ := cuda_plugin.so

# Everything the default target has to produce.
DEPS_CUDA := $(PLUGIN_SOBJ)

# Header search paths, relative to this directory.
PLUGIN_INCLUDE := -iquote../../include
PLUGIN_INCLUDE += -iquote../../criu/include
PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/
PLUGIN_INCLUDE += -iquote../../

# Host compel binary; "$(COMPEL) includes" is expanded into the compile line.
COMPEL := ../../compel/compel-host

CC := gcc
PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC

# NOTE(review): msg.mk presumably provides Q, E, msg-gen and msg-clean used
# below -- confirm against scripts/nmk.
__nmk_dir ?= ../../scripts/nmk/scripts/
include $(__nmk_dir)msg.mk

all: $(DEPS_CUDA)
.PHONY: all

# The plugin is a single translation unit compiled straight into a shared object.
cuda_plugin.so: cuda_plugin.c
	$(call msg-gen, $@)
	$(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS)

clean:
	$(call msg-clean, $@)
	$(Q) $(RM) $(PLUGIN_SOBJ)
.PHONY: clean

# Nothing beyond the shared object is generated, so mrproper is just clean.
mrproper: clean
.PHONY: mrproper

install:
	$(Q) mkdir -p $(DESTDIR)$(PLUGINDIR)
	$(E) "  INSTALL " $(PLUGIN_NAME)
	$(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR)
.PHONY: install

uninstall:
	$(E) " UNINSTALL" $(PLUGIN_NAME)
	$(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ)
.PHONY: uninstall

59 changes: 59 additions & 0 deletions plugins/cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
Checkpoint and Restore for CUDA applications with CRIU
======================================================

# Requirements
The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555
or higher GPU driver is required for CUDA CRIU integration support.

## cuda-checkpoint
The cuda-checkpoint utility can be found at:
https://github.com/NVIDIA/cuda-checkpoint

cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA
applications. Updating the cuda-checkpoint utility between driver releases
should not be necessary as the utility simply exposes some extra driver behavior
so driver updates are all that's needed to get access to newer features.

# Checkpointing Procedure
cuda-checkpoint exposes 4 actions used in the checkpointing process: lock,
checkpoint, restore, unlock.

* lock - Used with the PAUSE_DEVICES hook while a process is still running to
quiesce the application into a state where it can be checkpointed
* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been
seized/frozen to perform the actual checkpointing operation
* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA
state and release the process back to its running state

These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA
plugin will re-wake when needed.

# Known Limitations
* Currently GPU memory contents are brought into main system memory and CRIU
then checkpoints that as part of the normal procedure. On systems with many
GPUs with high GPU memory usage this can cause memory thrashing. A future
CUDA release will add support for dumping the memory contents to files to
alleviate this as well as support in the CRIU plugin.
* There's currently a small race: if a process calls cuInit() and finishes
initializing CUDA after the PAUSE_DEVICES hook has been issued on it but
before the process is frozen for checkpointing, cuda-checkpoint will report
that the process is in an illegal state for checkpointing. This should be
very rare; if it happens, the recommendation is to simply attempt the CRIU
procedure again.
rst0git marked this conversation as resolved.
Show resolved Hide resolved
* Applications that use NVML will leave some leftover device references as NVML
is not currently supported for checkpointing. There will be support for this
in later drivers. A possible temporary workaround is to have the
{DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N}
remaining references for these applications as in most cases NVML is used to
get info such as gpu count and some capabilities and these values are never
accessed again and unlikely to change.
* CUDA applications that fork() but don't call exec() but also don't issue any
CUDA API calls will have some leftover references to /dev/nvidia* and fail to
checkpoint as a result. This can be worked around in a similar fashion to the
NVML case where the leftover references can be ignored as CUDA is not fork()
safe anyway.
* Restore currently requires that you restore on a system with similar GPUs and
the same GPU count.
* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process
Service) are currently not supported for checkpointing. Future CUDA releases
will add support for these.
Loading