criu/plugin: Add NVIDIA CUDA plugin

Adding support for the NVIDIA cuda-checkpoint utility, requires the use of an r555 or higher driver along with the cuda-checkpoint binary. Signed-off-by: Jesus Ramos <[email protected]>
checkpoint-restore · Jun 7, 2024 · c49326c · c49326c
1 parent 69bc3e7
commit c49326c
Show file tree

Hide file tree

Showing 5 changed files with 496 additions and 3 deletions.
diff --git a/Makefile b/Makefile
@@ -308,15 +308,19 @@ clean-amdgpu_plugin:
 	$(Q) $(MAKE) -C plugins/amdgpu clean
 .PHONY: clean-amdgpu_plugin
 
+clean-cuda_plugin: # Clean the CUDA plugin by running the `clean` target of plugins/cuda
+	$(Q) $(MAKE) -C plugins/cuda clean
+.PHONY: clean-cuda_plugin
+
 clean-top:
 	$(Q) $(MAKE) -C Documentation clean
 	$(Q) $(MAKE) $(build)=test/compel clean
 	$(Q) $(RM) .gitid
 .PHONY: clean-top
 
-clean: clean-top clean-amdgpu_plugin
+clean: clean-top clean-amdgpu_plugin clean-cuda_plugin
 
-mrproper-top: clean-top clean-amdgpu_plugin
+mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin
 	$(Q) $(RM) $(CONFIG_HEADER)
 	$(Q) $(RM) $(VERSION_HEADER)
 	$(Q) $(RM) $(COMPEL_VERSION_HEADER)
@@ -348,6 +352,10 @@ amdgpu_plugin: criu
 	$(Q) $(MAKE) -C plugins/amdgpu all
 .PHONY: amdgpu_plugin
 
+cuda_plugin: criu # Build the CUDA plugin (`all` in plugins/cuda) once the `criu` target is done
+	$(Q) $(MAKE) -C plugins/cuda all
+.PHONY: cuda_plugin
+
 crit: lib
 	$(Q) $(MAKE) -C crit
 .PHONY: crit
@@ -434,6 +442,7 @@ help:
 	@echo '      lint            - Run code linters'
 	@echo '      indent          - Indent C code'
 	@echo '      amdgpu_plugin   - Make AMD GPU plugin'
+	@echo '      cuda_plugin     - Make NVIDIA CUDA plugin'
 .PHONY: help
 
 ruff:

diff --git a/Makefile.install b/Makefile.install
@@ -49,12 +49,16 @@ install-amdgpu_plugin: amdgpu_plugin
 	$(Q) $(MAKE) -C plugins/amdgpu install
 .PHONY: install-amdgpu_plugin
 
+install-cuda_plugin: cuda_plugin # Build the plugin first, then run `install` in plugins/cuda
+	$(Q) $(MAKE) -C plugins/cuda install
+.PHONY: install-cuda_plugin
+
 install-compel: $(compel-install-targets)
 	$(Q) $(MAKE) $(build)=compel install
 	$(Q) $(MAKE) $(build)=compel/plugins install
 .PHONY: install-compel
 
-install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ;
+install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ;
 .PHONY: install
 
 uninstall:
@@ -65,4 +69,5 @@ uninstall:
 	$(Q) $(MAKE) $(build)=compel $@
 	$(Q) $(MAKE) $(build)=compel/plugins $@
 	$(Q) $(MAKE) -C plugins/amdgpu $@
+	$(Q) $(MAKE) -C plugins/cuda $@
 .PHONY: uninstall
diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile
@@ -0,0 +1,41 @@
+# Build configuration for the CRIU NVIDIA CUDA plugin.
+PLUGIN_NAME := cuda_plugin
+PLUGIN_SOBJ := cuda_plugin.so
+
+# Everything the default `all` target has to produce.
+DEPS_CUDA := $(PLUGIN_SOBJ)
+
+# Quote-include (#include "...") search paths, relative to this directory.
+# ARCH is not set in this file; it is expected to come from the invoking make.
+PLUGIN_INCLUDE  	:= -iquote../../include
+PLUGIN_INCLUDE  	+= -iquote../../criu/include
+PLUGIN_INCLUDE  	+= -iquote../../criu/arch/$(ARCH)/include/
+PLUGIN_INCLUDE  	+= -iquote../../
+
+# compel helper binary; the build rule runs `$(COMPEL) includes` to obtain
+# additional compiler flags.
+COMPEL := ../../compel/compel-host
+
+# Fall back to gcc only when no compiler was chosen.  An unconditional
+# `CC := gcc` would override a CC exported by the parent make or set in the
+# environment, which breaks cross-compilers and clang builds.  `undefined` is
+# handled as well because make's built-in CC disappears under `make -R`.
+ifneq ($(filter default undefined,$(origin CC)),)
+CC := gcc
+endif
+PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC
+
+# Shared message helpers; presumably the source of msg-clean, $(Q) and $(E)
+# used by the rules in this file -- confirm against msg.mk.
+__nmk_dir ?= ../../scripts/nmk/scripts/
+include $(__nmk_dir)msg.mk
+
+# Default goal: build every artefact listed in DEPS_CUDA.  Declared phony so a
+# stray file named `all` can never make the goal look up to date.
+all: $(DEPS_CUDA)
+.PHONY: all
+
+# Compile and link the plugin shared object in one compiler invocation.  Extra
+# flags are taken from the output of `$(COMPEL) includes`; PLUGIN_LDFLAGS is
+# not set in this file and expands to nothing unless the caller provides it.
+$(PLUGIN_SOBJ): cuda_plugin.c
+	$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $< -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS)
+
+# Remove the built plugin shared object.
+clean:
+	$(call msg-clean, $@)
+	$(Q) $(RM) $(PLUGIN_SOBJ)
+.PHONY: clean
+
+# Nothing beyond `clean` needs removing here.  Declared phony so a stray file
+# named `mrproper` cannot make the target look up to date.
+mrproper: clean
+.PHONY: mrproper
+
+install: # Create $(DESTDIR)$(PLUGINDIR) and copy $(PLUGIN_SOBJ) into it with mode 644; DESTDIR/PLUGINDIR are not set in this file (presumably supplied by the caller)
+	$(Q) mkdir -p $(DESTDIR)$(PLUGINDIR)
+	$(E) "  INSTALL " $(PLUGIN_NAME)
+	$(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR)
+.PHONY: install
+
+uninstall: # Remove the installed $(PLUGIN_SOBJ) from $(DESTDIR)$(PLUGINDIR); the directory itself is left in place
+	$(E) " UNINSTALL" $(PLUGIN_NAME)
+	$(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ)
+.PHONY: uninstall
+
diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md
@@ -0,0 +1,58 @@
+Checkpoint and Restore for CUDA applications with CRIU
+======================================================
+
+# Requirements
+The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555
+or higher GPU driver is required for CUDA CRIU integration support.
+
+## cuda-checkpoint
+The cuda-checkpoint utility can be found at:
+https://github.com/NVIDIA/cuda-checkpoint
+
+cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA
+applications. Updating the cuda-checkpoint utility between driver releases
+should not be necessary as the utility simply exposes some extra driver behavior
+so driver updates are all that's needed to get access to newer features.
+
+# Checkpointing Procedure
+cuda-checkpoint exposes 4 actions used in the checkpointing process: lock,
+checkpoint, restore, unlock.
+
+* lock - Used with the PAUSE_DEVICES hook while a process is still running to
+  quiesce the application into a state where it can be checkpointed
+* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been
+  seized/frozen to perform the actual checkpointing operation
+* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA
+  state and release the process back to its running state
+
+These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA
+plugin will re-wake when needed.
+
+# Known Limitations
+* Currently GPU memory contents are brought into main system memory and CRIU
+  then checkpoints that as part of the normal procedure. On systems with many
+  GPUs with high GPU memory usage this can cause memory thrashing. A future
+  release will add support for dumping the memory contents to files to alleviate
+  this.
+* There's currently a small race when a process calls cuInit() and finishes
+  initializing CUDA after the PAUSE_DEVICES hook has been issued on it but
+  before it is frozen for checkpointing. This will cause cuda-checkpoint to
+  report that the process is in an illegal state for checkpointing. This should
+  be very rare, and it's recommended to just attempt the CRIU procedure again.
+* Applications that use NVML will leave some leftover device references as NVML
+  is not currently supported for checkpointing. There will be support for this
+  in later drivers. A possible temporary workaround is to have the
+  {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N}
+  remaining references for these applications as in most cases NVML is used to
+  get info such as gpu count and some capabilities and these values are never
+  accessed again and unlikely to change.
+* CUDA applications that fork() but don't call exec() but also don't issue any
+  CUDA API calls will have some leftover references to /dev/nvidia* and fail to
+  checkpoint as a result. This can be worked around in a similar fashion to the
+  NVML case where the leftover references can be ignored as CUDA is not fork()
+  safe anyway.
+* Restore currently requires that you restore on a system with similar GPUs and
+  the same GPU count.
+* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process
+  Service) are currently not supported for checkpointing.