Skip to content

Commit

Permalink
images/inventory: add fields for plugins
Browse files Browse the repository at this point in the history
This patch extends the inventory image with a new "plugins" entry that
contains *optional* boolean flags for the AMDGPU and CUDA plugins.

These fields are set when a plugin is initialized, and their values
indicate whether the checkpoint contains GPU state. In other words,
a missing field indicates that the plugin was not initialized during
checkpoint.

During restore, these fields are used to avoid loading unnecessary
plugins or to show an appropriate error message if a required plugin
is missing. If the "plugins" entry is not present in the inventory
image, all CRIU plugins are loaded to preserve backwards compatibility.

It is important to note that these fields are set to `true` only when
the checkpoint contains GPU state. This approach makes it possible to
migrate processes (or containers) from a GPU-enabled system to a system
without a GPU when the GPU is not in use.

Examples:

1. The checkpoint was created without any CRIU plugins:
{
    "magic": "INVENTORY",
    "entries": [
	{
	    "plugins": {}
	}
    ]
}

2. The checkpoint was created with both AMD GPU and CUDA plugins
   installed but does not contain GPU state:

{
    "magic": "INVENTORY",
    "entries": [
        {
            "plugins": {
                "amdgpu": false,
                "cuda": false
            }
        }
    ]
}

3. The checkpoint was created with only the CUDA plugin installed
   and contains GPU state:
{
    "magic": "INVENTORY",
    "entries": [
        {
            "plugins": {
                "cuda": true
            }
        }
    ]
}

Signed-off-by: Radostin Stoyanov <[email protected]>
  • Loading branch information
rst0git committed Sep 27, 2024
1 parent 55c8917 commit ecd6f4e
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 3 deletions.
6 changes: 3 additions & 3 deletions criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -2353,12 +2353,12 @@ int cr_restore_tasks(void)
if (init_service_fd())
return 1;

if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;

if (check_img_inventory(/* restore = */ true) < 0)
goto err;

if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;

if (init_stats(RESTORE_STATS))
goto err;

Expand Down
44 changes: 44 additions & 0 deletions criu/image.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,39 @@ TaskKobjIdsEntry *root_ids;
u32 root_cg_set;
Lsmtype image_lsm;

/*
 * Per-plugin flags of the inventory image. write_img_inventory() attaches
 * this entry to the inventory it writes, and check_img_inventory() copies
 * the fields of a loaded "plugins" entry into it.
 */
static PluginsEntry criu_plugins = PLUGINS_ENTRY__INIT;
/*
 * Set by check_img_inventory(): true when the loaded inventory has a
 * "plugins" entry, false when that entry is missing.
 */
bool enabled_plugins_inventory;

void __attribute__((used)) set_cuda_plugin(void)
{
criu_plugins.has_cuda = true;
}

void __attribute__((used)) set_cuda_plugin_enabled(void)
{
criu_plugins.cuda = true;
}

/* Return true only when the "cuda" inventory field is both present and true. */
bool __attribute__((used)) get_cuda_plugin_enabled(void)
{
	if (!criu_plugins.has_cuda)
		return false;

	return criu_plugins.cuda;
}

void __attribute__((used)) set_amdgpu_plugin(void)
{
criu_plugins.has_amdgpu = true;
}

void __attribute__((used)) set_amdgpu_plugin_enabled(void)
{
criu_plugins.amdgpu = true;
}

/* Return true only when the "amdgpu" inventory field is both present and true. */
bool __attribute__((used)) get_amdgpu_plugin_enabled(void)
{
	if (!criu_plugins.has_amdgpu)
		return false;

	return criu_plugins.amdgpu;
}

int check_img_inventory(bool restore)
{
int ret = -1;
Expand Down Expand Up @@ -99,6 +132,16 @@ int check_img_inventory(bool restore)
} else {
opts.network_lock_method = he->network_lock_method;
}

if (he->plugins == NULL) {
enabled_plugins_inventory = false;
} else {
criu_plugins.has_amdgpu = he->plugins->has_amdgpu;
criu_plugins.amdgpu = he->plugins->amdgpu;
criu_plugins.has_cuda = he->plugins->has_cuda;
criu_plugins.cuda = he->plugins->cuda;
enabled_plugins_inventory = true;
}
}

ret = 0;
Expand All @@ -121,6 +164,7 @@ int write_img_inventory(InventoryEntry *he)
if (!img)
return -1;

he->plugins = &criu_plugins;
ret = pb_write_one(img, he, PB_INVENTORY);

xfree(he->root_ids);
Expand Down
13 changes: 13 additions & 0 deletions criu/include/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,17 @@ extern int read_img_str(struct cr_img *, char **pstr, int size);

extern void close_image(struct cr_img *);

/* For backwards compatibility, if the inventory image does not contain
 * a "plugins" field, we should load all plugins. This flag is true only
 * when the inventory image that was read contains a "plugins" entry. */
extern bool enabled_plugins_inventory;

/* CUDA plugin inventory field: mark it as present, set its value to
 * true, and query whether it is both present and true. */
extern void set_cuda_plugin(void);
extern void set_cuda_plugin_enabled(void);
extern bool get_cuda_plugin_enabled(void);

/* AMD GPU plugin inventory field: mark it as present, set its value to
 * true, and query whether it is both present and true. */
extern void set_amdgpu_plugin(void);
extern void set_amdgpu_plugin_enabled(void);
extern bool get_amdgpu_plugin_enabled(void);


Check warning on line 192 in criu/include/image.h

View workflow job for this annotation

GitHub Actions / build

#endif /* __CR_IMAGE_H__ */
28 changes: 28 additions & 0 deletions criu/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,10 @@ int cr_plugin_init(int stage)
char *path;
size_t i;
DIR *d;
bool cuda_plugin_required = get_cuda_plugin_enabled();
bool cuda_plugin_loaded = false;
bool amdgpu_plugin_required = get_amdgpu_plugin_enabled();
bool amdgpu_plugin_loaded = false;

INIT_LIST_HEAD(&cr_plugin_ctl.head);
for (i = 0; i < ARRAY_SIZE(cr_plugin_ctl.hook_chain); i++)
Expand Down Expand Up @@ -247,6 +251,18 @@ int cr_plugin_init(int stage)
if (len < 3 || strncmp(de->d_name + len - 3, ".so", 3))
continue;

/*
 * On restore from an inventory that carries a "plugins" entry, load the
 * GPU plugins only when the checkpoint requires them, and remember which
 * required plugins were found.
 *
 * strncmp() returns 0 when the names are equal, so its result has to be
 * negated: without the negation the branches are taken for every file
 * name of that length EXCEPT the plugin itself.
 */
if (enabled_plugins_inventory && stage == CR_PLUGIN_STAGE__RESTORE) {
	if (len == 16 && !strncmp(de->d_name, "amdgpu_plugin.so", 16)) {
		if (amdgpu_plugin_required == false)
			continue; /* Skip unnecessary plugin */
		amdgpu_plugin_loaded = true;
	} else if (len == 14 && !strncmp(de->d_name, "cuda_plugin.so", 14)) {
		if (cuda_plugin_required == false)
			continue; /* Skip unnecessary plugin */
		cuda_plugin_loaded = true;
	}
}

if (snprintf(path, sizeof(path), "%s/%s", opts.libdir, de->d_name) >= sizeof(path)) {
pr_err("Unable to build plugin path\n");
goto err;
Expand All @@ -256,6 +272,18 @@ int cr_plugin_init(int stage)
goto err;
}

if (enabled_plugins_inventory && stage == CR_PLUGIN_STAGE__RESTORE) {
if (amdgpu_plugin_required && !amdgpu_plugin_loaded) {
pr_err("AMD GPU plugin is required for restore\n");
goto err;
}

if (cuda_plugin_required && !cuda_plugin_loaded) {
pr_err("CUDA plugin is required for restore\n");
goto err;
}
}

exit_code = 0;
err:
closedir(d);
Expand Down
6 changes: 6 additions & 0 deletions images/inventory.proto
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ enum lsmtype {
APPARMOR = 2;
}

// Per-plugin flags stored in the inventory image. A field is present only
// if the corresponding plugin was initialized during checkpoint, and it is
// true only when the checkpoint contains GPU state for that plugin.
message plugins_entry {
	optional bool amdgpu = 1; // AMD GPU plugin
	optional bool cuda = 2; // CUDA plugin
}

message inventory_entry {
required uint32 img_version = 1;
optional bool fdinfo_per_id = 2;
Expand All @@ -21,4 +26,5 @@ message inventory_entry {
optional uint32 pre_dump_mode = 9;
optional bool tcp_close = 10;
optional uint32 network_lock_method = 11;
optional plugins_entry plugins = 12;
}
6 changes: 6 additions & 0 deletions plugins/amdgpu/amdgpu_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,8 @@ int amdgpu_plugin_init(int stage)
kfd_max_buffer_size = 0;
getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size);

set_amdgpu_plugin();

return 0;
}

Expand Down Expand Up @@ -414,6 +416,10 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
if (ret)
pr_perror("%s(), Can't handle VMAs of input device", __func__);

/* Set the AMD GPU plugin as enabled in the inventory image
* only if the checkpoint contains GPU state (i.e., ret == 0). */
set_amdgpu_plugin_enabled();

return ret;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
Expand Down
6 changes: 6 additions & 0 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,10 @@ int cuda_plugin_checkpoint_devices(int pid)
return 0;
}

/* Set the CUDA plugin as enabled in the inventory image
* only if the checkpoint contains GPU state (i.e., restore_tid != -1). */
set_cuda_plugin_enabled();

pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);
/* We need to resume the checkpoint thread to prepare the mappings for
* checkpointing
Expand Down Expand Up @@ -463,6 +467,8 @@ int cuda_plugin_init(int stage)
{
int ret;

set_cuda_plugin();

if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
plugin_disabled = true;
Expand Down

0 comments on commit ecd6f4e

Please sign in to comment.