Skip to content

Commit

Permalink
images/inventory: add plugins field
Browse files Browse the repository at this point in the history
This patch extends the inventory image with a `plugins` field
that contains an array of names. This field indicates which
plugins were used during checkpoint, for example, to save GPU
state. In particular, the CUDA and AMDGPU plugins are added
to this field only when the checkpoint contains GPU state.
This allows migrating a process that does not use a GPU
from a GPU-enabled system to a CPU-only environment.

During restore, this field is used to disable unnecessary
plugins and show appropriate error messages if required
CRIU plugins are missing.

To preserve backwards compatibility, we use an `optional`
`plugins_entry` that allows us to distinguish between an
empty and unset `plugins` field.

Signed-off-by: Radostin Stoyanov <[email protected]>
  • Loading branch information
rst0git committed Oct 7, 2024
1 parent 56bc739 commit bd51fdb
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 5 deletions.
6 changes: 3 additions & 3 deletions criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -2354,12 +2354,12 @@ int cr_restore_tasks(void)
if (init_service_fd())
return 1;

if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;

if (check_img_inventory(/* restore = */ true) < 0)
goto err;

if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;

if (init_stats(RESTORE_STATS))
goto err;

Expand Down
122 changes: 122 additions & 0 deletions criu/image.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids;
u32 root_cg_set;
Lsmtype image_lsm;

/*
 * One plugin name tracked for the inventory image. Entries are
 * allocated by add_inventory_plugin() and linked into
 * inventory_plugins_list.
 */
struct inventory_plugin {
/* Linkage into inventory_plugins_list. */
struct list_head node;
/* Heap-allocated copy of the plugin name (xstrdup), owned by this entry. */
char *name;
};

/* List of struct inventory_plugin entries, linked through ->node. */
struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list);
/*
 * Number of entries on inventory_plugins_list. The special value -1 is
 * set by check_img_inventory() when the inventory image carries no
 * 'plugins_entry' at all (image written before this field existed).
 */
static int n_inventory_plugins;

int check_img_inventory(bool restore)
{
int ret = -1;
Expand Down Expand Up @@ -99,6 +107,17 @@ int check_img_inventory(bool restore)
} else {
opts.network_lock_method = he->network_lock_method;
}

if (!he->plugins_entry) {
/* Check for backwards compatibility */
n_inventory_plugins = -1;
} else {
PluginsEntry *pe = he->plugins_entry;
for (int i = 0; i < pe->n_plugins; i++) {
if (add_inventory_plugin(pe->plugins[i]))
goto out_err;
}
}
}

ret = 0;
Expand All @@ -110,8 +129,92 @@ int check_img_inventory(bool restore)
return ret;
}

/**
* Check if the 'plugins' field in the inventory image contains
* the specified plugin name. If found, the plugin is removed
* from the linked list.
*
* Return:
* 1 - inventory image contains plugin name
* 0 - inventory image does not contain plugin name
* -1 - 'plugins' field is not set (backwards compatibility)
*/
int inventory_check_and_remove_plugin(const char *name, size_t n)
{
if (n_inventory_plugins == -1)
return -1;

if (n_inventory_plugins > 0) {
struct inventory_plugin *p, *tmp;
list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
if (!strncmp(name, p->name, n)) {
xfree(p->name);
list_del(&p->node);
n_inventory_plugins--;
return 1;
}
}
}

return 0;
}

/**
* We expect during restore all loaded plugins to be removed from
* the inventory_plugins_list. If the list is not empty, show an
* error message for each missing plugin.
*/
int check_inventory_plugins(void)
{
struct inventory_plugin *p;

if (n_inventory_plugins <= 0)
return 0;

list_for_each_entry(p, &inventory_plugins_list, node) {
pr_err("Required plugin is missing: %s\n", p->name);
}

return -1;
}

/**
 * Record a plugin name in inventory_plugins_list.
 *
 * The name is duplicated, so the caller keeps ownership of @name.
 * On success the entry counter n_inventory_plugins is incremented.
 *
 * @name: plugin name to record
 *
 * Return:
 *  0 - entry added
 * -1 - memory allocation failed; the list is left unchanged
 */
int add_inventory_plugin(const char *name)
{
	struct inventory_plugin *entry = xmalloc(sizeof(*entry));

	if (!entry)
		return -1;

	entry->name = xstrdup(name);
	if (entry->name == NULL)
		goto free_entry;

	list_add(&entry->node, &inventory_plugins_list);
	n_inventory_plugins++;

	return 0;

free_entry:
	xfree(entry);
	return -1;
}

/**
 * Release every entry on inventory_plugins_list.
 *
 * Each node is unlinked before it is freed, so the list is left
 * empty and holds no dangling name pointers. The entry counter is
 * decremented once per released node to stay in sync with the list.
 */
void free_inventory_plugins_list(void)
{
	struct inventory_plugin *p, *tmp;

	/* The _safe variant is required because nodes are deleted while iterating. */
	list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) {
		list_del(&p->node);
		xfree(p->name);
		xfree(p);
		n_inventory_plugins--;
	}
}

int write_img_inventory(InventoryEntry *he)
{
PluginsEntry pe = PLUGINS_ENTRY__INIT;
struct cr_img *img;
int ret;

Expand All @@ -121,8 +224,27 @@ int write_img_inventory(InventoryEntry *he)
if (!img)
return -1;

if (!list_empty(&inventory_plugins_list)) {
struct inventory_plugin *p;
int i = 0;

pe.n_plugins = n_inventory_plugins;
pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *));
if (!pe.plugins)
return -1;

list_for_each_entry(p, &inventory_plugins_list, node) {
pe.plugins[i] = p->name;
i++;
}
}
he->plugins_entry = &pe;

ret = pb_write_one(img, he, PB_INVENTORY);

free_inventory_plugins_list();
xfree(pe.plugins);

xfree(he->root_ids);
close_image(img);
if (ret < 0)
Expand Down
5 changes: 5 additions & 0 deletions criu/include/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "bfd.h"
#include "log.h"
#include "common/bug.h"
#include "common/list.h"

#define PAGE_RSS 1
#define PAGE_ANON 2
Expand Down Expand Up @@ -177,4 +178,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size);

extern void close_image(struct cr_img *);

extern int add_inventory_plugin(const char *name);
extern int inventory_check_and_remove_plugin(const char *name, size_t n);
extern int check_inventory_plugins(void);

#endif /* __CR_IMAGE_H__ */
3 changes: 3 additions & 0 deletions criu/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,9 @@ int cr_plugin_init(int stage)
goto err;
}

if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins())
goto err;

exit_code = 0;
err:
closedir(d);
Expand Down
8 changes: 8 additions & 0 deletions images/inventory.proto
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ enum lsmtype {
APPARMOR = 2;
}

// It is not possible to distinguish between an empty repeated field
// and an unset repeated field. To solve this problem and provide backwards
// compatibility, we use the 'plugins_entry' message: the optional
// 'plugins_entry' field of inventory_entry is either present (possibly
// with zero plugin names) or absent.
message plugins_entry {
repeated string plugins = 12;
};

message inventory_entry {
required uint32 img_version = 1;
optional bool fdinfo_per_id = 2;
Expand All @@ -21,4 +28,5 @@ message inventory_entry {
optional uint32 pre_dump_mode = 9;
optional bool tcp_close = 10;
optional uint32 network_lock_method = 11;
optional plugins_entry plugins_entry = 12;
}
32 changes: 32 additions & 0 deletions plugins/amdgpu/amdgpu_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ static LIST_HEAD(update_vma_info_list);

size_t kfd_max_buffer_size;

/* Indicates if the plugin has been added to the inventory image */
bool plugin_added_to_inventory = false;

bool plugin_disabled = false;

/**************************************************************************************************/

/* Call ioctl, restarting if it is interrupted */
Expand Down Expand Up @@ -332,6 +337,13 @@ void getenv_size_t(const char *var, size_t *value)

int amdgpu_plugin_init(int stage)
{
if (stage == CR_PLUGIN_STAGE__RESTORE) {
if (inventory_check_and_remove_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)) != 1) {
plugin_disabled = true;
return 0;
}
}

pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);

topology_init(&src_topology);
Expand Down Expand Up @@ -365,6 +377,9 @@ int amdgpu_plugin_init(int stage)

void amdgpu_plugin_fini(int stage, int ret)
{
if (plugin_disabled)
return;

pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);

if (stage == CR_PLUGIN_STAGE__RESTORE)
Expand Down Expand Up @@ -414,6 +429,14 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
if (ret)
pr_perror("%s(), Can't handle VMAs of input device", __func__);

if (!plugin_added_to_inventory) {
ret = add_inventory_plugin(CR_PLUGIN_DESC.name);
if (ret)
pr_err("Falied to add AMDGPU plugin to inventory image\n");
else
plugin_added_to_inventory = true;
}

return ret;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
Expand Down Expand Up @@ -1540,6 +1563,9 @@ int amdgpu_plugin_restore_file(int id)
size_t img_size;
FILE *img_fp = NULL;

if (plugin_disabled)
return -ENOTSUP;

pr_info("Initialized kfd plugin restorer with ID = %d\n", id);

snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id);
Expand Down Expand Up @@ -1746,6 +1772,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const
char *p_end;
bool is_kfd = false, is_renderD = false;

if (plugin_disabled)
return -ENOTSUP;

plugin_log_msg("Enter %s\n", __func__);

strncpy(path, in_path, sizeof(path));
Expand Down Expand Up @@ -1805,6 +1834,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
struct kfd_ioctl_criu_args args = { 0 };
int fd, exit_code = 0;

if (plugin_disabled)
return -ENOTSUP;

pr_info("Inside %s for target pid = %d\n", __func__, target_pid);

fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
Expand Down
23 changes: 21 additions & 2 deletions plugins/cuda/cuda_plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
*/
bool plugin_disabled = false;

/* Indicates if the plugin has been added to the inventory image */
bool plugin_added_to_inventory = false;

struct pid_info {
int pid;
char checkpointed;
Expand Down Expand Up @@ -319,7 +322,7 @@ int cuda_plugin_checkpoint_devices(int pid)
k_rtsigset_t save_sigset;

if (plugin_disabled) {
return 0;
return -ENOTSUP;
}

restore_tid = get_cuda_restore_tid(pid);
Expand Down Expand Up @@ -354,6 +357,15 @@ int cuda_plugin_checkpoint_devices(int pid)
pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
}
}

if (!plugin_added_to_inventory) {
status = add_inventory_plugin(CR_PLUGIN_DESC.name);
if (status)
pr_err("Falied to add CUDA plugin to inventory image\n");
else
plugin_added_to_inventory = true;
}

interrupt:
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

Expand All @@ -367,7 +379,7 @@ int cuda_plugin_pause_devices(int pid)
char msg_buf[CUDA_CKPT_BUF_SIZE];

if (plugin_disabled) {
return 0;
return -ENOTSUP;
}

restore_tid = get_cuda_restore_tid(pid);
Expand Down Expand Up @@ -463,6 +475,13 @@ int cuda_plugin_init(int stage)
{
int ret;

if (stage == CR_PLUGIN_STAGE__RESTORE) {
if (inventory_check_and_remove_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)) != 1) {
plugin_disabled = true;
return 0;
}
}

if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
plugin_disabled = true;
Expand Down

0 comments on commit bd51fdb

Please sign in to comment.