From 2eb34b3a43df6cf471773d6d3f644276d5ead6cb Mon Sep 17 00:00:00 2001 From: Tom Dohrmann Date: Fri, 11 Oct 2024 11:29:28 +0200 Subject: [PATCH] qemu-static: add patches for VFIO support on SNP Upstream QEMU can't launch SNP VMs with VFIO devices. Apply the patches from this patch series: https://patchwork.kernel.org/project/kvm/cover/20240725072118.358923-1-chenyi.qiang@intel.com/ We also need another patch to support large devices (such as GPUs). --- ...oduce-an-object-to-manage-the-guest-.patch | 417 ++++++++++++++++++ ...oduce-a-helper-to-notify-the-shared-.patch | 199 +++++++++ ...tate-change-via-RamDiscardManager-he.patch | 51 +++ ...the-RamDiscardManager-instance-upon-.patch | 53 +++ ...ult-to-discarded-private-in-guest_me.patch | 47 ++ ...est_memfd-require-coordinate-discard.patch | 29 ++ ...0-increase-min-granularity-for-memfd.patch | 29 ++ packages/by-name/qemu-static/package.nix | 9 + 8 files changed, 834 insertions(+) create mode 100644 packages/by-name/qemu-static/0004-guest_memfd-Introduce-an-object-to-manage-the-guest-.patch create mode 100644 packages/by-name/qemu-static/0005-guest_memfd-Introduce-a-helper-to-notify-the-shared-.patch create mode 100644 packages/by-name/qemu-static/0006-KVM-Notify-the-state-change-via-RamDiscardManager-he.patch create mode 100644 packages/by-name/qemu-static/0007-memory-Register-the-RamDiscardManager-instance-upon-.patch create mode 100644 packages/by-name/qemu-static/0008-guest-memfd-Default-to-discarded-private-in-guest_me.patch create mode 100644 packages/by-name/qemu-static/0009-RAMBlock-make-guest_memfd-require-coordinate-discard.patch create mode 100644 packages/by-name/qemu-static/0010-increase-min-granularity-for-memfd.patch diff --git a/packages/by-name/qemu-static/0004-guest_memfd-Introduce-an-object-to-manage-the-guest-.patch b/packages/by-name/qemu-static/0004-guest_memfd-Introduce-an-object-to-manage-the-guest-.patch new file mode 100644 index 0000000000..b92f47b2d1 --- /dev/null +++ 
b/packages/by-name/qemu-static/0004-guest_memfd-Introduce-an-object-to-manage-the-guest-.patch @@ -0,0 +1,417 @@ +From 91317d0d1be9474e1b492e75e290322e84e78c3e Mon Sep 17 00:00:00 2001 +From: Chenyi Qiang +Date: Thu, 25 Jul 2024 03:21:10 -0400 +Subject: [PATCH 1/6] guest_memfd: Introduce an object to manage the + guest-memfd with RamDiscardManager + +As the commit 852f0048f3 ("RAMBlock: make guest_memfd require +uncoordinated discard") highlighted, some subsystems like VFIO might +disable ram block discard. However, guest_memfd relies on the discard +operation to perform page conversion between private and shared memory. +This can lead to stale IOMMU mapping issue when assigning a hardware +device to a confidential guest via shared memory (unprotected memory +pages). Blocking shared page discard can solve this problem, but it +could cause guests to consume twice the memory with VFIO, which is not +acceptable in some cases. An alternative solution is to convey other +systems like VFIO to refresh its outdated IOMMU mappings. + +RamDiscardManager is an existing concept (used by virtio-mem) to adjust +VFIO mappings in relation to VM page assignment. Effectively page +conversion is similar to hot-removing a page in one mode and adding it +back in the other, so the similar work that needs to happen in response +to virtio-mem changes needs to happen for page conversion events. +Introduce the RamDiscardManager to guest_memfd to achieve it. + +However, implementing the RamDiscardManager interface poses a challenge +as guest_memfd is not an object, instead, it is contained within RamBlock +and is indicated by a RAM_GUEST_MEMFD flag upon creation. + +One option is to implement the interface in HostMemoryBackend. Any +guest_memfd-backed host memory backend can register itself in the target +MemoryRegion. However, this solution doesn't cover the scenario where a +guest_memfd MemoryRegion doesn't belong to the HostMemoryBackend, e.g. +the virtual BIOS MemoryRegion. 
+ +Thus, implement the second option, which involves defining an object type +named guest_memfd_manager with the RamDiscardManager interface. Upon +creation of guest_memfd, a new guest_memfd_manager object can be +instantiated and registered to the managed guest_memfd MemoryRegion to +handle the page conversion events. + +In the context of guest_memfd, the discarded state signifies that the +page is private, while the populated state indicates that the page is +shared. The state of the memory is tracked at the granularity of the +host page size (i.e. block_size), as the minimum conversion size can be +one page per request. In addition, VFIO expects the DMA mapping for a +specific iova to be mapped and unmapped with the same granularity. +However, there's no guarantee that the confidential guest won't +partially convert the pages. For instance the confidential guest may +flip a 2M page from private to shared and later flip the first 4K +sub-range from shared to private. To prevent such invalid cases, all +operations are performed with a 4K granularity. + +Signed-off-by: Chenyi Qiang +--- + include/sysemu/guest-memfd-manager.h | 46 +++++ + system/guest-memfd-manager.c | 283 +++++++++++++++++++++++++++ + system/meson.build | 1 + + 3 files changed, 330 insertions(+) + create mode 100644 include/sysemu/guest-memfd-manager.h + create mode 100644 system/guest-memfd-manager.c + +diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h +new file mode 100644 +index 0000000000..ab8c2ba362 +--- /dev/null ++++ b/include/sysemu/guest-memfd-manager.h +@@ -0,0 +1,46 @@ ++/* ++ * QEMU guest memfd manager ++ * ++ * Copyright Intel ++ * ++ * Author: ++ * Chenyi Qiang ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. 
++ * See the COPYING file in the top-level directory ++ * ++ */ ++ ++#ifndef SYSEMU_GUEST_MEMFD_MANAGER_H ++#define SYSEMU_GUEST_MEMFD_MANAGER_H ++ ++#include "sysemu/hostmem.h" ++ ++#define TYPE_GUEST_MEMFD_MANAGER "guest-memfd-manager" ++ ++OBJECT_DECLARE_TYPE(GuestMemfdManager, GuestMemfdManagerClass, GUEST_MEMFD_MANAGER) ++ ++struct GuestMemfdManager { ++ Object parent; ++ ++ /* Managed memory region. */ ++ MemoryRegion *mr; ++ ++ /* bitmap used to track discard (private) memory */ ++ int32_t discard_bitmap_size; ++ unsigned long *discard_bitmap; ++ ++ /* block size and alignment */ ++ uint64_t block_size; ++ ++ /* listeners to notify on populate/discard activity. */ ++ QLIST_HEAD(, RamDiscardListener) rdl_list; ++}; ++ ++struct GuestMemfdManagerClass { ++ ObjectClass parent_class; ++ ++ void (*realize)(Object *gmm, MemoryRegion *mr, uint64_t region_size); ++}; ++ ++#endif +diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c +new file mode 100644 +index 0000000000..7b90f26859 +--- /dev/null ++++ b/system/guest-memfd-manager.c +@@ -0,0 +1,283 @@ ++/* ++ * QEMU guest memfd manager ++ * ++ * Copyright Intel ++ * ++ * Author: ++ * Chenyi Qiang ++ * ++ * This work is licensed under the terms of the GNU GPL, version 2 or later. 
++ * See the COPYING file in the top-level directory ++ * ++ */ ++ ++#include "qemu/osdep.h" ++#include "qemu/error-report.h" ++#include "sysemu/guest-memfd-manager.h" ++ ++OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(GuestMemfdManager, ++ guest_memfd_manager, ++ GUEST_MEMFD_MANAGER, ++ OBJECT, ++ { TYPE_RAM_DISCARD_MANAGER }, ++ { }) ++ ++static bool guest_memfd_rdm_is_populated(const RamDiscardManager *rdm, ++ const MemoryRegionSection *section) ++{ ++ const GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); ++ uint64_t first_bit = section->offset_within_region / gmm->block_size; ++ uint64_t last_bit = first_bit + int128_get64(section->size) / gmm->block_size - 1; ++ unsigned long first_discard_bit; ++ ++ first_discard_bit = find_next_bit(gmm->discard_bitmap, last_bit + 1, first_bit); ++ return first_discard_bit > last_bit; ++} ++ ++static bool guest_memfd_rdm_intersect_memory_section(MemoryRegionSection *section, ++ uint64_t offset, uint64_t size) ++{ ++ uint64_t start = MAX(section->offset_within_region, offset); ++ uint64_t end = MIN(section->offset_within_region + int128_get64(section->size), ++ offset + size); ++ if (end <= start) { ++ return false; ++ } ++ ++ section->offset_within_address_space += start - section->offset_within_region; ++ section->offset_within_region = start; ++ section->size = int128_make64(end - start); ++ ++ return true; ++} ++ ++typedef int (*guest_memfd_section_cb)(MemoryRegionSection *s, void *arg); ++ ++static int guest_memfd_notify_populate_cb(MemoryRegionSection *section, void *arg) ++{ ++ RamDiscardListener *rdl = arg; ++ ++ return rdl->notify_populate(rdl, section); ++} ++ ++static int guest_memfd_notify_discard_cb(MemoryRegionSection *section, void *arg) ++{ ++ RamDiscardListener *rdl = arg; ++ ++ rdl->notify_discard(rdl, section); ++ ++ return 0; ++} ++ ++static int guest_memfd_for_each_populated_range(const GuestMemfdManager *gmm, ++ MemoryRegionSection *section, ++ void *arg, ++ guest_memfd_section_cb cb) ++{ ++ unsigned long 
first_zero_bit, last_zero_bit; ++ uint64_t offset, size; ++ int ret = 0; ++ ++ first_zero_bit = section->offset_within_region / gmm->block_size; ++ first_zero_bit = find_next_zero_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, ++ first_zero_bit); ++ ++ while (first_zero_bit < gmm->discard_bitmap_size) { ++ MemoryRegionSection tmp = *section; ++ ++ offset = first_zero_bit * gmm->block_size; ++ last_zero_bit = find_next_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, ++ first_zero_bit + 1) - 1; ++ size = (last_zero_bit - first_zero_bit + 1) * gmm->block_size; ++ ++ if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { ++ break; ++ } ++ ++ ret = cb(&tmp, arg); ++ if (ret) { ++ break; ++ } ++ ++ first_zero_bit = find_next_zero_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, ++ last_zero_bit + 2); ++ } ++ ++ return ret; ++} ++ ++static int guest_memfd_for_each_discarded_range(const GuestMemfdManager *gmm, ++ MemoryRegionSection *section, ++ void *arg, ++ guest_memfd_section_cb cb) ++{ ++ unsigned long first_one_bit, last_one_bit; ++ uint64_t offset, size; ++ int ret = 0; ++ ++ first_one_bit = section->offset_within_region / gmm->block_size; ++ first_one_bit = find_next_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, ++ first_one_bit); ++ ++ while (first_one_bit < gmm->discard_bitmap_size) { ++ MemoryRegionSection tmp = *section; ++ ++ offset = first_one_bit * gmm->block_size; ++ last_one_bit = find_next_zero_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, ++ first_one_bit + 1) - 1; ++ size = (last_one_bit - first_one_bit + 1) * gmm->block_size; ++ ++ if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { ++ break; ++ } ++ ++ ret = cb(&tmp, arg); ++ if (ret) { ++ break; ++ } ++ ++ first_one_bit = find_next_bit(gmm->discard_bitmap, gmm->discard_bitmap_size, ++ last_one_bit + 2); ++ } ++ ++ return ret; ++} ++ ++static uint64_t guest_memfd_rdm_get_min_granularity(const RamDiscardManager *rdm, ++ const MemoryRegion *mr) ++{ ++ 
GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); ++ ++ g_assert(mr == gmm->mr); ++ return gmm->block_size; ++} ++ ++static void guest_memfd_rdm_register_listener(RamDiscardManager *rdm, ++ RamDiscardListener *rdl, ++ MemoryRegionSection *section) ++{ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); ++ int ret; ++ ++ g_assert(section->mr == gmm->mr); ++ rdl->section = memory_region_section_new_copy(section); ++ ++ QLIST_INSERT_HEAD(&gmm->rdl_list, rdl, next); ++ ++ ret = guest_memfd_for_each_populated_range(gmm, section, rdl, ++ guest_memfd_notify_populate_cb); ++ if (ret) { ++ error_report("%s: Failed to register RAM discard listener: %s", __func__, ++ strerror(-ret)); ++ } ++} ++ ++static void guest_memfd_rdm_unregister_listener(RamDiscardManager *rdm, ++ RamDiscardListener *rdl) ++{ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); ++ int ret; ++ ++ g_assert(rdl->section); ++ g_assert(rdl->section->mr == gmm->mr); ++ ++ ret = guest_memfd_for_each_populated_range(gmm, rdl->section, rdl, ++ guest_memfd_notify_discard_cb); ++ if (ret) { ++ error_report("%s: Failed to unregister RAM discard listener: %s", __func__, ++ strerror(-ret)); ++ } ++ ++ memory_region_section_free_copy(rdl->section); ++ rdl->section = NULL; ++ QLIST_REMOVE(rdl, next); ++ ++} ++ ++typedef struct GuestMemfdReplayData { ++ void *fn; ++ void *opaque; ++} GuestMemfdReplayData; ++ ++static int guest_memfd_rdm_replay_populated_cb(MemoryRegionSection *section, void *arg) ++{ ++ struct GuestMemfdReplayData *data = arg; ++ ReplayRamPopulate replay_fn = data->fn; ++ ++ return replay_fn(section, data->opaque); ++} ++ ++static int guest_memfd_rdm_replay_populated(const RamDiscardManager *rdm, ++ MemoryRegionSection *section, ++ ReplayRamPopulate replay_fn, ++ void *opaque) ++{ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); ++ struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; ++ ++ g_assert(section->mr == gmm->mr); ++ return 
guest_memfd_for_each_populated_range(gmm, section, &data, ++ guest_memfd_rdm_replay_populated_cb); ++} ++ ++static int guest_memfd_rdm_replay_discarded_cb(MemoryRegionSection *section, void *arg) ++{ ++ struct GuestMemfdReplayData *data = arg; ++ ReplayRamDiscard replay_fn = data->fn; ++ ++ replay_fn(section, data->opaque); ++ ++ return 0; ++} ++ ++static void guest_memfd_rdm_replay_discarded(const RamDiscardManager *rdm, ++ MemoryRegionSection *section, ++ ReplayRamDiscard replay_fn, ++ void *opaque) ++{ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); ++ struct GuestMemfdReplayData data = { .fn = replay_fn, .opaque = opaque }; ++ ++ g_assert(section->mr == gmm->mr); ++ guest_memfd_for_each_discarded_range(gmm, section, &data, ++ guest_memfd_rdm_replay_discarded_cb); ++} ++ ++static void guest_memfd_manager_realize(Object *obj, MemoryRegion *mr, ++ uint64_t region_size) ++{ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); ++ uint64_t bitmap_size = ROUND_UP(region_size, gmm->block_size) / gmm->block_size; ++ ++ gmm->mr = mr; ++ gmm->discard_bitmap_size = bitmap_size; ++ gmm->discard_bitmap = bitmap_new(bitmap_size); ++} ++ ++static void guest_memfd_manager_init(Object *obj) ++{ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(obj); ++ ++ gmm->block_size = qemu_real_host_page_size(); ++ QLIST_INIT(&gmm->rdl_list); ++} ++ ++static void guest_memfd_manager_finalize(Object *obj) ++{ ++ g_free(GUEST_MEMFD_MANAGER(obj)->discard_bitmap); ++} ++ ++static void guest_memfd_manager_class_init(ObjectClass *oc, void *data) ++{ ++ GuestMemfdManagerClass *gmmc = GUEST_MEMFD_MANAGER_CLASS(oc); ++ RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc); ++ ++ gmmc->realize = guest_memfd_manager_realize; ++ ++ rdmc->get_min_granularity = guest_memfd_rdm_get_min_granularity; ++ rdmc->register_listener = guest_memfd_rdm_register_listener; ++ rdmc->unregister_listener = guest_memfd_rdm_unregister_listener; ++ rdmc->is_populated = guest_memfd_rdm_is_populated; ++ 
rdmc->replay_populated = guest_memfd_rdm_replay_populated; ++ rdmc->replay_discarded = guest_memfd_rdm_replay_discarded; ++} +diff --git a/system/meson.build b/system/meson.build +index a296270cb0..9b96d645ab 100644 +--- a/system/meson.build ++++ b/system/meson.build +@@ -16,6 +16,7 @@ system_ss.add(files( + 'dirtylimit.c', + 'dma-helpers.c', + 'globals.c', ++ 'guest-memfd-manager.c', + 'memory_mapping.c', + 'qdev-monitor.c', + 'qtest.c', +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/0005-guest_memfd-Introduce-a-helper-to-notify-the-shared-.patch b/packages/by-name/qemu-static/0005-guest_memfd-Introduce-a-helper-to-notify-the-shared-.patch new file mode 100644 index 0000000000..ba75c61106 --- /dev/null +++ b/packages/by-name/qemu-static/0005-guest_memfd-Introduce-a-helper-to-notify-the-shared-.patch @@ -0,0 +1,199 @@ +From cfc2bc7492cc3cc1f713dcd7d73c55bc2caac65a Mon Sep 17 00:00:00 2001 +From: Chenyi Qiang +Date: Thu, 25 Jul 2024 03:21:11 -0400 +Subject: [PATCH 2/6] guest_memfd: Introduce a helper to notify the + shared/private state change + +Introduce a helper function within RamDiscardManager to efficiently +notify all registered RamDiscardListeners, including VFIO listeners +about the memory conversion events between shared and private in +guest_memfd. The existing VFIO listener can dynamically DMA map/unmap +the shared pages based on the conversion type: +- For conversions from shared to private, the VFIO system ensures the + discarding of shared mapping from the IOMMU. +- For conversions from private to shared, it triggers the population of + the shared mapping into the IOMMU. + +Additionally, there could be some special conversion requests: +- When a conversion request is made for a page already in the desired + state (either private or shared), the helper simply returns success. 
+- For requests involving a range partially in the desired state, only + the necessary segments are converted, ensuring the entire range + complies with the request efficiently. +- In scenarios where a conversion request is declined by other systems, + such as a failure from VFIO during notify_populate(), the helper will + roll back the request, maintaining consistency. + +Signed-off-by: Chenyi Qiang +--- + include/sysemu/guest-memfd-manager.h | 3 + + system/guest-memfd-manager.c | 141 +++++++++++++++++++++++++++ + 2 files changed, 144 insertions(+) + +diff --git a/include/sysemu/guest-memfd-manager.h b/include/sysemu/guest-memfd-manager.h +index ab8c2ba362..1cce4cde43 100644 +--- a/include/sysemu/guest-memfd-manager.h ++++ b/include/sysemu/guest-memfd-manager.h +@@ -43,4 +43,7 @@ struct GuestMemfdManagerClass { + void (*realize)(Object *gmm, MemoryRegion *mr, uint64_t region_size); + }; + ++int guest_memfd_state_change(GuestMemfdManager *gmm, uint64_t offset, uint64_t size, ++ bool shared_to_private); ++ + #endif +diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c +index 7b90f26859..deb43db90b 100644 +--- a/system/guest-memfd-manager.c ++++ b/system/guest-memfd-manager.c +@@ -243,6 +243,147 @@ static void guest_memfd_rdm_replay_discarded(const RamDiscardManager *rdm, + guest_memfd_rdm_replay_discarded_cb); + } + ++static bool guest_memfd_is_valid_range(GuestMemfdManager *gmm, ++ uint64_t offset, uint64_t size) ++{ ++ MemoryRegion *mr = gmm->mr; ++ ++ g_assert(mr); ++ ++ uint64_t region_size = memory_region_size(mr); ++ if (!QEMU_IS_ALIGNED(offset, gmm->block_size)) { ++ return false; ++ } ++ if (offset + size < offset || !size) { ++ return false; ++ } ++ if (offset >= region_size || offset + size > region_size) { ++ return false; ++ } ++ return true; ++} ++ ++static void guest_memfd_notify_discard(GuestMemfdManager *gmm, ++ uint64_t offset, uint64_t size) ++{ ++ RamDiscardListener *rdl; ++ ++ QLIST_FOREACH(rdl, &gmm->rdl_list, next) { ++ 
MemoryRegionSection tmp = *rdl->section; ++ ++ if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { ++ continue; ++ } ++ ++ guest_memfd_for_each_populated_range(gmm, &tmp, rdl, ++ guest_memfd_notify_discard_cb); ++ } ++} ++ ++ ++static int guest_memfd_notify_populate(GuestMemfdManager *gmm, ++ uint64_t offset, uint64_t size) ++{ ++ RamDiscardListener *rdl, *rdl2; ++ int ret = 0; ++ ++ QLIST_FOREACH(rdl, &gmm->rdl_list, next) { ++ MemoryRegionSection tmp = *rdl->section; ++ ++ if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { ++ continue; ++ } ++ ++ ret = guest_memfd_for_each_discarded_range(gmm, &tmp, rdl, ++ guest_memfd_notify_populate_cb); ++ if (ret) { ++ break; ++ } ++ } ++ ++ if (ret) { ++ /* Notify all already-notified listeners. */ ++ QLIST_FOREACH(rdl2, &gmm->rdl_list, next) { ++ MemoryRegionSection tmp = *rdl2->section; ++ ++ if (rdl2 == rdl) { ++ break; ++ } ++ if (!guest_memfd_rdm_intersect_memory_section(&tmp, offset, size)) { ++ continue; ++ } ++ ++ guest_memfd_for_each_discarded_range(gmm, &tmp, rdl2, ++ guest_memfd_notify_discard_cb); ++ } ++ } ++ return ret; ++} ++ ++static bool guest_memfd_is_range_populated(GuestMemfdManager *gmm, ++ uint64_t offset, uint64_t size) ++{ ++ const unsigned long first_bit = offset / gmm->block_size; ++ const unsigned long last_bit = first_bit + (size / gmm->block_size) - 1; ++ unsigned long found_bit; ++ ++ /* We fake a shorter bitmap to avoid searching too far. */ ++ found_bit = find_next_bit(gmm->discard_bitmap, last_bit + 1, first_bit); ++ return found_bit > last_bit; ++} ++ ++static bool guest_memfd_is_range_discarded(GuestMemfdManager *gmm, ++ uint64_t offset, uint64_t size) ++{ ++ const unsigned long first_bit = offset / gmm->block_size; ++ const unsigned long last_bit = first_bit + (size / gmm->block_size) - 1; ++ unsigned long found_bit; ++ ++ /* We fake a shorter bitmap to avoid searching too far. 
*/ ++ found_bit = find_next_zero_bit(gmm->discard_bitmap, last_bit + 1, first_bit); ++ return found_bit > last_bit; ++} ++ ++int guest_memfd_state_change(GuestMemfdManager *gmm, uint64_t offset, uint64_t size, ++ bool shared_to_private) ++{ ++ int ret = 0; ++ ++ if (!guest_memfd_is_valid_range(gmm, offset, size)) { ++ error_report("%s, invalid range: offset 0x%lx, size 0x%lx", ++ __func__, offset, size); ++ return -1; ++ } ++ ++ if ((shared_to_private && guest_memfd_is_range_discarded(gmm, offset, size)) || ++ (!shared_to_private && guest_memfd_is_range_populated(gmm, offset, size))) { ++ return 0; ++ } ++ ++ if (shared_to_private) { ++ guest_memfd_notify_discard(gmm, offset, size); ++ } else { ++ ret = guest_memfd_notify_populate(gmm, offset, size); ++ } ++ ++ if (!ret) { ++ unsigned long first_bit = offset / gmm->block_size; ++ unsigned long nbits = size / gmm->block_size; ++ ++ g_assert((first_bit + nbits) <= gmm->discard_bitmap_size); ++ ++ if (shared_to_private) { ++ bitmap_set(gmm->discard_bitmap, first_bit, nbits); ++ } else { ++ bitmap_clear(gmm->discard_bitmap, first_bit, nbits); ++ } ++ ++ return 0; ++ } ++ ++ return ret; ++} ++ + static void guest_memfd_manager_realize(Object *obj, MemoryRegion *mr, + uint64_t region_size) + { +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/0006-KVM-Notify-the-state-change-via-RamDiscardManager-he.patch b/packages/by-name/qemu-static/0006-KVM-Notify-the-state-change-via-RamDiscardManager-he.patch new file mode 100644 index 0000000000..b81cceb392 --- /dev/null +++ b/packages/by-name/qemu-static/0006-KVM-Notify-the-state-change-via-RamDiscardManager-he.patch @@ -0,0 +1,51 @@ +From 35a523caf0cdab45782fe1330e991733b8d85635 Mon Sep 17 00:00:00 2001 +From: Chenyi Qiang +Date: Thu, 25 Jul 2024 03:21:12 -0400 +Subject: [PATCH 3/6] KVM: Notify the state change via RamDiscardManager helper + during shared/private conversion + +Once exit to userspace to convert the page from private to shared or +vice versa at runtime, 
notify the state change via the +guest_memfd_state_change() helper so that other registered subsystems +like VFIO can be notified. + +Signed-off-by: Chenyi Qiang +--- + accel/kvm/kvm-all.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c +index acc23092e7..42e05e82c7 100644 +--- a/accel/kvm/kvm-all.c ++++ b/accel/kvm/kvm-all.c +@@ -48,6 +48,7 @@ + #include "kvm-cpus.h" + #include "sysemu/dirtylimit.h" + #include "qemu/range.h" ++#include "sysemu/guest-memfd-manager.h" + + #include "hw/boards.h" + #include "sysemu/stats.h" +@@ -2896,6 +2897,7 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + RAMBlock *rb; + void *addr; + int ret = -1; ++ GuestMemfdManager *gmm; + + trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared"); + +@@ -2958,6 +2960,11 @@ int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private) + addr = memory_region_get_ram_ptr(mr) + section.offset_within_region; + rb = qemu_ram_block_from_host(addr, false, &offset); + ++ gmm = GUEST_MEMFD_MANAGER(mr->rdm); ++ if (gmm) { ++ guest_memfd_state_change(gmm, offset, size, to_private); ++ } ++ + if (to_private) { + if (rb->page_size != qemu_real_host_page_size()) { + /* +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/0007-memory-Register-the-RamDiscardManager-instance-upon-.patch b/packages/by-name/qemu-static/0007-memory-Register-the-RamDiscardManager-instance-upon-.patch new file mode 100644 index 0000000000..882fde28f1 --- /dev/null +++ b/packages/by-name/qemu-static/0007-memory-Register-the-RamDiscardManager-instance-upon-.patch @@ -0,0 +1,53 @@ +From 1edf8d61a13344f820bc7f2d489386061fb560c5 Mon Sep 17 00:00:00 2001 +From: Chenyi Qiang +Date: Thu, 25 Jul 2024 03:21:13 -0400 +Subject: [PATCH 4/6] memory: Register the RamDiscardManager instance upon + guest_memfd creation + +Instantiate a new guest_memfd_manager object and register it in the +target MemoryRegion. 
From this point, other subsystems such as VFIO can +register their listeners in guest_memfd_manager and receive conversion +events through RamDiscardManager. + +Signed-off-by: Chenyi Qiang +--- + system/physmem.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/system/physmem.c b/system/physmem.c +index 94600a33ec..a10f769cb5 100644 +--- a/system/physmem.c ++++ b/system/physmem.c +@@ -53,6 +53,7 @@ + #include "sysemu/hostmem.h" + #include "sysemu/hw_accel.h" + #include "sysemu/xen-mapcache.h" ++#include "sysemu/guest-memfd-manager.h" + #include "trace.h" + + #ifdef CONFIG_FALLOCATE_PUNCH_HOLE +@@ -1899,6 +1900,12 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) + qemu_mutex_unlock_ramlist(); + goto out_free; + } ++ ++ GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(object_new(TYPE_GUEST_MEMFD_MANAGER)); ++ GuestMemfdManagerClass *gmmc = GUEST_MEMFD_MANAGER_GET_CLASS(gmm); ++ g_assert(new_block->mr); ++ gmmc->realize(OBJECT(gmm), new_block->mr, new_block->mr->size); ++ memory_region_set_ram_discard_manager(gmm->mr, RAM_DISCARD_MANAGER(gmm)); + } + + new_ram_size = MAX(old_ram_size, +@@ -2156,6 +2163,8 @@ static void reclaim_ramblock(RAMBlock *block) + + if (block->guest_memfd >= 0) { + close(block->guest_memfd); ++ g_assert(block->mr); ++ object_unref(OBJECT(block->mr->rdm)); + ram_block_discard_require(false); + } + +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/0008-guest-memfd-Default-to-discarded-private-in-guest_me.patch b/packages/by-name/qemu-static/0008-guest-memfd-Default-to-discarded-private-in-guest_me.patch new file mode 100644 index 0000000000..21f67cb61a --- /dev/null +++ b/packages/by-name/qemu-static/0008-guest-memfd-Default-to-discarded-private-in-guest_me.patch @@ -0,0 +1,47 @@ +From 6b692d82049689c95fa3feb6522d847b6738aa3f Mon Sep 17 00:00:00 2001 +From: Chenyi Qiang +Date: Thu, 25 Jul 2024 03:21:14 -0400 +Subject: [PATCH 5/6] guest-memfd: Default to discarded (private) in + guest_memfd_manager + +guest_memfd 
was initially set to shared until the commit bd3bcf6962 +("kvm/memory: Make memory type private by default if it has guest memfd +backend"). To align with this change, the default state in +guest_memfd_manager is set to discarded. + +One concern raised by this commit is the handling of the virtual BIOS. +The virtual BIOS loads its image into the shared memory of guest_memfd. +However, during the region_commit() stage, the memory attribute is +set to private while its shared memory remains valid. This mismatch +persists until the shared content is copied to the private region. +Fortunately, this interval only exists during setup stage and currently, +only the guest_memfd_manager is concerned with the state of the +guest_memfd at that stage. For simplicity, the default bitmap in +guest_memfd_manager is set to discarded (private). This is feasible +because the shared content of the virtual BIOS will eventually be +discarded and there are no requests to DMA access to this shared part +during this period. + +Additionally, setting the default to private can also reduce the +overhead of mapping shared pages into IOMMU by VFIO at the bootup stage. 
+ +Signed-off-by: Chenyi Qiang +--- + system/guest-memfd-manager.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c +index deb43db90b..ad1a46bac4 100644 +--- a/system/guest-memfd-manager.c ++++ b/system/guest-memfd-manager.c +@@ -393,6 +393,7 @@ static void guest_memfd_manager_realize(Object *obj, MemoryRegion *mr, + gmm->mr = mr; + gmm->discard_bitmap_size = bitmap_size; + gmm->discard_bitmap = bitmap_new(bitmap_size); ++ bitmap_fill(gmm->discard_bitmap, bitmap_size); + } + + static void guest_memfd_manager_init(Object *obj) +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/0009-RAMBlock-make-guest_memfd-require-coordinate-discard.patch b/packages/by-name/qemu-static/0009-RAMBlock-make-guest_memfd-require-coordinate-discard.patch new file mode 100644 index 0000000000..2d85485cb1 --- /dev/null +++ b/packages/by-name/qemu-static/0009-RAMBlock-make-guest_memfd-require-coordinate-discard.patch @@ -0,0 +1,29 @@ +From e9ba216a062efcfff2831edb214815cce88c80dd Mon Sep 17 00:00:00 2001 +From: Chenyi Qiang +Date: Thu, 25 Jul 2024 03:21:15 -0400 +Subject: [PATCH 6/6] RAMBlock: make guest_memfd require coordinate discard + +As guest_memfd is now managed by guest_memfd_manager with +RamDiscardManager, only block uncoordinated discard. 
+ +Signed-off-by: Chenyi Qiang +--- + system/physmem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/system/physmem.c b/system/physmem.c +index a10f769cb5..6aae81812e 100644 +--- a/system/physmem.c ++++ b/system/physmem.c +@@ -1886,7 +1886,7 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) + assert(kvm_enabled()); + assert(new_block->guest_memfd < 0); + +- ret = ram_block_discard_require(true); ++ ret = ram_block_coordinated_discard_require(true); + if (ret < 0) { + error_setg_errno(errp, -ret, + "cannot set up private guest memory: discard currently blocked"); +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/0010-increase-min-granularity-for-memfd.patch b/packages/by-name/qemu-static/0010-increase-min-granularity-for-memfd.patch new file mode 100644 index 0000000000..cc09cdf04d --- /dev/null +++ b/packages/by-name/qemu-static/0010-increase-min-granularity-for-memfd.patch @@ -0,0 +1,29 @@ +From 895758b0deedbf8a696c04d9567484e2e535fba2 Mon Sep 17 00:00:00 2001 +From: Tom Dohrmann +Date: Fri, 11 Oct 2024 10:49:27 +0000 +Subject: [PATCH] increase min granularity for memfd + +If the granularity is too small, we run into kvm memslot limits when +mapping devices using VFIO. Increase the limit to reduce the number of +required memslots. +Ideally this should be configurable, but for now, we always use 2 MiB. 
+--- + system/guest-memfd-manager.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/system/guest-memfd-manager.c b/system/guest-memfd-manager.c +index ad1a46bac4..25d0d1701e 100644 +--- a/system/guest-memfd-manager.c ++++ b/system/guest-memfd-manager.c +@@ -149,7 +149,7 @@ static uint64_t guest_memfd_rdm_get_min_granularity(const RamDiscardManager *rdm + GuestMemfdManager *gmm = GUEST_MEMFD_MANAGER(rdm); + + g_assert(mr == gmm->mr); +- return gmm->block_size; ++ return 0x200000; + } + + static void guest_memfd_rdm_register_listener(RamDiscardManager *rdm, +-- +2.34.1 + diff --git a/packages/by-name/qemu-static/package.nix b/packages/by-name/qemu-static/package.nix index ac244f30db..2b5f99e3d8 100644 --- a/packages/by-name/qemu-static/package.nix +++ b/packages/by-name/qemu-static/package.nix @@ -36,5 +36,14 @@ # Fix needed for a behaviour change in Linux 6.11-rc4. # TODO(freax13): Remove this when QEMU 9.1.2 is released. ./0003-accel-kvm-check-for-KVM_CAP_READONLY_MEM-on-VM.patch + # This series allows VFIO to work on SNP. + ./0004-guest_memfd-Introduce-an-object-to-manage-the-guest-.patch + ./0005-guest_memfd-Introduce-a-helper-to-notify-the-shared-.patch + ./0006-KVM-Notify-the-state-change-via-RamDiscardManager-he.patch + ./0007-memory-Register-the-RamDiscardManager-instance-upon-.patch + ./0008-guest-memfd-Default-to-discarded-private-in-guest_me.patch + ./0009-RAMBlock-make-guest_memfd-require-coordinate-discard.patch + # Fix needed to map large devices using VFIO. + ./0010-increase-min-granularity-for-memfd.patch ]; })