Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix UFFD handler never receiving remove messages when balloon device inflates #5021

Merged
merged 4 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ and this project adheres to
- [#5007](https://github.com/firecracker-microvm/firecracker/pull/5007): Fixed
watchdog softlockup warning on x86_64 guests when a vCPU is paused during GDB
debugging.
- [#5021](https://github.com/firecracker-microvm/firecracker/pull/5021) If a
balloon device is inflated post UFFD-backed snapshot restore, Firecracker now
causes `remove` UFFD messages to be sent to the UFFD handler. Previously, no
such message would be sent.

## [1.10.1]

Expand Down
2 changes: 1 addition & 1 deletion src/firecracker/examples/uffd/fault_all_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ fn main() {
userfaultfd::Event::Pagefault { .. } => {
for region in uffd_handler.mem_regions.clone() {
uffd_handler
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
roypat marked this conversation as resolved.
Show resolved Hide resolved
}
}
_ => panic!("Unexpected event on userfaultfd"),
Expand Down
43 changes: 33 additions & 10 deletions src/firecracker/examples/uffd/uffd_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ impl UffdHandler {
}
}

pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
// Find the start of the page that the current faulting address belongs to.
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
let fault_page_addr = dst as u64;
Expand All @@ -133,14 +133,18 @@ impl UffdHandler {
// event was received. This can be a consequence of guest reclaiming back its
// memory from the host (through balloon device)
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
return;
match self.populate_from_file(region, fault_page_addr, len) {
Some((start, end)) => {
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
}
None => return false,
}
return true;
}
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
let (start, end) = self.zero_out(fault_page_addr);
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
return;
return true;
}
None => {}
}
Expand All @@ -152,20 +156,39 @@ impl UffdHandler {
);
}

fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
let offset = dst - region.mapping.base_host_virt_addr;
let src = self.backing_buffer as u64 + region.mapping.offset + offset;

let ret = unsafe {
self.uffd
.copy(src as *const _, dst as *mut _, len, true)
.expect("Uffd copy failed")
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
Ok(value) => value,
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
// queue while we're processing `pagefault` events.
// The weird cast is because the `bytes_copied` field is based on the
// `uffdio_copy->copy` field, which is a signed 64 bit integer, and if something
// goes wrong, it gets set to a -errno code. However, uffd-rs always casts this
// value to an unsigned `usize`, which scrambled the errno.
Err(Error::PartiallyCopied(bytes_copied))
if bytes_copied == 0 || bytes_copied == (-libc::EAGAIN) as usize =>
roypat marked this conversation as resolved.
Show resolved Hide resolved
{
return None
}
Err(Error::CopyFailed(errno))
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
{
len
}
Err(e) => {
panic!("Uffd copy failed: {e:?}");
}
}
};

// Make sure the UFFD copied some bytes.
assert!(ret > 0);

(dst, dst + len as u64)
Some((dst, dst + len as u64))
}

fn zero_out(&mut self, addr: u64) -> (u64, u64) {
Expand Down
89 changes: 72 additions & 17 deletions src/firecracker/examples/uffd/valid_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,79 @@ fn main() {
let mut runtime = Runtime::new(stream, file);
runtime.install_panic_hook();
runtime.run(|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

// We expect to receive either a Page Fault or Removed
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
// !DISCLAIMER!
// When using UFFD together with the balloon device, this handler needs to deal with
// `remove` and `pagefault` events. There are multiple things to keep in mind in
// such setups:
//
// As long as any `remove` event is pending in the UFFD queue, all ioctls return EAGAIN
// -----------------------------------------------------------------------------------
//
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
// arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
// UFFD, and then go back to the process the pre-fetched events.
//
// UFFD might receive events in not in their causal order
// -----------------------------------------------------
//
// For example, the guest
// kernel might first respond to a balloon inflation by freeing some memory, and
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
// free memory range, which causes a `remove` event to be sent to UFFD. Then, the
// guest kernel might immediately fault the page in again (for example because
// default_on_oom was set). which causes a `pagefault` event to be sent to UFFD.
//
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
// this handler can receive the `pagefault` _before_ the `remove` event.
//
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
// to make sure no `remove` event is blocking us can result in the handler acting on
// the `pagefault` event before the `remove` message (despite the `remove` event being
// in the causal past of the `pagefault` event), which means that we will fault in a page
// from the snapshot file, while really we should be faulting in a zero page.
//
// In this example handler, we ignore this problem, to avoid
// complexity (under the assumption that the guest kernel will zero a newly faulted in
// page anyway). A production handler will most likely want to ensure that `remove`
// events for a specific range are always handled before `pagefault` events.
//
// Lastly, we still need to deal with the race condition where a `remove` event arrives
// in the UFFD queue after we got done reading all events, in which case we need to go
// back to reading more events before we can continue processing `pagefault`s.
let mut deferred_events = Vec::new();

loop {
// First, try events that we couldn't handle last round
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));

// Read all events from the userfaultfd.
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
events_to_handle.push(event);
}

for event in events_to_handle.drain(..) {
// We expect to receive either a Page Fault or `remove`
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
deferred_events.push(event);
}
}
userfaultfd::Event::Remove { start, end } => uffd_handler
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
_ => panic!("Unexpected event on userfaultfd"),
}
}

// We assume that really only the above removed/pagefault interaction can result in
// deferred events. In that scenario, the loop will always terminate (unless
// newly arriving `remove` events end up indefinitely blocking it, but there's nothing
// we can do about that, and it's a largely theoretical problem).
if deferred_events.is_empty() {
break;
}
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
start as u64,
end as u64,
MemPageState::Removed,
),
_ => panic!("Unexpected event on userfaultfd"),
}
});
}
1 change: 1 addition & 0 deletions src/vmm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ pub fn build_microvm_from_snapshot(
resource_allocator: &mut vmm.resource_allocator,
vm_resources,
instance_id: &instance_info.id,
restored_from_file: vmm.uffd.is_none(),
};

vmm.mmio_device_manager =
Expand Down
7 changes: 6 additions & 1 deletion src/vmm/src/device_manager/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ pub struct MMIODevManagerConstructorArgs<'a> {
pub resource_allocator: &'a mut ResourceAllocator,
pub vm_resources: &'a mut VmResources,
pub instance_id: &'a str,
pub restored_from_file: bool,
}
impl fmt::Debug for MMIODevManagerConstructorArgs<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
Expand Down Expand Up @@ -512,7 +513,10 @@ impl<'a> Persist<'a> for MMIODeviceManager {

if let Some(balloon_state) = &state.balloon_device {
let device = Arc::new(Mutex::new(Balloon::restore(
BalloonConstructorArgs { mem: mem.clone() },
BalloonConstructorArgs {
mem: mem.clone(),
restored_from_file: constructor_args.restored_from_file,
},
&balloon_state.device_state,
)?));

Expand Down Expand Up @@ -807,6 +811,7 @@ mod tests {
resource_allocator: &mut resource_allocator,
vm_resources,
instance_id: "microvm-id",
restored_from_file: true,
};
let restored_dev_manager =
MMIODeviceManager::restore(restore_args, &device_states).unwrap();
Expand Down
10 changes: 5 additions & 5 deletions src/vmm/src/devices/virtio/balloon/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@
pub(crate) irq_trigger: IrqTrigger,

// Implementation specific fields.
pub(crate) restored: bool,
pub(crate) restored_from_file: bool,
pub(crate) stats_polling_interval_s: u16,
pub(crate) stats_timer: TimerFd,
// The index of the previous stats descriptor is saved because
Expand All @@ -189,7 +189,7 @@
.field("queue_evts", &self.queue_evts)
.field("device_state", &self.device_state)
.field("irq_trigger", &self.irq_trigger)
.field("restored", &self.restored)
.field("restored_from_file", &self.restored_from_file)

Check warning on line 192 in src/vmm/src/devices/virtio/balloon/device.rs

View check run for this annotation

Codecov / codecov/patch

src/vmm/src/devices/virtio/balloon/device.rs#L192

Added line #L192 was not covered by tests
.field("stats_polling_interval_s", &self.stats_polling_interval_s)
.field("stats_desc_index", &self.stats_desc_index)
.field("latest_stats", &self.latest_stats)
Expand All @@ -204,7 +204,7 @@
amount_mib: u32,
deflate_on_oom: bool,
stats_polling_interval_s: u16,
restored: bool,
restored_from_file: bool,
) -> Result<Balloon, BalloonError> {
let mut avail_features = 1u64 << VIRTIO_F_VERSION_1;

Expand Down Expand Up @@ -245,7 +245,7 @@
irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?,
device_state: DeviceState::Inactive,
activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?,
restored,
restored_from_file,
stats_polling_interval_s,
stats_timer,
stats_desc_index: None,
Expand Down Expand Up @@ -355,7 +355,7 @@
if let Err(err) = remove_range(
mem,
(guest_addr, u64::from(range_len) << VIRTIO_BALLOON_PFN_SHIFT),
self.restored,
self.restored_from_file,
) {
error!("Error removing memory range: {:?}", err);
}
Expand Down
15 changes: 12 additions & 3 deletions src/vmm/src/devices/virtio/balloon/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ pub struct BalloonState {
pub struct BalloonConstructorArgs {
/// Pointer to guest memory.
pub mem: GuestMemoryMmap,
pub restored_from_file: bool,
}

impl Persist<'_> for Balloon {
Expand All @@ -121,7 +122,12 @@ impl Persist<'_> for Balloon {
) -> Result<Self, Self::Error> {
// We can safely create the balloon with arbitrary flags and
// num_pages because we will overwrite them after.
let mut balloon = Balloon::new(0, false, state.stats_polling_interval_s, true)?;
let mut balloon = Balloon::new(
0,
false,
state.stats_polling_interval_s,
constructor_args.restored_from_file,
)?;

let mut num_queues = BALLOON_NUM_QUEUES;
// As per the virtio 1.1 specification, the statistics queue
Expand Down Expand Up @@ -192,13 +198,16 @@ mod tests {

// Deserialize and restore the balloon device.
let restored_balloon = Balloon::restore(
BalloonConstructorArgs { mem: guest_mem },
BalloonConstructorArgs {
mem: guest_mem,
restored_from_file: true,
},
&Snapshot::deserialize(&mut mem.as_slice()).unwrap(),
)
.unwrap();

assert_eq!(restored_balloon.device_type(), TYPE_BALLOON);
assert!(restored_balloon.restored);
assert!(restored_balloon.restored_from_file);

assert_eq!(restored_balloon.acked_features, balloon.acked_features);
assert_eq!(restored_balloon.avail_features, balloon.avail_features);
Expand Down
4 changes: 2 additions & 2 deletions src/vmm/src/devices/virtio/balloon/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> {
pub(crate) fn remove_range(
guest_memory: &GuestMemoryMmap,
range: (GuestAddress, u64),
restored: bool,
restored_from_file: bool,
) -> Result<(), RemoveRegionError> {
let (guest_address, range_len) = range;

Expand All @@ -83,7 +83,7 @@ pub(crate) fn remove_range(
// Mmap a new anonymous region over the present one in order to create a hole.
// This workaround is (only) needed after resuming from a snapshot because the guest memory
// is mmaped from file as private and there is no `madvise` flag that works for this case.
if restored {
if restored_from_file {
// SAFETY: The address and length are known to be valid.
let ret = unsafe {
libc::mmap(
Expand Down
2 changes: 0 additions & 2 deletions src/vmm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,6 @@ pub struct Vmm {
vm: Vm,
guest_memory: GuestMemoryMmap,
// Save UFFD in order to keep it open in the Firecracker process, as well.
// Since this field is never read again, we need to allow `dead_code`.
#[allow(dead_code)]
uffd: Option<Uffd>,
vcpus_handles: Vec<VcpuHandle>,
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.
Expand Down
4 changes: 4 additions & 0 deletions tests/integration_tests/functional/test_uffd.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,14 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths):
# Inflate balloon.
vm.api.balloon.patch(amount_mib=200)

# Verify if the restored guest works.
vm.ssh.check_output("true")

# Deflate balloon.
vm.api.balloon.patch(amount_mib=0)

# Verify if the restored guest works.
vm.ssh.check_output("true")


def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths):
Expand Down