From 55f9a68fcc37513b740c43ad72ed902c401519ea Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 30 Jan 2025 15:47:02 +0000 Subject: [PATCH] fix(example): correctly handle `Removed` events in uffd exammple The crux of the issue was that UFFD gets blocked (all ioctls return -EAGAIN) when there's any `remove` events pending in the queue, which means during processing we not only need to look at the "head" of the queue, but also make sure there's no `remove` events in the "tail". Deal with these scenarios correctly by always greedily reading the entire queue, to ensure there's nothing pending, and only then processing things one-by-one. Please see the new code comments for intricacies with this approach. Fixes #4990 Signed-off-by: Patrick Roy --- .../examples/uffd/fault_all_handler.rs | 2 +- src/firecracker/examples/uffd/uffd_utils.rs | 35 +++++--- .../examples/uffd/valid_handler.rs | 88 +++++++++++++++---- 3 files changed, 97 insertions(+), 28 deletions(-) diff --git a/src/firecracker/examples/uffd/fault_all_handler.rs b/src/firecracker/examples/uffd/fault_all_handler.rs index 5e9f49a3207..cfeaa099236 100644 --- a/src/firecracker/examples/uffd/fault_all_handler.rs +++ b/src/firecracker/examples/uffd/fault_all_handler.rs @@ -36,7 +36,7 @@ fn main() { userfaultfd::Event::Pagefault { .. } => { for region in uffd_handler.mem_regions.clone() { uffd_handler - .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size) + .serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size); } } _ => panic!("Unexpected event on userfaultfd"), diff --git a/src/firecracker/examples/uffd/uffd_utils.rs b/src/firecracker/examples/uffd/uffd_utils.rs index 8cc70ab7c21..a61c18fe506 100644 --- a/src/firecracker/examples/uffd/uffd_utils.rs +++ b/src/firecracker/examples/uffd/uffd_utils.rs @@ -116,7 +116,7 @@ impl UffdHandler { } } - pub fn serve_pf(&mut self, addr: *mut u8, len: usize) { + pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool { // Find the start of the page that the current faulting address belongs to. let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void; let fault_page_addr = dst as u64; @@ -133,14 +133,18 @@ impl UffdHandler { // event was received. This can be a consequence of guest reclaiming back its // memory from the host (through balloon device) Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => { - let (start, end) = self.populate_from_file(region, fault_page_addr, len); - self.update_mem_state_mappings(start, end, MemPageState::FromFile); - return; + match self.populate_from_file(region, fault_page_addr, len) { + Some((start, end)) => { + self.update_mem_state_mappings(start, end, MemPageState::FromFile) + } + None => return false, + } + return true; } Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => { let (start, end) = self.zero_out(fault_page_addr); self.update_mem_state_mappings(start, end, MemPageState::Anonymous); - return; + return true; } None => {} } @@ -152,20 +156,31 @@ impl UffdHandler { ); } - fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) { + fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> { let offset = dst - region.mapping.base_host_virt_addr; let src = self.backing_buffer as u64 + region.mapping.offset + offset; let ret = unsafe { - self.uffd - .copy(src as *const _, dst as *mut _, len, true) - .expect("Uffd copy failed") + match self.uffd.copy(src as *const _, dst as *mut _, len, true) { + Ok(value) => value, + // Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD + // queue while we're processing `pagefault` events. + Err(Error::PartiallyCopied(bytes_copied)) if bytes_copied == 0 => return None, + Err(Error::CopyFailed(errno)) + if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST => + { + len + } + Err(e) => { + panic!("Uffd copy failed: {e:?}"); + } + } }; // Make sure the UFFD copied some bytes. assert!(ret > 0); - (dst, dst + len as u64) + Some((dst, dst + len as u64)) } fn zero_out(&mut self, addr: u64) -> (u64, u64) { diff --git a/src/firecracker/examples/uffd/valid_handler.rs b/src/firecracker/examples/uffd/valid_handler.rs index 6c681d932ac..ecb8c17912c 100644 --- a/src/firecracker/examples/uffd/valid_handler.rs +++ b/src/firecracker/examples/uffd/valid_handler.rs @@ -26,24 +26,78 @@ fn main() { let mut runtime = Runtime::new(stream, file); runtime.install_panic_hook(); runtime.run(|uffd_handler: &mut UffdHandler| { - // Read an event from the userfaultfd. - let event = uffd_handler - .read_event() - .expect("Failed to read uffd_msg") - .expect("uffd_msg not ready"); - - // We expect to receive either a Page Fault or Removed - // event (if the balloon device is enabled). - match event { - userfaultfd::Event::Pagefault { addr, .. } => { - uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) + // !DISCLAIMER! + // When using UFFD together with the balloon device, this handler needs to deal with Removed + // and Pagefault events. There are multiple things to keep in mind in such setups: + // + // As long as any Removed event in pending in the UFFD queue, all ioctls return EAGAIN + // ----------------------------------------------------------------------------------- + // + // This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event + // arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the + // UFFD, and then go back to the process the pre-fetched events. + // + // UFFD might receive events in not in their causal order + // ----------------------------------------------------- + // + // For example, the guest + // kernel might first respond to a balloon inflation by freeing some memory, and + // telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the + // free memory range, which causes a Removed event to be sent to UFFD. Then, the + // guest kernel might immediately fault the page in again (for example because + // default_on_oom was set). which causes a Pagefault event to be sent to UFFD. + // + // However, the pagefault will be triggered from inside KVM on the vCPU thread, while the + // balloon device is handled by Firecracker on its VMM thread. This means that potentially + // this handler can receive the Pagefault _before_ the Removed event. + // + // This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events + // to make sure no `remove` event is blocking us can result in the handler acting on + // the `pagefault` event before the `remove` message (despite the `remove` event being + // in the causal past of the `pagefault` event), which means that we will fault in a page + // from the snapshot file, while really we should be faulting in a zero page. + // + // In this example handler, we ignore this problem, to avoid + // complexity (under the assumption that the guest kernel will zero a newly faulted in + // page anyway). A production handler will most likely want to ensure that Removed + // events for a specific range are always handled before Pagefault events. + // + // Lastly, we still need to deal with the race condition where a Removed event arrives + // in the UFFD queue after we got done reading all events, in which case we need to go + // back to reading more events before we can continue processing Pagefaults. + let mut deferred_events = Vec::new(); + + loop { + // First, try events that we couldn't handle last round + let mut events_to_handle = Vec::from_iter(deferred_events.drain(..)); + + // Read all events from the userfaultfd. + while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") { + events_to_handle.push(event); + } + + for event in events_to_handle.drain(..) { + // We expect to receive either a Page Fault or Removed + // event (if the balloon device is enabled). + match event { + userfaultfd::Event::Pagefault { addr, .. } => { + if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) { + deferred_events.push(event); + } + } + userfaultfd::Event::Remove { start, end } => uffd_handler + .update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed), + _ => panic!("Unexpected event on userfaultfd"), + } + } + + // We assume that really only the above removed/pagefault interaction can result in + // deferred events. In that scenario, the loop will always terminate (unless + // newly arriving Removed events end up indefinitely blocking it, but there's nothing + // we can do about that, and it's a largely theoretical problem). + if deferred_events.is_empty() { + break; } - userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings( - start as u64, - end as u64, - MemPageState::Removed, - ), - _ => panic!("Unexpected event on userfaultfd"), } }); }