Skip to content

Commit

Permalink
fix(example): correctly handle Removed events in uffd exammple
Browse files Browse the repository at this point in the history
The UFFD handler might receive events out of order compared to how they
actually happened. For example, if the guest first frees a page to the
balloon device, and then immediately faults it in again, the UFFD
handler might see the page fault before the freeing. This is a problem,
as any pending `Remove` events in the queue will "block" the userfault
FD (all ioctls return -EAGAIN). Fix this by always draining all events
from the fd's queue, and gracefully handling -EAGAIN. Please see the
code comment for in-depth analysis of the flow.

Fixes firecracker-microvm#4990
Signed-off-by: Patrick Roy <[email protected]>
  • Loading branch information
roypat committed Jan 30, 2025
1 parent 6759168 commit 441a078
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/firecracker/examples/uffd/fault_all_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ fn main() {
userfaultfd::Event::Pagefault { .. } => {
for region in uffd_handler.mem_regions.clone() {
uffd_handler
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
}
}
_ => panic!("Unexpected event on userfaultfd"),
Expand Down
38 changes: 28 additions & 10 deletions src/firecracker/examples/uffd/uffd_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ impl UffdHandler {
}
}

pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
// Find the start of the page that the current faulting address belongs to.
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
let fault_page_addr = dst as u64;
Expand All @@ -133,14 +133,18 @@ impl UffdHandler {
// event was received. This can be a consequence of guest reclaiming back its
// memory from the host (through balloon device)
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
return;
match self.populate_from_file(region, fault_page_addr, len) {
Some((start, end)) => {
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
}
None => return false,
}
return true;
}
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
let (start, end) = self.zero_out(fault_page_addr);
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
return;
return true;
}
None => {}
}
Expand All @@ -152,20 +156,34 @@ impl UffdHandler {
);
}

fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
let offset = dst - region.mapping.base_host_virt_addr;
let src = self.backing_buffer as u64 + region.mapping.offset + offset;

let ret = unsafe {
self.uffd
.copy(src as *const _, dst as *mut _, len, true)
.expect("Uffd copy failed")
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
Ok(value) => value,
// Catch EAGAIN errors, which occur when there is a copy racing with a EVENT_REMOVE.
// We need to defer this page fault until after we handle the remove event, because
// from the guest's PoV they happened in opposite order (e.g. first
// balloon expanded and caused page deallocation, then the guest
// faulted the page back in).
Err(Error::PartiallyCopied(bytes_copied)) if bytes_copied == 0 => return None,
Err(Error::CopyFailed(errno))
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
{
len
}
Err(e) => {
panic!("Uffd copy failed: {e:?}");
}
}
};

// Make sure the UFFD copied some bytes.
assert!(ret > 0);

(dst, dst + len as u64)
Some((dst, dst + len as u64))
}

fn zero_out(&mut self, addr: u64) -> (u64, u64) {
Expand Down
80 changes: 63 additions & 17 deletions src/firecracker/examples/uffd/valid_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,70 @@ fn main() {
let mut runtime = Runtime::new(stream, file);
runtime.install_panic_hook();
runtime.run(|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

// We expect to receive either a Page Fault or Removed
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
// !DISCLAIMER!
// When using UFFD together with the balloon device, this handler needs to deal with Removed
// and Pagefault events. However, since these are induced by different threads over in
// Firecracker-land they might get here in the wrong order: For example, the guest
// kernel might first respond to a balloon inflation by freeing some memory, and
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
// free memory range, which causes a Removed event to be sent to UFFD. Then, the
// guest kernel might immediately fault the page in again, which causes a Pagefault
// event to be sent to UFFD.
//
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
// this handler can receive the Pagefault _before_ the Removed event.
//
// This leads to two problems:
// 1. While a Removed event is pending (e.g. in the fd's queue, but not read yet), all UFFD
// ioctls such as UFFDIO_COPY will return -EAGAIN
// 2. Processing a Pagefault event before a Removed event where the order has been swapped
// as above means that we will fault in a page from the snapshot file, while really we
// should be faulting in a zero page.
//
// Problem 1. is solved fairly easily by simply reading all available events ahead of time
// to unblock the UFFD. Problem 2. we are ignoring in this example handler, to avoid
// complexity (under the assumption that the guest kernel will zero a newly faulted in
// page anyway). A production handler will most likely want to ensure that Removed
// events for a specific range are always handled before Pagefault events.
//
// Lastly, we still need to deal with the race condition where a Removed event arrives
// in the UFFD queue after we got done reading all events, in which case we need to go
// back to reading more events before we can continue processing Pagefaults.

let mut deferred_events = Vec::new();

loop {
// First, try events that we couldn't handle last round
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));

// Read all events from the userfaultfd.
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
events_to_handle.push(event);
}

for event in events_to_handle.drain(..) {
// We expect to receive either a Page Fault or Removed
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
deferred_events.push(event);
}
}
userfaultfd::Event::Remove { start, end } => uffd_handler
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
_ => panic!("Unexpected event on userfaultfd"),
}
}

// We assume that really only the above removed/pagefault interaction can result in
// deferred events. In that scenario, the loop will always terminate (unless
// newly arriving Removed events end up indefinitely blocking it, but there's nothing
// we can do about that, and it's a largely theoretical problem).
if deferred_events.is_empty() {
break;
}
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
start as u64,
end as u64,
MemPageState::Removed,
),
_ => panic!("Unexpected event on userfaultfd"),
}
});
}

0 comments on commit 441a078

Please sign in to comment.