Skip to content

Commit

Permalink
fix(example): correctly handle Removed events in uffd exammple
Browse files Browse the repository at this point in the history
The crux of the issue was that UFFD gets blocked (all ioctls return
-EAGAIN) when there's any `remove` events pending in the queue, which
means during processing we not only need to look at the "head" of the
queue, but also make sure there's no `remove` events in the "tail".

Deal with these scenarios correctly by always greedily reading the
entire queue, to ensure there's nothing pending, and only then
processing things one-by-one. Please see the new code comments for
intricacies with this approach.

Fixes firecracker-microvm#4990
Signed-off-by: Patrick Roy <[email protected]>
  • Loading branch information
roypat committed Jan 30, 2025
1 parent 6759168 commit 55f9a68
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/firecracker/examples/uffd/fault_all_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ fn main() {
userfaultfd::Event::Pagefault { .. } => {
for region in uffd_handler.mem_regions.clone() {
uffd_handler
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size)
.serve_pf(region.mapping.base_host_virt_addr as _, region.mapping.size);
}
}
_ => panic!("Unexpected event on userfaultfd"),
Expand Down
35 changes: 25 additions & 10 deletions src/firecracker/examples/uffd/uffd_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ impl UffdHandler {
}
}

pub fn serve_pf(&mut self, addr: *mut u8, len: usize) {
pub fn serve_pf(&mut self, addr: *mut u8, len: usize) -> bool {
// Find the start of the page that the current faulting address belongs to.
let dst = (addr as usize & !(self.page_size - 1)) as *mut libc::c_void;
let fault_page_addr = dst as u64;
Expand All @@ -133,14 +133,18 @@ impl UffdHandler {
// event was received. This can be a consequence of guest reclaiming back its
// memory from the host (through balloon device)
Some(MemPageState::Uninitialized) | Some(MemPageState::FromFile) => {
let (start, end) = self.populate_from_file(region, fault_page_addr, len);
self.update_mem_state_mappings(start, end, MemPageState::FromFile);
return;
match self.populate_from_file(region, fault_page_addr, len) {
Some((start, end)) => {
self.update_mem_state_mappings(start, end, MemPageState::FromFile)
}
None => return false,
}
return true;
}
Some(MemPageState::Removed) | Some(MemPageState::Anonymous) => {
let (start, end) = self.zero_out(fault_page_addr);
self.update_mem_state_mappings(start, end, MemPageState::Anonymous);
return;
return true;
}
None => {}
}
Expand All @@ -152,20 +156,31 @@ impl UffdHandler {
);
}

fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> (u64, u64) {
fn populate_from_file(&self, region: &MemRegion, dst: u64, len: usize) -> Option<(u64, u64)> {
let offset = dst - region.mapping.base_host_virt_addr;
let src = self.backing_buffer as u64 + region.mapping.offset + offset;

let ret = unsafe {
self.uffd
.copy(src as *const _, dst as *mut _, len, true)
.expect("Uffd copy failed")
match self.uffd.copy(src as *const _, dst as *mut _, len, true) {
Ok(value) => value,
// Catch EAGAIN errors, which occur when a `remove` event lands in the UFFD
// queue while we're processing `pagefault` events.
Err(Error::PartiallyCopied(bytes_copied)) if bytes_copied == 0 => return None,
Err(Error::CopyFailed(errno))
if std::io::Error::from(errno).raw_os_error().unwrap() == libc::EEXIST =>
{
len
}
Err(e) => {
panic!("Uffd copy failed: {e:?}");
}
}
};

// Make sure the UFFD copied some bytes.
assert!(ret > 0);

(dst, dst + len as u64)
Some((dst, dst + len as u64))
}

fn zero_out(&mut self, addr: u64) -> (u64, u64) {
Expand Down
88 changes: 71 additions & 17 deletions src/firecracker/examples/uffd/valid_handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,78 @@ fn main() {
let mut runtime = Runtime::new(stream, file);
runtime.install_panic_hook();
runtime.run(|uffd_handler: &mut UffdHandler| {
// Read an event from the userfaultfd.
let event = uffd_handler
.read_event()
.expect("Failed to read uffd_msg")
.expect("uffd_msg not ready");

// We expect to receive either a Page Fault or Removed
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size)
// !DISCLAIMER!
// When using UFFD together with the balloon device, this handler needs to deal with Removed
// and Pagefault events. There are multiple things to keep in mind in such setups:
//
// As long as any Removed event in pending in the UFFD queue, all ioctls return EAGAIN
// -----------------------------------------------------------------------------------
//
// This means we cannot process UFFD events simply one-by-one anymore - if a `remove` event
// arrives, we need to pre-fetch all other events up to the `remove` event, to unblock the
// UFFD, and then go back to the process the pre-fetched events.
//
// UFFD might receive events in not in their causal order
// -----------------------------------------------------
//
// For example, the guest
// kernel might first respond to a balloon inflation by freeing some memory, and
// telling Firecracker about this. Firecracker will then madvise(MADV_DONTNEED) the
// free memory range, which causes a Removed event to be sent to UFFD. Then, the
// guest kernel might immediately fault the page in again (for example because
// default_on_oom was set). which causes a Pagefault event to be sent to UFFD.
//
// However, the pagefault will be triggered from inside KVM on the vCPU thread, while the
// balloon device is handled by Firecracker on its VMM thread. This means that potentially
// this handler can receive the Pagefault _before_ the Removed event.
//
// This means that the simple "greedy" strategy of simply prefetching _all_ UFFD events
// to make sure no `remove` event is blocking us can result in the handler acting on
// the `pagefault` event before the `remove` message (despite the `remove` event being
// in the causal past of the `pagefault` event), which means that we will fault in a page
// from the snapshot file, while really we should be faulting in a zero page.
//
// In this example handler, we ignore this problem, to avoid
// complexity (under the assumption that the guest kernel will zero a newly faulted in
// page anyway). A production handler will most likely want to ensure that Removed
// events for a specific range are always handled before Pagefault events.
//
// Lastly, we still need to deal with the race condition where a Removed event arrives
// in the UFFD queue after we got done reading all events, in which case we need to go
// back to reading more events before we can continue processing Pagefaults.
let mut deferred_events = Vec::new();

loop {
// First, try events that we couldn't handle last round
let mut events_to_handle = Vec::from_iter(deferred_events.drain(..));

// Read all events from the userfaultfd.
while let Some(event) = uffd_handler.read_event().expect("Failed to read uffd_msg") {
events_to_handle.push(event);
}

for event in events_to_handle.drain(..) {
// We expect to receive either a Page Fault or Removed
// event (if the balloon device is enabled).
match event {
userfaultfd::Event::Pagefault { addr, .. } => {
if !uffd_handler.serve_pf(addr.cast(), uffd_handler.page_size) {
deferred_events.push(event);
}
}
userfaultfd::Event::Remove { start, end } => uffd_handler
.update_mem_state_mappings(start as u64, end as u64, MemPageState::Removed),
_ => panic!("Unexpected event on userfaultfd"),
}
}

// We assume that really only the above removed/pagefault interaction can result in
// deferred events. In that scenario, the loop will always terminate (unless
// newly arriving Removed events end up indefinitely blocking it, but there's nothing
// we can do about that, and it's a largely theoretical problem).
if deferred_events.is_empty() {
break;
}
userfaultfd::Event::Remove { start, end } => uffd_handler.update_mem_state_mappings(
start as u64,
end as u64,
MemPageState::Removed,
),
_ => panic!("Unexpected event on userfaultfd"),
}
});
}

0 comments on commit 55f9a68

Please sign in to comment.