Skip to content

Add Steal Time support in ARM #5139

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ and this project adheres to

### Added

- [#5139](https://github.com/firecracker-microvm/firecracker/pull/5139): Added
support for [PVTime](https://docs.kernel.org/virt/kvm/arm/pvtime.html). This
is used to support steal time on ARM machines.
- [#5048](https://github.com/firecracker-microvm/firecracker/pull/5048): Added
support for [PVH boot mode](docs/pvh.md). This is used when an x86 kernel
provides the appropriate ELF Note to indicate that PVH boot mode is supported.
Expand Down
49 changes: 49 additions & 0 deletions src/vmm/src/arch/aarch64/vcpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use std::mem::offset_of;
use kvm_bindings::*;
use kvm_ioctls::{VcpuExit, VcpuFd, VmFd};
use serde::{Deserialize, Serialize};
use vm_memory::GuestAddress;

use super::get_fdt_addr;
use super::regs::*;
Expand Down Expand Up @@ -42,6 +43,8 @@ pub enum VcpuArchError {
Fam(vmm_sys_util::fam::Error),
/// {0}
GetMidrEl1(String),
/// Failed to set/get device attributes for vCPU: {0}
DeviceAttribute(kvm_ioctls::Error),
}

/// Extract the Manufacturer ID from the host.
Expand Down Expand Up @@ -115,6 +118,8 @@ pub struct KvmVcpu {
/// Vcpu peripherals, such as buses
pub peripherals: Peripherals,
kvi: kvm_vcpu_init,
/// IPA of steal_time region
pub pvtime_ipa: Option<GuestAddress>,
}

/// Vcpu peripherals
Expand Down Expand Up @@ -148,6 +153,7 @@ impl KvmVcpu {
fd: kvm_vcpu,
peripherals: Default::default(),
kvi,
pvtime_ipa: None,
})
}

Expand Down Expand Up @@ -243,6 +249,8 @@ impl KvmVcpu {
// the boot state and turned secondary vcpus on.
state.kvi.features[0] &= !(1 << KVM_ARM_VCPU_POWER_OFF);

state.pvtime_ipa = self.pvtime_ipa.map(|guest_addr| guest_addr.0);

Ok(state)
}

Expand Down Expand Up @@ -276,6 +284,13 @@ impl KvmVcpu {
}
self.set_mpstate(state.mp_state)
.map_err(KvmVcpuError::RestoreState)?;

// Assumes that steal time memory region was set up already
if let Some(pvtime_ipa) = state.pvtime_ipa {
self.enable_pvtime(GuestAddress(pvtime_ipa))
.map_err(KvmVcpuError::RestoreState)?;
}

Ok(())
}

Expand Down Expand Up @@ -439,6 +454,38 @@ impl KvmVcpu {
pub fn set_mpstate(&self, state: kvm_mp_state) -> Result<(), VcpuArchError> {
self.fd.set_mp_state(state).map_err(VcpuArchError::SetMp)
}

/// Reports whether pvtime (steal time on ARM) can be used with this vcpu.
///
/// Probes the vcpu for the `KVM_ARM_VCPU_PVTIME_IPA` attribute of the
/// `KVM_ARM_VCPU_PVTIME_CTRL` group. Any error returned by the probe is
/// treated as "not supported".
pub fn supports_pvtime(&self) -> bool {
    // Only `group` and `attr` select what is probed; no payload address is passed.
    let probe = kvm_bindings::kvm_device_attr {
        flags: 0,
        group: kvm_bindings::KVM_ARM_VCPU_PVTIME_CTRL,
        attr: kvm_bindings::KVM_ARM_VCPU_PVTIME_IPA as u64,
        addr: 0,
    };

    let probe_result = self.fd.has_device_attr(&probe);
    probe_result.is_ok()
}

/// Enables pvtime for vcpu by registering `ipa` as the base of its steal_time region.
///
/// `ipa` is handed to KVM through the `KVM_ARM_VCPU_PVTIME_IPA` attribute of the
/// `KVM_ARM_VCPU_PVTIME_CTRL` group. `self.pvtime_ipa` is updated only after KVM has
/// accepted the attribute, so a failed call leaves the previously recorded value untouched.
///
/// # Errors
///
/// Returns [`VcpuArchError::DeviceAttribute`] if setting the device attribute fails.
pub fn enable_pvtime(&mut self, ipa: GuestAddress) -> Result<(), VcpuArchError> {
    // Use KVM syscall (kvm_set_device_attr) to register the vCPU with the steal_time region
    let vcpu_device_attr = kvm_bindings::kvm_device_attr {
        group: KVM_ARM_VCPU_PVTIME_CTRL,
        attr: KVM_ARM_VCPU_PVTIME_IPA as u64,
        // Userspace address of attr data; `ipa` stays alive until this function returns
        addr: &ipa.0 as *const u64 as u64,
        flags: 0,
    };

    self.fd
        .set_device_attr(&vcpu_device_attr)
        .map_err(VcpuArchError::DeviceAttribute)?;

    // Record the address only once KVM accepted it, so a failed call is never
    // saved into the vcpu state as if pvtime were enabled.
    self.pvtime_ipa = Some(ipa);

    Ok(())
}
}

impl Peripherals {
Expand Down Expand Up @@ -467,6 +514,8 @@ pub struct VcpuState {
pub mpidr: u64,
/// kvi states for vcpu initialization.
pub kvi: kvm_vcpu_init,
/// ipa for steal_time region
pub pvtime_ipa: Option<u64>,
}

impl Debug for VcpuState {
Expand Down
60 changes: 60 additions & 0 deletions src/vmm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
use userfaultfd::Uffd;
use utils::time::TimestampUs;
#[cfg(target_arch = "aarch64")]
use vm_memory::GuestAddress;
#[cfg(target_arch = "aarch64")]
use vm_superio::Rtc;
use vm_superio::Serial;
use vmm_sys_util::eventfd::EventFd;
Expand Down Expand Up @@ -82,6 +84,9 @@
CreateLegacyDevice(device_manager::legacy::LegacyDeviceError),
/// Error creating VMGenID device: {0}
CreateVMGenID(VmGenIdError),
/// Error enabling pvtime on vcpu: {0}
#[cfg(target_arch = "aarch64")]
EnablePVTime(crate::arch::VcpuArchError),

Check warning on line 89 in src/vmm/src/builder.rs

View check run for this annotation

Codecov / codecov/patch

src/vmm/src/builder.rs#L89

Added line #L89 was not covered by tests
/// Invalid Memory Configuration: {0}
GuestMemory(crate::vstate::memory::MemoryError),
/// Error with initrd initialization: {0}.
Expand Down Expand Up @@ -289,6 +294,13 @@

attach_vmgenid_device(&mut vmm)?;

#[cfg(target_arch = "aarch64")]
if vcpus[0].kvm_vcpu.supports_pvtime() {
setup_pvtime(&mut vmm, &mut vcpus)?;
} else {
log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest");

Check warning on line 301 in src/vmm/src/builder.rs

View check run for this annotation

Codecov / codecov/patch

src/vmm/src/builder.rs#L301

Added line #L301 was not covered by tests
}

configure_system_for_boot(
&mut vmm,
vcpus.as_mut(),
Expand Down Expand Up @@ -449,6 +461,16 @@
}
}

// Restore allocator state
#[cfg(target_arch = "aarch64")]
if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa {
allocate_pvtime_region(
&mut vmm,
vcpus.len(),
vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0),
)?;

Check warning on line 471 in src/vmm/src/builder.rs

View check run for this annotation

Codecov / codecov/patch

src/vmm/src/builder.rs#L467-L471

Added lines #L467 - L471 were not covered by tests
}

// Restore vcpus kvm state.
for (vcpu, state) in vcpus.iter_mut().zip(microvm_state.vcpu_states.iter()) {
vcpu.kvm_vcpu
Expand Down Expand Up @@ -552,6 +574,44 @@
Ok(serial)
}

/// Number of bytes reserved for each vCPU's steal time structure; also used as the
/// alignment of the steal time region when it is allocated.
///
/// 64 bytes due to alignment requirement in 3.1 of <https://www.kernel.org/doc/html/v5.8/virt/kvm/devices/vcpu.html#attribute-kvm-arm-vcpu-pvtime-ipa>
#[cfg(target_arch = "aarch64")]
const STEALTIME_STRUCT_MEM_SIZE: u64 = 64;

/// Reserves the steal time region in system memory and returns its base address.
///
/// One `STEALTIME_STRUCT_MEM_SIZE`-byte slot is requested per vCPU, aligned to
/// `STEALTIME_STRUCT_MEM_SIZE`, using the supplied allocation `policy`.
#[cfg(target_arch = "aarch64")]
fn allocate_pvtime_region(
    vmm: &mut Vmm,
    vcpu_count: usize,
    policy: vm_allocator::AllocPolicy,
) -> Result<GuestAddress, StartMicrovmError> {
    let region_len = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64;

    vmm.resource_allocator
        .allocate_system_memory(region_len, STEALTIME_STRUCT_MEM_SIZE, policy)
        .map(GuestAddress)
        .map_err(StartMicrovmError::AllocateResources)
}

/// Enables pvtime on every vcpu, giving each one its own slot of a newly allocated region
#[cfg(target_arch = "aarch64")]
fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmError> {
    // Reserve system memory for the whole steal time region up front
    let region_base =
        allocate_pvtime_region(vmm, vcpus.len(), vm_allocator::AllocPolicy::LastMatch)?;

    // The vcpu at position `slot` is registered with the `slot`-th
    // STEALTIME_STRUCT_MEM_SIZE-byte chunk of the region; stop at the first failure
    vcpus.iter_mut().zip(0u64..).try_for_each(|(vcpu, slot)| {
        vcpu.kvm_vcpu
            .enable_pvtime(GuestAddress(
                region_base.0 + slot * STEALTIME_STRUCT_MEM_SIZE,
            ))
            .map_err(StartMicrovmError::EnablePVTime)
    })
}

#[cfg(target_arch = "aarch64")]
fn attach_legacy_devices_aarch64(
event_manager: &mut EventManager,
Expand Down
2 changes: 1 addition & 1 deletion src/vmm/src/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ pub enum CreateSnapshotError {
}

/// Snapshot version
pub const SNAPSHOT_VERSION: Version = Version::new(6, 0, 0);
pub const SNAPSHOT_VERSION: Version = Version::new(7, 0, 0);

/// Creates a Microvm snapshot.
pub fn create_snapshot(
Expand Down
121 changes: 121 additions & 0 deletions tests/integration_tests/functional/test_steal_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""Tests for verifying the PVTime device behavior under contention and across snapshots."""

import time

import pytest

from framework.properties import global_props


def get_steal_time_ms(vm):
    """Return the total steal time of the VM's vCPUs, in milliseconds."""
    # Aggregate "cpu" line of the guest's /proc/stat; the value at index 8
    # (counting the "cpu" label as index 0) is read as the steal tick count.
    _rc, stat_line, _err = vm.ssh.run("grep -w '^cpu' /proc/stat")
    fields = stat_line.strip().split()
    steal_time_tck = int(fields[8])

    # Guest clock ticks per second, needed to turn ticks into time.
    tick_query = vm.ssh.run("getconf CLK_TCK")
    clk_tck = int(tick_query.stdout)

    return steal_time_tck / clk_tck * 1000


@pytest.mark.skipif(
    global_props.cpu_architecture != "aarch64", reason="Only run in aarch64"
)
def test_guest_has_pvtime_enabled(uvm_plain):
    """
    Boot a guest and verify that its kernel log mentions PV steal time.
    """
    microvm = uvm_plain
    microvm.spawn()
    microvm.basic_config()
    microvm.add_net_iface()
    microvm.start()

    # Only the command's stdout is of interest here.
    _rc, dmesg_hits, _err = microvm.ssh.run("dmesg | grep 'stolen time PV'")
    pv_reported = "stolen time PV" in dmesg_hits

    assert pv_reported, "Guest kernel did not report PV steal time enabled"


def test_pvtime_steal_time_increases(uvm_plain):
    """
    Check that guest-visible steal time grows while vCPUs 0 and 1 compete
    for the same physical CPU.
    """
    microvm = uvm_plain
    microvm.spawn()
    microvm.basic_config()
    microvm.add_net_iface()
    microvm.start()

    # Induce contention: vCPUs 0 and 1 are both pinned to physical CPU 0
    for vcpu_idx in (0, 1):
        microvm.pin_vcpu(vcpu_idx, 0)

    # Launch two endless busy loops in the guest to hog CPU time
    hog_cmd = "nohup bash -c 'while true; do :; done' >/dev/null 2>&1 &"
    for _ in range(2):
        microvm.ssh.run(hog_cmd)

    # Sample steal time on both sides of a short wait
    steal_before = get_steal_time_ms(microvm)
    time.sleep(2)
    steal_after = get_steal_time_ms(microvm)

    # The counter must have moved forward
    grew = steal_after > steal_before
    assert (
        grew
    ), f"Steal time did not increase as expected. Before: {steal_before}, After: {steal_after}"


def test_pvtime_snapshot(uvm_plain, microvm_factory):
    """
    Check that steal time carries over a full snapshot/restore cycle
    and keeps growing once the restored guest is resumed.
    """
    source_vm = uvm_plain
    source_vm.spawn()
    source_vm.basic_config()
    source_vm.add_net_iface()
    source_vm.start()

    # vCPUs 0 and 1 share physical CPU 0 so that steal time accumulates
    for vcpu_idx in (0, 1):
        source_vm.pin_vcpu(vcpu_idx, 0)

    hog_cmd = "nohup bash -c 'while true; do :; done' >/dev/null 2>&1 &"
    for _ in range(2):
        source_vm.ssh.run(hog_cmd)

    # Baseline sampled right before the snapshot is taken
    steal_before = get_steal_time_ms(source_vm)

    snapshot = source_vm.snapshot_full()
    source_vm.kill()

    # Load the snapshot into a fresh microVM without resuming it yet,
    # so the vCPUs can be pinned before the guest runs again
    restored_vm = microvm_factory.build()
    restored_vm.spawn()
    restored_vm.restore_from_snapshot(snapshot, resume=False)
    snapshot.delete()

    for vcpu_idx in (0, 1):
        restored_vm.pin_vcpu(vcpu_idx, 0)
    restored_vm.resume()

    # First sample right after resuming, second one after letting the guest run
    steal_after_snap = get_steal_time_ms(restored_vm)
    time.sleep(2)
    steal_after_resume = get_steal_time_ms(restored_vm)

    # Persistence: the counter moved forward across the snapshot, but by less
    # than the tolerance (2000 ms, i.e. 2.0 seconds)
    tolerance = 2000
    drift = steal_after_snap - steal_before
    persisted = steal_before < steal_after_snap and drift < tolerance

    # Progress: the counter keeps growing in the restored guest
    increased = steal_after_resume > steal_after_snap

    assert (
        persisted and increased
    ), "Steal time did not persist through snapshot or failed to increase after resume"