Skip to content

feat(virtio-block): Add support for VIRTIO_BLK_T_DISCARD request type #5168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/vmm/src/devices/virtio/block/virtio/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use crate::devices::virtio::block::CacheType;
use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice};
use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice};
use crate::devices::virtio::generated::virtio_blk::{
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES,
VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_F_DISCARD,
};
use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1;
use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
Expand Down Expand Up @@ -295,7 +295,8 @@ impl VirtioBlock {
.map_err(VirtioBlockError::RateLimiter)?
.unwrap_or_default();

let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX);
let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX)
| (1u64 << VIRTIO_BLK_F_DISCARD);

if config.cache_type == CacheType::Writeback {
avail_features |= 1u64 << VIRTIO_BLK_F_FLUSH;
Expand Down
2 changes: 1 addition & 1 deletion src/vmm/src/devices/virtio/block/virtio/io/async_io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ impl AsyncFileEngine {
Ok(())
}

#[cfg(test)]

pub fn file(&self) -> &File {
&self.file
}
Expand Down
39 changes: 38 additions & 1 deletion src/vmm/src/devices/virtio/block/virtio/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ pub mod sync_io;

use std::fmt::Debug;
use std::fs::File;
use libc::{c_int, off64_t};
use std::os::unix::io::AsRawFd;

pub use self::async_io::{AsyncFileEngine, AsyncIoError};
pub use self::sync_io::{SyncFileEngine, SyncIoError};
Expand Down Expand Up @@ -33,6 +35,13 @@ pub enum BlockIoError {
Async(AsyncIoError),
}

bitflags::bitflags! {
pub struct FallocateFlags: c_int {
const FALLOC_FL_KEEP_SIZE = 0x01;
const FALLOC_FL_PUNCH_HOLE = 0x02;
}
}

Comment on lines +38 to +44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

libc crate already has these constants

impl BlockIoError {
pub fn is_throttling_err(&self) -> bool {
match self {
Expand Down Expand Up @@ -75,7 +84,7 @@ impl FileEngine {
Ok(())
}

#[cfg(test)]

pub fn file(&self) -> &File {
match self {
FileEngine::Async(engine) => engine.file(),
Expand Down Expand Up @@ -172,6 +181,34 @@ impl FileEngine {
FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync),
}
}

pub fn handle_discard(&self, offset: u64, len: u32) -> Result<(), std::io::Error> {
let fd = self.file().as_raw_fd();
let result = Self::fallocate(
fd,
FallocateFlags::FALLOC_FL_PUNCH_HOLE | FallocateFlags::FALLOC_FL_KEEP_SIZE,
offset as i64,
len as i64,
);
if let Err(e) = result {
eprintln!("Discard failed: {}", e);
return Err(std::io::Error::last_os_error());
}
Ok(())
Comment on lines +186 to +197
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we have 2 backends for block device, each need to implement it's own version of discard. For sync version this impl is basically what it needs to be, for io_uring backend there is IORING_OP_FALLOCATE op.

}

pub fn fallocate(fd: c_int, mode: FallocateFlags, offset: off64_t, len: off64_t) -> Result<(), std::io::Error> {
// need to refer to libc library.
let ret: i32 = unsafe { libc::fallocate64(fd, mode.bits(), offset, len) };
if ret == 0 {
Ok(())
} else {
Err(std::io::Error::last_os_error())
}
}



}

#[cfg(test)]
Expand Down
4 changes: 3 additions & 1 deletion src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ pub enum SyncIoError {
SyncAll(std::io::Error),
/// Transfer: {0}
Transfer(GuestMemoryError),
/// Discard: {0}
Discard(std::io::Error)
}

#[derive(Debug)]
Expand All @@ -33,7 +35,7 @@ impl SyncFileEngine {
SyncFileEngine { file }
}

#[cfg(test)]

pub fn file(&self) -> &File {
&self.file
}
Expand Down
3 changes: 3 additions & 0 deletions src/vmm/src/devices/virtio/block/virtio/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ pub struct BlockDeviceMetrics {
pub invalid_reqs_count: SharedIncMetric,
/// Number of flushes operation triggered on this block device.
pub flush_count: SharedIncMetric,
/// Number of discard operation triggered on this block device.
pub discard_count: SharedIncMetric,
/// Number of events triggered on the queue of this block device.
pub queue_event_count: SharedIncMetric,
/// Number of events ratelimiter-related.
Expand Down Expand Up @@ -210,6 +212,7 @@ impl BlockDeviceMetrics {
self.invalid_reqs_count
.add(other.invalid_reqs_count.fetch_diff());
self.flush_count.add(other.flush_count.fetch_diff());
self.discard_count.add(other.discard_count.fetch_diff());
self.queue_event_count
.add(other.queue_event_count.fetch_diff());
self.rate_limiter_event_count
Expand Down
8 changes: 8 additions & 0 deletions src/vmm/src/devices/virtio/block/virtio/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,12 @@ pub enum VirtioBlockError {
RateLimiter(std::io::Error),
/// Persistence error: {0}
Persist(crate::devices::virtio::persist::PersistError),
/// Sector overflow in discard segment
SectorOverflow,
/// Discard segment exceeds disk size
BeyondDiskSize,
/// Invalid flags in discard segment
InvalidDiscardFlags,
/// Invalid discard request (e.g., empty segments)
InvalidDiscardRequest,
}
90 changes: 88 additions & 2 deletions src/vmm/src/devices/virtio/block/virtio/request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,19 @@ use std::convert::From;

use vm_memory::GuestMemoryError;

use super::io::{BlockIoError, SyncIoError};
use super::{SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io};
use crate::devices::virtio::block::virtio::device::DiskProperties;
use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics;
pub use crate::devices::virtio::generated::virtio_blk::{
VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP,
VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT,
VIRTIO_BLK_T_DISCARD
};
use crate::devices::virtio::queue::DescriptorChain;
use crate::logger::{IncMetric, error};
use crate::rate_limiter::{RateLimiter, TokenType};
use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap};
use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap, Address};

#[derive(Debug, derive_more::From)]
pub enum IoErr {
Expand All @@ -34,6 +36,7 @@ pub enum RequestType {
Out,
Flush,
GetDeviceID,
Discard,
Unsupported(u32),
}

Expand All @@ -44,6 +47,7 @@ impl From<u32> for RequestType {
VIRTIO_BLK_T_OUT => RequestType::Out,
VIRTIO_BLK_T_FLUSH => RequestType::Flush,
VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID,
VIRTIO_BLK_T_DISCARD => RequestType::Discard,
t => RequestType::Unsupported(t),
}
}
Expand Down Expand Up @@ -176,6 +180,12 @@ impl PendingRequest {
(Ok(transferred_data_len), RequestType::GetDeviceID) => {
Status::from_data(self.data_len, transferred_data_len, true)
}
(Ok(_), RequestType::Discard) => {
block_metrics.discard_count.inc();
Status::Ok {
num_bytes_to_mem: 0,
}
}
(_, RequestType::Unsupported(op)) => Status::Unsupported { op },
(Err(err), _) => Status::IoErr {
num_bytes_to_mem: 0,
Expand Down Expand Up @@ -231,13 +241,23 @@ impl RequestHeader {
}
}

// For this, please see VirtIO v1.2. This struct mimics that of the implementation in Virtio.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct DiscardSegment {
sector: u64,
num_sectors: u32,
flags: u32,
}
Comment on lines +246 to +250
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

flags is not just a 32 bit integer, it is a struct with 2 u32 inside. Also it needs to have a C struct layout attribute.

unsafe impl ByteValued for DiscardSegment {}

#[derive(Debug, PartialEq, Eq)]
pub struct Request {
pub r#type: RequestType,
pub data_len: u32,
pub status_addr: GuestAddress,
sector: u64,
data_addr: GuestAddress,
discard_segments: Option<Vec<DiscardSegment>>, // New field, holds ranges
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this comment?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh lol that was just a mental note to myself, forgot to take it out.

}

impl Request {
Expand All @@ -258,10 +278,11 @@ impl Request {
data_addr: GuestAddress(0),
data_len: 0,
status_addr: GuestAddress(0),
discard_segments: None
};

let data_desc;
let status_desc;
let mut status_desc;
let desc = avail_desc
.next_descriptor()
.ok_or(VirtioBlockError::DescriptorChainTooShort)?;
Expand Down Expand Up @@ -313,6 +334,52 @@ impl Request {
return Err(VirtioBlockError::InvalidDataLength);
}
}
RequestType::Discard => {
// Get data descriptor
let data_desc = avail_desc.next_descriptor()
.ok_or(VirtioBlockError::DescriptorChainTooShort)?;
Comment on lines +338 to +340
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a place to parse the request. It needs to be done above. There is already a place where data_desc is assigned.

if data_desc.is_write_only() {
return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor);
}

// Validate data length
let segment_size = std::mem::size_of::<DiscardSegment>() as u32; // 16 bytes
if data_desc.len % segment_size != 0 {
return Err(VirtioBlockError::InvalidDataLength);
}

// Calculate number of segments
let num_segments = data_desc.len / segment_size;
if num_segments == 0 {
return Err(VirtioBlockError::InvalidDiscardRequest);
}
let mut segments = Vec::with_capacity(num_segments as usize);
Copy link
Contributor

@ShadowCurse ShadowCurse Apr 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can avoid having to deal with multiple segments if we just limit their number to 1 in the config space of the device. You will need to expand the ConfigSpace type to include max_discard_sectors and everything before it. Also in all other code please avoid dynamic allocations, it can cause issues in the future.


// Populate DiscardSegments vector
for i in 0..num_segments {
let offset = i * segment_size;
let segment: DiscardSegment = mem.read_obj(data_desc.addr.unchecked_add(offset as u64))
.map_err(VirtioBlockError::GuestMemory)?;
if segment.flags & !0x1 != 0 {
return Err(VirtioBlockError::InvalidDiscardFlags);
}
let end_sector = segment.sector.checked_add(segment.num_sectors as u64)
.ok_or(VirtioBlockError::SectorOverflow)?;
if end_sector > num_disk_sectors {
return Err(VirtioBlockError::BeyondDiskSize);
}
segments.push(segment);
}

// Assign to req.discard_segments
req.discard_segments = Some(segments);
req.data_len = data_desc.len;
status_desc = data_desc.next_descriptor()
.ok_or(VirtioBlockError::DescriptorChainTooShort)?;
if !status_desc.is_write_only() {
return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor);
}
}
_ => {}
}

Expand Down Expand Up @@ -390,6 +457,22 @@ impl Request {
.map_err(IoErr::GetId);
return ProcessingResult::Executed(pending.finish(mem, res, block_metrics));
}
RequestType::Discard => {
let res = disk
.file_engine
.handle_discard(self.offset(), self.data_len);

match res {
Ok(()) => Ok(block_io::FileEngineOk::Executed(block_io::RequestOk {
req: pending,
count: 0,
})),
Err(e) => Err(block_io::RequestError {
req: pending,
error: BlockIoError::Sync(SyncIoError::Discard(e)),
}),
}
Comment on lines +465 to +474
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice hallucinations. The file_engine.handle_discard should return same result type as file_engine.read/file_engine.write and so on. And then it will be handled by the code bellow.

Copy link
Author

@LDagnachew LDagnachew Apr 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha, that checks out, I'll make that change.

}
RequestType::Unsupported(_) => {
return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics));
}
Expand Down Expand Up @@ -730,6 +813,7 @@ mod tests {
RequestType::Out => VIRTIO_BLK_T_OUT,
RequestType::Flush => VIRTIO_BLK_T_FLUSH,
RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID,
RequestType::Discard => VIRTIO_BLK_T_DISCARD,
RequestType::Unsupported(id) => id,
}
}
Expand All @@ -742,6 +826,7 @@ mod tests {
RequestType::Out => VIRTQ_DESC_F_NEXT,
RequestType::Flush => VIRTQ_DESC_F_NEXT,
RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE,
RequestType::Discard => VIRTQ_DESC_F_NEXT,
RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT,
}
}
Expand Down Expand Up @@ -833,6 +918,7 @@ mod tests {
status_addr,
sector: sector & (NUM_DISK_SECTORS - sectors_len),
data_addr,
discard_segments: None
};
let mut request_header = RequestHeader::new(virtio_request_id, request.sector);

Expand Down