diff --git a/Cargo.toml b/Cargo.toml index 0a47ce92..bd7c304c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,8 @@ categories = ["hardware-support", "no-std"] log = "0.4" bitflags = "2.3.0" zerocopy = "0.6.1" +spin = "0.9" + [features] default = ["alloc"] diff --git a/examples/aarch64/src/main.rs b/examples/aarch64/src/main.rs index 8ba1ac57..09d081cb 100644 --- a/examples/aarch64/src/main.rs +++ b/examples/aarch64/src/main.rs @@ -146,7 +146,8 @@ fn virtio_device(transport: impl Transport) { } fn virtio_blk(transport: T) { - let mut blk = VirtIOBlk::::new(transport).expect("failed to create blk driver"); + let mut blk = + VirtIOBlk::::new(transport, true).expect("failed to create blk driver"); assert!(!blk.readonly()); let mut input = [0xffu8; 512]; let mut output = [0; 512]; @@ -154,8 +155,9 @@ fn virtio_blk(transport: T) { for x in input.iter_mut() { *x = i as u8; } - blk.write_block(i, &input).expect("failed to write"); - blk.read_block(i, &mut output).expect("failed to read"); + blk.write_blocks(i, &[&input]).expect("failed to write"); + blk.read_blocks(i, &mut [&mut output]) + .expect("failed to read"); assert_eq!(input, output); } info!("virtio-blk test finished"); diff --git a/examples/riscv/Makefile b/examples/riscv/Makefile index debb82f5..fc6dc49b 100644 --- a/examples/riscv/Makefile +++ b/examples/riscv/Makefile @@ -71,14 +71,26 @@ qemu: kernel $(img) -kernel $(kernel) \ -global virtio-mmio.force-legacy=false \ -drive file=$(img),if=none,format=raw,id=x0 \ - -device virtio-blk-device,drive=x0 \ + -device virtio-blk-device,packed=on,drive=x0 \ -device virtio-gpu-device \ -device virtio-mouse-device \ -device virtio-net-device,netdev=net0 \ -netdev user,id=net0,hostfwd=tcp::5555-:5555 +qemu-blk: kernel $(img) + qemu-system-$(arch) \ + $(QEMU_ARGS) \ + -machine virt \ + -serial mon:stdio \ + -bios default \ + -kernel $(kernel) \ + -global virtio-mmio.force-legacy=false \ + -drive file=$(img),if=none,format=raw,id=x0 \ + -device virtio-blk-device,packed=on,drive=x0 $(img): dd if=/dev/zero of=$@ bs=512 count=32 -run: build qemu-legacy qemu +# run: build qemu-legacy qemu +run: build qemu-blk + diff --git a/examples/riscv/src/main.rs b/examples/riscv/src/main.rs index e0e7dd86..2857d392 100644 --- a/examples/riscv/src/main.rs +++ b/examples/riscv/src/main.rs @@ -1,7 +1,7 @@ #![no_std] #![no_main] #![deny(warnings)] - +#![allow(warnings, unused)] #[macro_use] extern crate log; @@ -83,23 +83,25 @@ fn virtio_probe(node: FdtNode) { fn virtio_device(transport: impl Transport) { match transport.device_type() { DeviceType::Block => virtio_blk(transport), - DeviceType::GPU => virtio_gpu(transport), - DeviceType::Input => virtio_input(transport), - DeviceType::Network => virtio_net(transport), + // DeviceType::GPU => virtio_gpu(transport), + // DeviceType::Input => virtio_input(transport), + // DeviceType::Network => virtio_net(transport), t => warn!("Unrecognized virtio device: {:?}", t), } } fn virtio_blk(transport: T) { - let mut blk = VirtIOBlk::::new(transport).expect("failed to create blk driver"); + let mut blk = + VirtIOBlk::::new(transport, true).expect("failed to create blk driver"); let mut input = vec![0xffu8; 512]; let mut output = vec![0; 512]; for i in 0..32 { for x in input.iter_mut() { *x = i as u8; } - blk.write_block(i, &input).expect("failed to write"); - blk.read_block(i, &mut output).expect("failed to read"); + blk.write_blocks(i, &[&input]).expect("failed to write"); + blk.read_blocks(i, &mut [&mut output]) + .expect("failed to read"); assert_eq!(input, output); } 
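    // Editor's sketch (not part of this patch): the new `*_blocks` API takes a slice of
    // sector-sized buffers, so one request can cover several adjacent sectors. Assuming two
    // hypothetical 512-byte buffers starting at block 0:
    //
    //     let (mut a, mut b) = ([0u8; 512], [0u8; 512]);
    //     blk.read_blocks(0, &mut [&mut a, &mut b]).expect("failed to read");
    //     blk.write_blocks(0, &[&a, &b]).expect("failed to write");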
info!("virtio-blk test finished"); diff --git a/examples/x86_64/src/main.rs b/examples/x86_64/src/main.rs index 1823f558..56e50c9b 100644 --- a/examples/x86_64/src/main.rs +++ b/examples/x86_64/src/main.rs @@ -72,7 +72,8 @@ fn virtio_device(transport: impl Transport) { } fn virtio_blk(transport: T) { - let mut blk = VirtIOBlk::::new(transport).expect("failed to create blk driver"); + let mut blk = + VirtIOBlk::::new(transport, true).expect("failed to create blk driver"); assert!(!blk.readonly()); let mut input = [0xffu8; 512]; let mut output = [0; 512]; @@ -80,8 +81,9 @@ fn virtio_blk(transport: T) { for x in input.iter_mut() { *x = i as u8; } - blk.write_block(i, &input).expect("failed to write"); - blk.read_block(i, &mut output).expect("failed to read"); + blk.write_blocks(i, &[&input]).expect("failed to write"); + blk.read_blocks(i, &mut [&mut output]) + .expect("failed to read"); assert_eq!(input, output); } info!("virtio-blk test finished"); diff --git a/src/device/blk.rs b/src/device/blk.rs index ea3aef00..e2879527 100644 --- a/src/device/blk.rs +++ b/src/device/blk.rs @@ -1,16 +1,28 @@ //! Driver for VirtIO block devices. +extern crate alloc; use crate::hal::Hal; -use crate::queue::VirtQueue; +use crate::queue::{packed_queue::PackedQueue, split_queue::SplitQueue, VirtQueue}; use crate::transport::Transport; use crate::volatile::{volread, Volatile}; +use crate::NonNull; use crate::{Error, Result}; use bitflags::bitflags; -use log::info; use zerocopy::{AsBytes, FromBytes}; +use alloc::{boxed::Box, sync::Arc, vec, vec::Vec}; + +use core::{ + future::Future, + pin::Pin, + task::{Context, Poll, Waker}, +}; +use spin::Mutex; + +use log::debug; + const QUEUE: u16 = 0; -const QUEUE_SIZE: u16 = 16; +const QUEUE_SIZE: u16 = 64; /// Driver for a VirtIO block device. /// @@ -27,66 +39,204 @@ const QUEUE_SIZE: u16 = 16; /// use virtio_drivers::device::blk::{VirtIOBlk, SECTOR_SIZE}; /// /// # fn example(transport: T) -> Result<(), Error> { -/// let mut disk = VirtIOBlk::::new(transport)?; +/// let mut disk = VirtIOBlk::::new(transport, true)?; /// /// println!("VirtIO block device: {} kB", disk.capacity() * SECTOR_SIZE as u64 / 2); /// /// // Read sector 0 and then copy it to sector 1. /// let mut buf = [0; SECTOR_SIZE]; -/// disk.read_block(0, &mut buf)?; -/// disk.write_block(1, &buf)?; +/// disk.read_blocks(0, &mut[&mut buf])?; +/// disk.write_blocks(1, &[&buf])?; /// # Ok(()) /// # } /// ``` +/// +/// +/// +struct VirtIoBlkInner { + queue: VirtQueue, + /// Asynchronous IO + blkinfos: Box<[BlkInfo]>, +} +/// aaa pub struct VirtIOBlk { transport: T, - queue: VirtQueue, - capacity: u64, - readonly: bool, + config: Blkconfiglocal, + features_neg: BlkFeature, + inner: Arc>>, } -impl VirtIOBlk { +// TODO: The ability of Used Buffer Notification Suppression is not fully exploited (Ref: Section 2.7.7.1) +impl<'a, H: Hal, T: Transport> VirtIOBlk { /// Create a new VirtIO-Blk driver. 
- pub fn new(mut transport: T) -> Result { - let mut readonly = false; + pub fn new(mut transport: T, notification_supress: bool) -> Result { + let mut features_neg = BlkFeature::empty(); transport.begin_init(|features| { - let features = BlkFeature::from_bits_truncate(features); - info!("device features: {:?}", features); - readonly = features.contains(BlkFeature::RO); + // 剔除 device 不支持的feature + + features_neg = BlkFeature::from_bits_truncate(features); + // negotiate these flags only - let supported_features = BlkFeature::empty(); - (features & supported_features).bits() + if notification_supress { + features_neg.remove(BlkFeature::VIRTIO_F_EVENT_IDX); + } + + features_neg.bits() }); // read configuration space let config = transport.config_space::()?; - info!("config: {:?}", config); - // Safe because config is a valid pointer to the device configuration space. - let capacity = unsafe { - volread!(config, capacity_low) as u64 | (volread!(config, capacity_high) as u64) << 32 + let config = Self::read_config(&config); + debug!("found a block device of size {}KB", config.capacity / 2); + + let mut indirect_desc = false; + if features_neg.contains(BlkFeature::VIRTIO_F_INDIRECT_DESC) { + indirect_desc = true; + } + if features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX) { + debug!("===__==="); + } + + let queue; + if features_neg.contains(BlkFeature::VIRTIO_F_RING_PACKED) { + queue = VirtQueue::Packedqueue(PackedQueue::::new( + &mut transport, + QUEUE, + indirect_desc, + )?); + } else { + queue = VirtQueue::Splitqueue(SplitQueue::::new( + &mut transport, + QUEUE, + indirect_desc, + )?); + } + + let blkinfos = { + let mut vec = Vec::::with_capacity(QUEUE_SIZE as usize); + vec.resize_with(QUEUE_SIZE as usize, || NULLINFO); + vec.into_boxed_slice() }; - info!("found a block device of size {}KB", capacity / 2); - let queue = VirtQueue::new(&mut transport, QUEUE)?; transport.finish_init(); Ok(VirtIOBlk { + config, transport, - queue, - capacity, - readonly, + features_neg, + inner: Arc::new(Mutex::new(VirtIoBlkInner { queue, blkinfos })), }) } + fn read_config(config: &NonNull) -> Blkconfiglocal { + Blkconfiglocal { + /// The capacity (in 512-byte sectors). + capacity: unsafe { + volread!(config, capacity_low) as u64 + | (volread!(config, capacity_high) as u64) << 32 + }, + + /// The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) + size_max: unsafe { volread!(config, size_max) }, + + /// The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) + seg_max: unsafe { volread!(config, seg_max) }, + + /// geometry of the device (if VIRTIO_BLK_F_GEOMETRY) + cylinders: unsafe { volread!(config, cylinders) }, + heads: unsafe { volread!(config, heads) }, + sectors: unsafe { volread!(config, sectors) }, + + /// block size of device (if VIRTIO_BLK_F_BLK_SIZE) + blk_size: unsafe { volread!(config, blk_size) }, + + /// the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY + /// exponent for physical block per logical block. + physical_block_exp: unsafe { volread!(config, physical_block_exp) }, + + /// alignment offset in logical blocks. + alignment_offset: unsafe { volread!(config, alignment_offset) }, + + /// minimum I/O size without performance penalty in logical blocks. + min_io_size: unsafe { volread!(config, min_io_size) }, + + /// optimal sustained I/O size in logical blocks. 
+ opt_io_size: unsafe { volread!(config, opt_io_size) }, + + /// writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) + wce: unsafe { volread!(config, wce) }, + + /// number of vqs, only available when VIRTIO_BLK_F_MQ is set + num_queues: unsafe { volread!(config, num_queues) }, + + /// the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD + /// + /// The maximum discard sectors (in 512-byte sectors) for + /// one segment. + max_discard_sectors: unsafe { volread!(config, max_discard_sectors) }, + + /// The maximum number of discard segments in a discard command. + max_discard_seg: unsafe { volread!(config, max_discard_seg) }, + + /// Discard commands must be aligned to this number of sectors. + discard_sector_alignment: unsafe { volread!(config, discard_sector_alignment) }, + + /// the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES. + /// The maximum number of write zeroes sectors (in 512-byte sectors) in one segment. + max_write_zeroes_sectors: unsafe { volread!(config, max_write_zeroes_sectors) }, + + /// The maximum number of segments in a write zeroes command. + max_write_zeroes_seg: unsafe { volread!(config, max_write_zeroes_seg) }, + + /// Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the deallocation of one or more of the sectors. + write_zeroes_may_unmap: unsafe { volread!(config, write_zeroes_may_unmap) }, + } + } + /// Gets the capacity of the block device, in 512 byte ([`SECTOR_SIZE`]) sectors. pub fn capacity(&self) -> u64 { - self.capacity + self.config.capacity } /// Returns true if the block device is read-only, or false if it allows writes. pub fn readonly(&self) -> bool { - self.readonly + self.features_neg.contains(BlkFeature::VIRTIO_BLK_F_RO) + } + + /// Return (max_discard_seg, max_discard_sectors) + pub fn discard_parameters(&self) -> (u32, u32) { + (self.config.max_discard_seg, self.config.max_discard_sectors) + } + + /// Return (max_write_zeroes_seg, max_write_zeroes_sectors) + pub fn writezeros_parameters(&self) -> (u32, u32) { + ( + self.config.max_write_zeroes_seg, + self.config.max_write_zeroes_sectors, + ) + } + + /// Support discard single range? + pub fn support_discard(&self) -> bool { + self.features_neg.contains(BlkFeature::VIRTIO_BLK_F_DISCARD) + } + + /// Support VIRTIO_BLK_F_WRITE_ZEROES ? + pub fn support_writezeros(&self) -> bool { + self.features_neg + .contains(BlkFeature::VIRTIO_BLK_F_WRITE_ZEROES) + } + + /// Support indirect dictionary + pub fn support_indirect(&self) -> bool { + self.features_neg + .contains(BlkFeature::VIRTIO_F_INDIRECT_DESC) + } + + /// Support event? + fn support_event(&self) -> bool { + self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX) } /// Acknowledges a pending interrupt, if any. @@ -96,218 +246,670 @@ impl VirtIOBlk { self.transport.ack_interrupt() } - /// Reads a block into the given buffer. + /// pop used + pub unsafe fn pop_used( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + self.inner.lock().queue.pop_used(token, inputs, outputs) + } + + /// pop used + // TODO: will be deleted in the further + pub unsafe fn pop_used_async( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + self.inner + .lock() + .queue + .pop_used_async(token, inputs, outputs) + } + + /// Returns the size of the device's VirtQueue. /// - /// Blocks until the read completes or there is an error. 
- pub fn read_block(&mut self, block_id: usize, buf: &mut [u8]) -> Result { - assert_eq!(buf.len(), SECTOR_SIZE); + /// This can be used to tell the caller how many channels to monitor on. + pub fn virt_queue_size(&self) -> u16 { + QUEUE_SIZE + } + + /// Flush + pub fn flush(&mut self) -> Result { + // assert_eq!(buf.len(), SECTOR_SIZE); + let req = BlkReq { + type_: ReqType::Flush, + reserved: 0, + sector: 0, + }; + let mut resp = BlkResp::default(); + let support_event = self.support_event(); + let mut inner = self.inner.lock(); + inner.queue.add_notify_wait_pop( + &[req.as_bytes()], + &mut [resp.as_bytes_mut()], + &mut self.transport, + support_event, + )?; + + resp.status.into() + } + + /// Submits a request to write **multiple** blocks, but returns immediately without waiting for the read to + /// complete. + pub fn write_blocks(&mut self, block_id: usize, bufs: &[&[u8]]) -> Result { + assert_eq!(self.readonly(), false); + + let req = BlkReq { + type_: ReqType::Out, + reserved: 0, + sector: block_id as u64, + }; + let mut resp = BlkResp::default(); + + let mut inputs = vec![req.as_bytes(); 1 + bufs.len()]; + let mut index = 1; + for x in bufs.iter() { + inputs[index] = *x; + index += 1; + } + let support_event = self.support_event(); + let mut inner = self.inner.lock(); + inner.queue.add_notify_wait_pop( + inputs.as_slice(), + &mut [resp.as_bytes_mut()], + &mut self.transport, + support_event, + )?; + resp.status.into() + } + + /// Submits a request to write **multiple** blocks, but returns immediately without waiting for the read to + /// complete. + pub fn read_blocks(&mut self, block_id: usize, bufs: &mut [&mut [u8]]) -> Result { + assert_eq!(self.readonly(), false); + let req = BlkReq { type_: ReqType::In, reserved: 0, sector: block_id as u64, }; let mut resp = BlkResp::default(); - self.queue.add_notify_wait_pop( + + let mut outputs: Vec<&mut [u8]> = Vec::new(); + for x in bufs.iter_mut() { + outputs.push(*x); + } + outputs.push(resp.as_bytes_mut()); + + let support_event = self.support_event(); + let mut inner = self.inner.lock(); + inner.queue.add_notify_wait_pop( + &[req.as_bytes()], + &mut outputs.as_mut_slice(), + &mut self.transport, + support_event, + )?; + resp.status.into() + } + + /// get the device id + pub fn get_device_id(&mut self, buf: &mut [u8]) -> Result { + let req = BlkReq { + type_: ReqType::GetID, + reserved: 0, + sector: 0, + }; + + let mut resp = BlkResp::default(); + let support_event = self.support_event(); + let mut inner = self.inner.lock(); + inner.queue.add_notify_wait_pop( &[req.as_bytes()], &mut [buf, resp.as_bytes_mut()], &mut self.transport, + support_event, + )?; + resp.status.into() + } + + /// discard nu_block blocks starting from start_block + pub fn discard_ranges(&mut self, start_sectors: &[u64], nr_sectors: &[u32]) -> Result { + let req = BlkReq { + type_: ReqType::Discard, + reserved: 0, + sector: 0, + }; + let unmap = false; + self.erase_ranges(&req, start_sectors, nr_sectors, unmap, true) + } + + /// wirtezeros nu_block blocks starting from start_block + pub fn writezeros_ranges(&mut self, start_sectors: &[u64], nr_sectors: &[u32]) -> Result { + let req = BlkReq { + type_: ReqType::WriteZeroes, + reserved: 0, + sector: 0, + }; + let unmap = false; + self.erase_ranges(&req, start_sectors, nr_sectors, unmap, false) + } + + /// erase nu_block blocks starting from start_block without blocking + fn erase_ranges( + &mut self, + req: &BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + unmap: bool, + is_discard: bool, + ) -> Result { + 
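        // Editor's note: `prepare_erase_ranges` packs each (start_sector, nr_sector) pair into
        // one `Range` segment, or two when a discard's sector count is not a multiple of
        // `discard_sector_alignment`; the segment array is then sent as the data buffer of a
        // single Discard/WriteZeroes request. Hedged usage sketch of the public wrapper,
        // assuming VIRTIO_BLK_F_DISCARD was negotiated:
        //
        //     if blk.support_discard() {
        //         // discard 8 sectors starting at sector 64 and 16 sectors starting at 256
        //         blk.discard_ranges(&[64, 256], &[8, 16])?;
        //     }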
assert_eq!(start_sectors.len(), nr_sectors.len()); + + let (input, nr_seg_used) = + self.prepare_erase_ranges(start_sectors, nr_sectors, unmap, is_discard); + + let input_f = unsafe { + core::slice::from_raw_parts( + input.as_ptr() as *const u8, + core::mem::size_of::() * nr_seg_used, + ) + }; + let support_event = self.support_event(); + let mut resp = BlkResp::default(); + let mut inner = self.inner.lock(); + + inner.queue.add_notify_wait_pop( + &[req.as_bytes(), input_f], + &mut [resp.as_bytes_mut()], + &mut self.transport, + support_event, )?; resp.status.into() } - /// Submits a request to read a block, but returns immediately without waiting for the read to + /// prepare for erase + fn prepare_erase_ranges( + &mut self, + start_sectors: &[u64], + nr_sectors: &[u32], + unmap: bool, + is_discard: bool, + ) -> (Vec, usize) { + assert_eq!(start_sectors.len(), nr_sectors.len()); + + let discard_sector_alignment = self.config.discard_sector_alignment; + let num_seg = start_sectors.len(); + let mut input = vec![ + Range { + sector: 1, + num_sector: 1, + flags: 0 + }; + num_seg * 2 + ]; + + let mut nr_seg_used = 0; + let mut start_sectors = start_sectors.iter(); + for nr_sector in nr_sectors.iter() { + let start_sector = *start_sectors.next().unwrap(); + let flag = match unmap { + true => 1, + false => 0, + }; + if is_discard && *nr_sector % discard_sector_alignment != 0 { + let nr_first_sector = nr_sector % discard_sector_alignment; + assert!((nr_sector - nr_first_sector) < self.config.max_discard_sectors); + input[nr_seg_used].flags = flag; + input[nr_seg_used].num_sector = nr_first_sector; + input[nr_seg_used].sector = start_sector; + nr_seg_used += 1; + input[nr_seg_used].flags = flag; + input[nr_seg_used].num_sector = nr_sector - nr_first_sector; + input[nr_seg_used].sector = start_sector + nr_first_sector as u64; + nr_seg_used += 1; + } else { + if is_discard { + assert!(*nr_sector < self.config.max_discard_sectors); + } else { + assert!(*nr_sector < self.config.max_write_zeroes_sectors); + } + input[nr_seg_used].flags = flag; + input[nr_seg_used].num_sector = *nr_sector; + input[nr_seg_used].sector = start_sector; + nr_seg_used += 1; + } + } + if is_discard { + assert!( + nr_seg_used <= self.config.max_discard_seg as usize, + "The device does not support two many discarded segments in a single request" + ) + } else { + assert!( + nr_seg_used <= self.config.max_write_zeroes_seg as usize, + "The device does not support two many writezeros segments in a single request" + ) + } + (input, nr_seg_used) + } +} + +impl VirtIOBlk { + /// Submits a request to write **multiple** blocks, but returns immediately without waiting for the read to /// complete. - /// - /// # Arguments - /// - /// * `block_id` - The identifier of the block to read. - /// * `req` - A buffer which the driver can use for the request to send to the device. The - /// contents don't matter as `read_block_nb` will initialise it, but like the other buffers it - /// needs to be valid (and not otherwise used) until the corresponding `complete_read_block` - /// call. - /// * `buf` - The buffer in memory into which the block should be read. - /// * `resp` - A mutable reference to a variable provided by the caller - /// to contain the status of the request. The caller can safely - /// read the variable only after the request is complete. - /// - /// # Usage - /// - /// It will submit request to the VirtIO block device and return a token identifying - /// the position of the first Descriptor in the chain. 
If there are not enough - /// Descriptors to allocate, then it returns [`Error::QueueFull`]. - /// - /// The caller can then call `peek_used` with the returned token to check whether the device has - /// finished handling the request. Once it has, the caller must call `complete_read_block` with - /// the same buffers before reading the response. - /// - /// ``` - /// # use virtio_drivers::{Error, Hal}; - /// # use virtio_drivers::device::blk::VirtIOBlk; - /// # use virtio_drivers::transport::Transport; - /// use virtio_drivers::device::blk::{BlkReq, BlkResp, RespStatus}; - /// - /// # fn example(blk: &mut VirtIOBlk) -> Result<(), Error> { - /// let mut request = BlkReq::default(); - /// let mut buffer = [0; 512]; - /// let mut response = BlkResp::default(); - /// let token = unsafe { blk.read_block_nb(42, &mut request, &mut buffer, &mut response) }?; - /// - /// // Wait for an interrupt to tell us that the request completed... - /// assert_eq!(blk.peek_used(), Some(token)); - /// - /// unsafe { - /// blk.complete_read_block(token, &request, &mut buffer, &mut response)?; - /// } - /// if response.status() == RespStatus::OK { - /// println!("Successfully read block."); - /// } else { - /// println!("Error {:?} reading block.", response.status()); - /// } - /// # Ok(()) - /// # } - /// ``` - /// - /// # Safety - /// - /// `req`, `buf` and `resp` are still borrowed by the underlying VirtIO block device even after - /// this method returns. Thus, it is the caller's responsibility to guarantee that they are not - /// accessed before the request is completed in order to avoid data races. - pub unsafe fn read_block_nb( + pub unsafe fn write_blocks_nb_sync( &mut self, block_id: usize, req: &mut BlkReq, - buf: &mut [u8], + bufs: &[&[u8]], resp: &mut BlkResp, ) -> Result { - assert_eq!(buf.len(), SECTOR_SIZE); + assert_eq!(self.readonly(), true); *req = BlkReq { - type_: ReqType::In, + type_: ReqType::Out, reserved: 0, sector: block_id as u64, }; - let token = self + + let mut inputs = vec![req.as_bytes(); 1 + bufs.len()]; + let mut index = 1; + for x in bufs.iter() { + inputs[index] = *x; + index += 1; + } + let mut inner = self.inner.lock(); + let token = inner .queue - .add(&[req.as_bytes()], &mut [buf, resp.as_bytes_mut()])?; - if self.queue.should_notify() { + .add(inputs.as_slice(), &mut [resp.as_bytes_mut()])?; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { self.transport.notify(QUEUE); } Ok(token) } - /// Completes a read operation which was started by `read_block_nb`. - /// - /// # Safety - /// - /// The same buffers must be passed in again as were passed to `read_block_nb` when it returned - /// the token. - pub unsafe fn complete_read_block( + /// Submits a request to write **multiple** blocks, but returns immediately without waiting for the read to + /// complete. 
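    ///
    /// (Illustrative usage sketch; `block_on` is a placeholder for whatever executor the caller
    /// uses, and the returned future only completes once something wakes the stored `Waker`,
    /// e.g. from an interrupt handler.)
    ///
    ///     let mut req = BlkReq::default();
    ///     let data = [0u8; 512];
    ///     let fut = unsafe { blk.write_blocks_nb_async(7, &mut req, &[&data]) };
    ///     block_on(fut)?;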
+ pub unsafe fn write_blocks_nb_async( &mut self, - token: u16, - req: &BlkReq, - buf: &mut [u8], - resp: &mut BlkResp, - ) -> Result<()> { - self.queue - .pop_used(token, &[req.as_bytes()], &mut [buf, resp.as_bytes_mut()])?; - resp.status.into() + block_id: usize, + req: &mut BlkReq, + bufs: &[&[u8]], + ) -> Pin>> { + assert_eq!(self.readonly(), true); + *req = BlkReq { + type_: ReqType::Out, + reserved: 0, + sector: block_id as u64, + }; + + let mut inputs = vec![req.as_bytes(); 1 + bufs.len()]; + let mut index = 1; + for x in bufs.iter() { + inputs[index] = *x; + index += 1; + } + let mut inner = self.inner.lock(); + let mut future = Box::pin(BlkFuture::new(self.inner.clone())); + + match inner + .queue + .add(inputs.as_slice(), &mut [future.resp.as_bytes_mut()]) + { + Ok(n) => { + future.head = n; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + } + Err(e) => future.err = Some(e), + } + + future } - /// Writes the contents of the given buffer to a block. - /// - /// Blocks until the write is complete or there is an error. - pub fn write_block(&mut self, block_id: usize, buf: &[u8]) -> Result { - assert_eq!(buf.len(), SECTOR_SIZE); + /// Submits a request to write **multiple** blocks, but returns immediately without waiting for the read to + /// complete. + pub unsafe fn read_blocks_nb_async( + &mut self, + block_id: usize, + bufs: &mut [&mut [u8]], + ) -> Pin>> { + assert_eq!(self.readonly(), false); + let req = BlkReq { - type_: ReqType::Out, + type_: ReqType::In, reserved: 0, sector: block_id as u64, }; - let mut resp = BlkResp::default(); - self.queue.add_notify_wait_pop( - &[req.as_bytes(), buf], - &mut [resp.as_bytes_mut()], - &mut self.transport, - )?; - resp.status.into() + let mut future = Box::pin(BlkFuture::new(self.inner.clone())); + + let mut outputs: Vec<&mut [u8]> = Vec::new(); + for x in bufs.iter_mut() { + outputs.push(*x); + } + outputs.push(future.resp.as_bytes_mut()); + + let mut inner = self.inner.lock(); + match inner + .queue + .add(&[req.as_bytes()], &mut outputs.as_mut_slice()) + { + Ok(n) => { + future.head = n; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + } + Err(e) => future.err = Some(e), + } + future } - /// Submits a request to write a block, but returns immediately without waiting for the write to + /// Submits a request to write **multiple** blocks, but returns immediately without waiting for the read to /// complete. - /// - /// # Arguments - /// - /// * `block_id` - The identifier of the block to write. - /// * `req` - A buffer which the driver can use for the request to send to the device. The - /// contents don't matter as `read_block_nb` will initialise it, but like the other buffers it - /// needs to be valid (and not otherwise used) until the corresponding `complete_read_block` - /// call. - /// * `buf` - The buffer in memory containing the data to write to the block. - /// * `resp` - A mutable reference to a variable provided by the caller - /// to contain the status of the request. The caller can safely - /// read the variable only after the request is complete. - /// - /// # Usage - /// - /// See [VirtIOBlk::read_block_nb]. - /// - /// # Safety - /// - /// See [VirtIOBlk::read_block_nb]. 
- pub unsafe fn write_block_nb( + pub unsafe fn read_blocks_nb_sync( &mut self, block_id: usize, - req: &mut BlkReq, - buf: &[u8], + bufs: &mut [&mut [u8]], resp: &mut BlkResp, ) -> Result { - assert_eq!(buf.len(), SECTOR_SIZE); - *req = BlkReq { - type_: ReqType::Out, + assert_eq!(self.readonly(), false); + + let req = BlkReq { + type_: ReqType::In, reserved: 0, sector: block_id as u64, }; - let token = self + + let mut outputs: Vec<&mut [u8]> = Vec::new(); + for x in bufs.iter_mut() { + outputs.push(*x); + } + outputs.push(resp.as_bytes_mut()); + + let mut inner = self.inner.lock(); + let token = inner .queue - .add(&[req.as_bytes(), buf], &mut [resp.as_bytes_mut()])?; - if self.queue.should_notify() { + .add(&[req.as_bytes()], &mut outputs.as_mut_slice())?; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { self.transport.notify(QUEUE); } Ok(token) } - /// Completes a write operation which was started by `write_block_nb`. - /// - /// # Safety - /// - /// The same buffers must be passed in again as were passed to `write_block_nb` when it returned - /// the token. - pub unsafe fn complete_write_block( + /// discard nu_block blocks starting from start_block + pub unsafe fn discard_ranges_nb_sync( &mut self, - token: u16, - req: &BlkReq, - buf: &[u8], + req: &mut BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + resp: &mut BlkResp, + ) -> Result { + *req = BlkReq { + type_: ReqType::Discard, + reserved: 0, + sector: 0, + }; + let unmap = false; + self.erase_ranges_nb_sync(req, start_sectors, nr_sectors, unmap, resp, true) + } + + /// discard nu_block blocks starting from start_block + pub unsafe fn discard_ranges_nb_async( + &mut self, + req: &mut BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + ) -> Pin>> { + *req = BlkReq { + type_: ReqType::Discard, + reserved: 0, + sector: 0, + }; + let unmap = false; + self.erase_ranges_nb_async(req, start_sectors, nr_sectors, unmap, true) + } + + /// discard nu_block blocks starting from start_block + pub unsafe fn writezeros_ranges_nb_sync( + &mut self, + req: &mut BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + resp: &mut BlkResp, + ) -> Result { + *req = BlkReq { + type_: ReqType::WriteZeroes, + reserved: 0, + sector: 0, + }; + let unmap = false; + self.erase_ranges_nb_sync(req, start_sectors, nr_sectors, unmap, resp, false) + } + /// discard nu_block blocks starting from start_block + pub unsafe fn writezeros_ranges_nb_async( + &mut self, + req: &mut BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + ) -> Pin>> { + *req = BlkReq { + type_: ReqType::WriteZeroes, + reserved: 0, + sector: 0, + }; + let unmap = false; + self.erase_ranges_nb_async(req, start_sectors, nr_sectors, unmap, false) + } + + /// get the device id + pub unsafe fn get_device_id_nb_sync( + &mut self, + req: &mut BlkReq, + buf: &mut [u8], resp: &mut BlkResp, - ) -> Result<()> { - self.queue - .pop_used(token, &[req.as_bytes(), buf], &mut [resp.as_bytes_mut()])?; + ) -> Result { + *req = BlkReq { + type_: ReqType::GetID, + reserved: 0, + sector: 0, + }; + + let mut inner = self.inner.lock(); + let token = inner + .queue + .add(&[req.as_bytes()], &mut [buf, resp.as_bytes_mut()])?; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + Ok(token) + } + + /// get the device id + pub unsafe fn get_device_id_nb_async( + &mut self, + req: &mut BlkReq, + buf: &mut [u8], + ) -> Pin>> { + *req = BlkReq { + type_: ReqType::GetID, + 
reserved: 0, + sector: 0, + }; + + let mut inner = self.inner.lock(); + let mut future = Box::pin(BlkFuture::new(self.inner.clone())); + + match inner + .queue + .add(&[req.as_bytes()], &mut [buf, future.resp.as_bytes_mut()]) + { + Ok(n) => { + future.head = n; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + } + Err(e) => future.err = Some(e), + } + future + } + + /// Flush + pub unsafe fn flush_nb_async(&mut self) -> Pin>> { + // assert_eq!(buf.len(), SECTOR_SIZE); + let req = BlkReq { + type_: ReqType::Flush, + reserved: 0, + sector: 0, + }; + + let mut inner = self.inner.lock(); + let mut future = Box::pin(BlkFuture::new(self.inner.clone())); + + match inner + .queue + .add(&[req.as_bytes()], &mut [future.resp.as_bytes_mut()]) + { + Ok(n) => { + future.head = n; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + } + Err(e) => future.err = Some(e), + } + + future + } + + /// Flush + pub unsafe fn flush_nb_sync(&mut self) -> Result { + // assert_eq!(buf.len(), SECTOR_SIZE); + let req = BlkReq { + type_: ReqType::Flush, + reserved: 0, + sector: 0, + }; + let mut resp = BlkResp::default(); + let mut inner = self.inner.lock(); + + inner + .queue + .add(&[req.as_bytes()], &mut [resp.as_bytes_mut()])?; + + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + resp.status.into() } - /// Fetches the token of the next completed request from the used ring and returns it, without - /// removing it from the used ring. If there are no pending completed requests returns `None`. - pub fn peek_used(&mut self) -> Option { - self.queue.peek_used() + /// erase nu_block blocks starting from start_block without blocking + unsafe fn erase_ranges_nb_sync( + &mut self, + req: &BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + unmap: bool, + resp: &mut BlkResp, + is_discard: bool, + ) -> Result { + assert_eq!(start_sectors.len(), nr_sectors.len()); + + let (input, nr_seg_used) = + self.prepare_erase_ranges(start_sectors, nr_sectors, unmap, is_discard); + + let input_f = unsafe { + core::slice::from_raw_parts( + input.as_ptr() as *const u8, + core::mem::size_of::() * nr_seg_used, + ) + }; + let mut inner = self.inner.lock(); + let token = inner + .queue + .add(&[req.as_bytes(), input_f], &mut [resp.as_bytes_mut()])?; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + Ok(token) } - /// Returns the size of the device's VirtQueue. - /// - /// This can be used to tell the caller how many channels to monitor on. 
- pub fn virt_queue_size(&self) -> u16 { - QUEUE_SIZE + /// erase nu_block blocks starting from start_block without blocking + unsafe fn erase_ranges_nb_async( + &mut self, + req: &BlkReq, + start_sectors: &[u64], + nr_sectors: &[u32], + unmap: bool, + is_discard: bool, + ) -> Pin>> { + assert_eq!(start_sectors.len(), nr_sectors.len()); + + let (input, nr_seg_used) = + self.prepare_erase_ranges(start_sectors, nr_sectors, unmap, is_discard); + + let input_f = unsafe { + core::slice::from_raw_parts( + input.as_ptr() as *const u8, + core::mem::size_of::() * nr_seg_used, + ) + }; + let mut inner = self.inner.lock(); + let mut future = Box::pin(BlkFuture::new(self.inner.clone())); + + match inner.queue.add( + &[req.as_bytes(), input_f], + &mut [future.resp.as_bytes_mut()], + ) { + Ok(n) => { + future.head = n; + if inner + .queue + .should_notify(self.features_neg.contains(BlkFeature::VIRTIO_F_EVENT_IDX)) + { + self.transport.notify(QUEUE); + } + } + Err(e) => future.err = Some(e), + } + future } } -impl Drop for VirtIOBlk { +impl<'a, H: Hal, T: Transport> Drop for VirtIOBlk { fn drop(&mut self) { // Clear any pointers pointing to DMA regions, so the device doesn't try to access them // after they have been freed. @@ -315,30 +917,142 @@ impl Drop for VirtIOBlk { } } +/// +/// Ref: linux kernel (virtio_blk.h: virtio_blk_config) #[repr(C)] struct BlkConfig { - /// Number of 512 Bytes sectors + /// The capacity (in 512-byte sectors). capacity_low: Volatile, capacity_high: Volatile, + + /// The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) size_max: Volatile, + + /// The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) seg_max: Volatile, + + /// geometry of the device (if VIRTIO_BLK_F_GEOMETRY) cylinders: Volatile, heads: Volatile, sectors: Volatile, + + /// block size of device (if VIRTIO_BLK_F_BLK_SIZE) blk_size: Volatile, + + /// the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY + /// exponent for physical block per logical block. physical_block_exp: Volatile, + + /// alignment offset in logical blocks. alignment_offset: Volatile, + + /// minimum I/O size without performance penalty in logical blocks. min_io_size: Volatile, + + /// optimal sustained I/O size in logical blocks. opt_io_size: Volatile, - // ... ignored + + /// writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) + wce: Volatile, + unused: Volatile, + + /// number of vqs, only available when VIRTIO_BLK_F_MQ is set + num_queues: Volatile, + + /// the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD + /// + /// The maximum discard sectors (in 512-byte sectors) for + /// one segment. + max_discard_sectors: Volatile, + + /// The maximum number of discard segments in a discard command. + max_discard_seg: Volatile, + + /// Discard commands must be aligned to this number of sectors. + discard_sector_alignment: Volatile, + + /// the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES. + /// The maximum number of write zeroes sectors (in 512-byte sectors) in one segment. + max_write_zeroes_sectors: Volatile, + + /// The maximum number of segments in a write zeroes command. + max_write_zeroes_seg: Volatile, + + /// Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the deallocation of one or more of the sectors. + write_zeroes_may_unmap: Volatile, + + unused1: [Volatile; 3], } +struct Blkconfiglocal { + /// The capacity (in 512-byte sectors). 
+ capacity: u64, + + /// The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) + size_max: u32, + + /// The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) + seg_max: u32, + + /// geometry of the device (if VIRTIO_BLK_F_GEOMETRY) + cylinders: u16, + heads: u8, + sectors: u8, + + /// block size of device (if VIRTIO_BLK_F_BLK_SIZE) + blk_size: u32, + + /// the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY + /// exponent for physical block per logical block. + physical_block_exp: u8, + /// alignment offset in logical blocks. + alignment_offset: u8, + + /// minimum I/O size without performance penalty in logical blocks. + min_io_size: u16, + + /// optimal sustained I/O size in logical blocks. + opt_io_size: u32, + + /// writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) + wce: u8, + + /// number of vqs, only available when VIRTIO_BLK_F_MQ is set + num_queues: u16, + + /// the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD + /// + /// The maximum discard sectors (in 512-byte sectors) for + /// one segment. + max_discard_sectors: u32, + + /// The maximum number of discard segments in a discard command. + max_discard_seg: u32, + + /// Discard commands must be aligned to this number of sectors. + discard_sector_alignment: u32, + + /// the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES. + /// The maximum number of write zeroes sectors (in 512-byte sectors) in one segment. + max_write_zeroes_sectors: u32, + + /// The maximum number of segments in a write zeroes command. + max_write_zeroes_seg: u32, + + /// Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the deallocation of one or more of the sectors. + write_zeroes_may_unmap: u8, +} /// A VirtIO block device request. +/// +/// Ref: virtio_blk.c virtio_blk_outhdr #[repr(C)] #[derive(AsBytes, Debug)] pub struct BlkReq { + /// VIRTIO_BLK_T* type_: ReqType, + /// io priority. reserved: u32, + /// Sector (ie. 512 byte offset) sector: u64, } @@ -372,13 +1086,16 @@ enum ReqType { In = 0, Out = 1, Flush = 4, + GetID = 8, + GetLifeTime = 10, Discard = 11, WriteZeroes = 13, + SecureErase = 14, } /// Status of a VirtIOBlk request. -#[repr(transparent)] #[derive(AsBytes, Copy, Clone, Debug, Eq, FromBytes, PartialEq)] +#[repr(transparent)] pub struct RespStatus(u8); impl RespStatus { @@ -417,55 +1134,135 @@ impl Default for BlkResp { pub const SECTOR_SIZE: usize = 512; bitflags! { - #[derive(Copy, Clone, Debug, Default, Eq, PartialEq)] struct BlkFeature: u64 { /// Device supports request barriers. (legacy) - const BARRIER = 1 << 0; + const VIRTIO_BLK_F_BARRIER = 1 << 0; /// Maximum size of any single segment is in `size_max`. - const SIZE_MAX = 1 << 1; + const VIRTIO_BLK_F_SIZE_MAX = 1 << 1; /// Maximum number of segments in a request is in `seg_max`. - const SEG_MAX = 1 << 2; + const VIRTIO_BLK_F_SEG_MAX = 1 << 2; /// Disk-style geometry specified in geometry. - const GEOMETRY = 1 << 4; + const VIRTIO_BLK_F_GEOMETRY = 1 << 4; /// Device is read-only. - const RO = 1 << 5; + const VIRTIO_BLK_F_RO = 1 << 5; /// Block size of disk is in `blk_size`. - const BLK_SIZE = 1 << 6; + const VIRTIO_BLK_F_BLK_SIZE = 1 << 6; /// Device supports scsi packet commands. (legacy) - const SCSI = 1 << 7; + const VIRTIO_BLK_F_SCSI = 1 << 7; /// Cache flush command support. - const FLUSH = 1 << 9; + const VIRTIO_BLK_F_FLUSH = 1 << 9; /// Device exports information on optimal I/O alignment. - const TOPOLOGY = 1 << 10; + const VIRTIO_BLK_F_TOPOLOGY = 1 << 10; /// Device can toggle its cache between writeback and writethrough modes. 
- const CONFIG_WCE = 1 << 11; + const VIRTIO_BLK_F_CONFIG_WCE = 1 << 11; /// Device can support discard command, maximum discard sectors size in /// `max_discard_sectors` and maximum discard segment number in /// `max_discard_seg`. - const DISCARD = 1 << 13; + const VIRTIO_BLK_F_DISCARD = 1 << 13; /// Device can support write zeroes command, maximum write zeroes sectors /// size in `max_write_zeroes_sectors` and maximum write zeroes segment /// number in `max_write_zeroes_seg`. - const WRITE_ZEROES = 1 << 14; + const VIRTIO_BLK_F_WRITE_ZEROES = 1 << 14; // device independent - const NOTIFY_ON_EMPTY = 1 << 24; // legacy - const ANY_LAYOUT = 1 << 27; // legacy - const RING_INDIRECT_DESC = 1 << 28; - const RING_EVENT_IDX = 1 << 29; - const UNUSED = 1 << 30; // legacy - const VERSION_1 = 1 << 32; // detect legacy + const VIRTIO_F_NOTIFY_ON_EMPTY = 1 << 24; // legacy + const VIRTIO_F_ANY_LAYOUT = 1 << 27; // legacy + const VIRTIO_F_INDIRECT_DESC = 1 << 28; + const VIRTIO_F_EVENT_IDX = 1 << 29; + const UNUSED = 1 << 30; // legacy + const VIRTIO_F_VERSION_1 = 1 << 32; // detect legacy // the following since virtio v1.1 - const ACCESS_PLATFORM = 1 << 33; - const RING_PACKED = 1 << 34; - const IN_ORDER = 1 << 35; - const ORDER_PLATFORM = 1 << 36; - const SR_IOV = 1 << 37; - const NOTIFICATION_DATA = 1 << 38; + const VIRTIO_F_ACCESS_PLATFORM = 1 << 33; + const VIRTIO_F_RING_PACKED = 1 << 34; + const VIRTIO_F_IN_ORDER = 1 << 35; + const VIRTIO_F_ORDER_PLATFORM = 1 << 36; + const VIRTIO_F_SR_IOV = 1 << 37; + const VIRTIO_F_NOTIFICATION_DATA = 1 << 38; + } +} +/// used for discard +pub struct Range { + /// discard/write zeroes start sector + sector: u64, + /// number of discard/write zeroes sectors + num_sector: u32, + /// flags for this range + flags: u32, +} +impl Copy for Range {} + +impl Clone for Range { + fn clone(&self) -> Range { + Range { + sector: self.sector, + num_sector: self.num_sector, + flags: self.flags, + } } } +const SECTOR_SHIFT: usize = 9; + +// +++++++++++++++++++++++++++++++++++++++ +// Async IO +// +++++++++++++++++++++++++++++++++++++++ + +#[repr(C)] +#[derive(Debug)] +struct BlkInfo { + waker: Option, +} + +const NULLINFO: BlkInfo = BlkInfo::new(); + +impl BlkInfo { + const fn new() -> Self { + BlkInfo { waker: None } + } +} + +/// for async IO +pub struct BlkFuture { + resp: BlkResp, + head: u16, + queue: Arc>>, + err: Option, +} + +impl<'a, H: Hal> BlkFuture { + /// construct a new BlkFuture + fn new(queue: Arc>>) -> Self { + Self { + resp: BlkResp::default(), + head: u16::MAX, + queue: queue, + err: None, + } + } +} + +impl Future for BlkFuture { + type Output = Result; + fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll { + if let Some(e) = self.err { + return Poll::Ready(Err(e)); + } + match unsafe { core::ptr::read_volatile(&self.resp.status) } { + RespStatus::OK => Poll::Ready(Ok(())), + RespStatus::NOT_READY => { + self.queue.lock().blkinfos[self.head as usize].waker = Some(cx.waker().clone()); + Poll::Pending + } + _ => Poll::Ready(Err(Error::IoError)), + } + } +} + +// +++++++++++++++++++++++++++++++++++++++ +// test +// +++++++++++++++++++++++++++++++++++++++ + #[cfg(test)] mod tests { use super::*; @@ -495,6 +1292,16 @@ mod tests { alignment_offset: Volatile::new(0), min_io_size: Volatile::new(0), opt_io_size: Volatile::new(0), + wce: Volatile::new(0), + unused: Volatile::new(0), + max_discard_sectors: Volatile::new(0), + num_queues: Volatile::new(0), + max_discard_seg: Volatile::new(0), + max_write_zeroes_sectors: Volatile::new(0), + max_write_zeroes_seg: 
Volatile::new(0), + write_zeroes_may_unmap: Volatile::new(0), + discard_sector_alignment: Volatile::new(0), + unused1: [Volatile::new(0), Volatile::new(0), Volatile::new(0)], }; let state = Arc::new(Mutex::new(State { status: DeviceStatus::empty(), @@ -506,11 +1313,11 @@ mod tests { let transport = FakeTransport { device_type: DeviceType::Console, max_queue_size: QUEUE_SIZE.into(), - device_features: BlkFeature::RO.bits(), + device_features: BlkFeature::VIRTIO_BLK_F_RO.bits(), config_space: NonNull::from(&mut config_space), state: state.clone(), }; - let blk = VirtIOBlk::>::new(transport).unwrap(); + let blk = VirtIOBlk::>::new(transport, false).unwrap(); assert_eq!(blk.capacity(), 0x02_0000_0042); assert_eq!(blk.readonly(), true); @@ -531,6 +1338,16 @@ mod tests { alignment_offset: Volatile::new(0), min_io_size: Volatile::new(0), opt_io_size: Volatile::new(0), + wce: Volatile::new(0), + unused: Volatile::new(0), + max_discard_sectors: Volatile::new(0), + num_queues: Volatile::new(0), + max_discard_seg: Volatile::new(0), + max_write_zeroes_sectors: Volatile::new(0), + max_write_zeroes_seg: Volatile::new(0), + write_zeroes_may_unmap: Volatile::new(0), + discard_sector_alignment: Volatile::new(0), + unused1: [Volatile::new(0), Volatile::new(0), Volatile::new(0)], }; let state = Arc::new(Mutex::new(State { status: DeviceStatus::empty(), @@ -546,7 +1363,8 @@ mod tests { config_space: NonNull::from(&mut config_space), state: state.clone(), }; - let mut blk = VirtIOBlk::>::new(transport).unwrap(); + let mut blk = + VirtIOBlk::>::new(transport, false).unwrap(); // Start a thread to simulate the device waiting for a read request. let handle = thread::spawn(move || { @@ -583,7 +1401,7 @@ mod tests { // Read a block from the device. let mut buffer = [0; 512]; - blk.read_block(42, &mut buffer).unwrap(); + blk.read_blocks(42, &mut [&mut buffer]).unwrap(); assert_eq!(&buffer[0..9], b"Test data"); handle.join().unwrap(); @@ -604,6 +1422,16 @@ mod tests { alignment_offset: Volatile::new(0), min_io_size: Volatile::new(0), opt_io_size: Volatile::new(0), + wce: Volatile::new(0), + unused: Volatile::new(0), + max_discard_sectors: Volatile::new(0), + num_queues: Volatile::new(0), + max_discard_seg: Volatile::new(0), + max_write_zeroes_sectors: Volatile::new(0), + max_write_zeroes_seg: Volatile::new(0), + write_zeroes_may_unmap: Volatile::new(0), + discard_sector_alignment: Volatile::new(0), + unused1: [Volatile::new(0), Volatile::new(0), Volatile::new(0)], }; let state = Arc::new(Mutex::new(State { status: DeviceStatus::empty(), @@ -619,7 +1447,8 @@ mod tests { config_space: NonNull::from(&mut config_space), state: state.clone(), }; - let mut blk = VirtIOBlk::>::new(transport).unwrap(); + let mut blk = + VirtIOBlk::>::new(transport, false).unwrap(); // Start a thread to simulate the device waiting for a write request. let handle = thread::spawn(move || { @@ -659,7 +1488,7 @@ mod tests { // Write a block to the device. let mut buffer = [0; 512]; buffer[0..9].copy_from_slice(b"Test data"); - blk.write_block(42, &mut buffer).unwrap(); + blk.write_blocks(42, &[&buffer]).unwrap(); handle.join().unwrap(); } diff --git a/src/device/console.rs b/src/device/console.rs index 7d3c7d4e..95f3cc41 100644 --- a/src/device/console.rs +++ b/src/device/console.rs @@ -1,7 +1,7 @@ //! Driver for VirtIO console devices. 
use crate::hal::Hal; -use crate::queue::VirtQueue; +use crate::queue::split_queue::SplitQueue; use crate::transport::Transport; use crate::volatile::{volread, ReadOnly, WriteOnly}; use crate::{Result, PAGE_SIZE}; @@ -42,8 +42,8 @@ const QUEUE_SIZE: usize = 2; pub struct VirtIOConsole { transport: T, config_space: NonNull, - receiveq: VirtQueue, - transmitq: VirtQueue, + receiveq: SplitQueue, + transmitq: SplitQueue, queue_buf_rx: Box<[u8; PAGE_SIZE]>, cursor: usize, pending_len: usize, @@ -65,6 +65,9 @@ pub struct ConsoleInfo { impl VirtIOConsole { /// Creates a new VirtIO console driver. pub fn new(mut transport: T) -> Result { + // TODO: If the device does not support indirect descriptors, this parameter will be set to false, and vice versa. + let indirect_desc = false; + transport.begin_init(|features| { let features = Features::from_bits_truncate(features); info!("Device features {:?}", features); @@ -72,8 +75,8 @@ impl VirtIOConsole { (features & supported_features).bits() }); let config_space = transport.config_space::()?; - let receiveq = VirtQueue::new(&mut transport, QUEUE_RECEIVEQ_PORT_0)?; - let transmitq = VirtQueue::new(&mut transport, QUEUE_TRANSMITQ_PORT_0)?; + let receiveq = SplitQueue::new(&mut transport, QUEUE_RECEIVEQ_PORT_0, indirect_desc)?; + let transmitq = SplitQueue::new(&mut transport, QUEUE_TRANSMITQ_PORT_0, indirect_desc)?; // Safe because no alignment or initialisation is required for [u8], the DMA buffer is // dereferenceable, and the lifetime of the reference matches the lifetime of the DMA buffer @@ -120,7 +123,7 @@ impl VirtIOConsole { self.receiveq .add(&[], &mut [self.queue_buf_rx.as_mut_slice()]) }?); - if self.receiveq.should_notify() { + if self.receiveq.should_notify(false) { self.transport.notify(QUEUE_RECEIVEQ_PORT_0); } } @@ -187,7 +190,7 @@ impl VirtIOConsole { pub fn send(&mut self, chr: u8) -> Result<()> { let buf: [u8; 1] = [chr]; self.transmitq - .add_notify_wait_pop(&[&buf], &mut [], &mut self.transport)?; + .add_notify_wait_pop(&[&buf], &mut [], &mut self.transport, false)?; Ok(()) } } diff --git a/src/device/gpu.rs b/src/device/gpu.rs index 43e1b76d..cfb3bf2f 100644 --- a/src/device/gpu.rs +++ b/src/device/gpu.rs @@ -1,7 +1,7 @@ //! Driver for VirtIO GPU devices. use crate::hal::{BufferDirection, Dma, Hal}; -use crate::queue::VirtQueue; +use crate::queue::split_queue::SplitQueue; use crate::transport::Transport; use crate::volatile::{volread, ReadOnly, Volatile, WriteOnly}; use crate::{pages, Error, Result, PAGE_SIZE}; @@ -27,9 +27,9 @@ pub struct VirtIOGpu { /// DMA area of cursor image buffer. cursor_buffer_dma: Option>, /// Queue for sending control commands. - control_queue: VirtQueue, + control_queue: SplitQueue, /// Queue for sending cursor commands. - cursor_queue: VirtQueue, + cursor_queue: SplitQueue, /// Send buffer for queue. queue_buf_send: Box<[u8]>, /// Recv buffer for queue. @@ -39,6 +39,9 @@ pub struct VirtIOGpu { impl VirtIOGpu { /// Create a new VirtIO-Gpu driver. 
pub fn new(mut transport: T) -> Result { + // TODO: + let indirect_desc = false; + transport.begin_init(|features| { let features = Features::from_bits_truncate(features); info!("Device features {:?}", features); @@ -57,8 +60,8 @@ impl VirtIOGpu { ); } - let control_queue = VirtQueue::new(&mut transport, QUEUE_TRANSMIT)?; - let cursor_queue = VirtQueue::new(&mut transport, QUEUE_CURSOR)?; + let control_queue = SplitQueue::new(&mut transport, QUEUE_TRANSMIT, indirect_desc)?; + let cursor_queue = SplitQueue::new(&mut transport, QUEUE_CURSOR, indirect_desc)?; let queue_buf_send = FromBytes::new_box_slice_zeroed(PAGE_SIZE); let queue_buf_recv = FromBytes::new_box_slice_zeroed(PAGE_SIZE); @@ -173,6 +176,7 @@ impl VirtIOGpu { &[&self.queue_buf_send], &mut [&mut self.queue_buf_recv], &mut self.transport, + false, )?; Ok(Rsp::read_from_prefix(&*self.queue_buf_recv).unwrap()) } @@ -184,6 +188,7 @@ impl VirtIOGpu { &[&self.queue_buf_send], &mut [], &mut self.transport, + false, )?; Ok(()) } diff --git a/src/device/input.rs b/src/device/input.rs index dee2fec9..92f5ef16 100644 --- a/src/device/input.rs +++ b/src/device/input.rs @@ -2,7 +2,7 @@ use super::common::Feature; use crate::hal::Hal; -use crate::queue::VirtQueue; +use crate::queue::split_queue::SplitQueue; use crate::transport::Transport; use crate::volatile::{volread, volwrite, ReadOnly, WriteOnly}; use crate::Result; @@ -18,8 +18,8 @@ use zerocopy::{AsBytes, FromBytes}; /// making pass-through implementations on top of evdev easy. pub struct VirtIOInput { transport: T, - event_queue: VirtQueue, - status_queue: VirtQueue, + event_queue: SplitQueue, + status_queue: SplitQueue, event_buf: Box<[InputEvent; 32]>, config: NonNull, } @@ -27,6 +27,9 @@ pub struct VirtIOInput { impl VirtIOInput { /// Create a new VirtIO-Input driver. pub fn new(mut transport: T) -> Result { + // TODO: + let indirect_desc = false; + let mut event_buf = Box::new([InputEvent::default(); QUEUE_SIZE]); transport.begin_init(|features| { let features = Feature::from_bits_truncate(features); @@ -38,14 +41,14 @@ impl VirtIOInput { let config = transport.config_space::()?; - let mut event_queue = VirtQueue::new(&mut transport, QUEUE_EVENT)?; - let status_queue = VirtQueue::new(&mut transport, QUEUE_STATUS)?; + let mut event_queue = SplitQueue::new(&mut transport, QUEUE_EVENT, indirect_desc)?; + let status_queue = SplitQueue::new(&mut transport, QUEUE_STATUS, indirect_desc)?; for (i, event) in event_buf.as_mut().iter_mut().enumerate() { // Safe because the buffer lasts as long as the queue. let token = unsafe { event_queue.add(&[], &mut [event.as_bytes_mut()])? }; assert_eq!(token, i as u16); } - if event_queue.should_notify() { + if event_queue.should_notify(false) { transport.notify(QUEUE_EVENT); } @@ -85,7 +88,7 @@ impl VirtIOInput { // the list of free descriptors in the queue, so `add` reuses the descriptor which // was just freed by `pop_used`. assert_eq!(new_token, token); - if self.event_queue.should_notify() { + if self.event_queue.should_notify(false) { self.transport.notify(QUEUE_EVENT); } return Some(event_saved); diff --git a/src/device/net.rs b/src/device/net.rs index b9419e72..3a5354bb 100644 --- a/src/device/net.rs +++ b/src/device/net.rs @@ -1,7 +1,7 @@ //! Driver for VirtIO network devices. 
use crate::hal::Hal; -use crate::queue::VirtQueue; +use crate::queue::split_queue::SplitQueue; use crate::transport::Transport; use crate::volatile::{volread, ReadOnly}; use crate::{Error, Result}; @@ -104,14 +104,17 @@ impl RxBuffer { pub struct VirtIONet { transport: T, mac: EthernetAddress, - recv_queue: VirtQueue, - send_queue: VirtQueue, + recv_queue: SplitQueue, + send_queue: SplitQueue, rx_buffers: [Option; QUEUE_SIZE], } impl VirtIONet { /// Create a new VirtIO-Net driver. pub fn new(mut transport: T, buf_len: usize) -> Result { + // TODO: + let indirect_desc = false; + transport.begin_init(|features| { let features = Features::from_bits_truncate(features); info!("Device features {:?}", features); @@ -139,8 +142,8 @@ impl VirtIONet return Err(Error::InvalidParam); } - let send_queue = VirtQueue::new(&mut transport, QUEUE_TRANSMIT)?; - let mut recv_queue = VirtQueue::new(&mut transport, QUEUE_RECEIVE)?; + let send_queue = SplitQueue::new(&mut transport, QUEUE_TRANSMIT, indirect_desc)?; + let mut recv_queue = SplitQueue::new(&mut transport, QUEUE_RECEIVE, indirect_desc)?; const NONE_BUF: Option = None; let mut rx_buffers = [NONE_BUF; QUEUE_SIZE]; @@ -152,7 +155,7 @@ impl VirtIONet *rx_buf_place = Some(rx_buf); } - if recv_queue.should_notify() { + if recv_queue.should_notify(false) { transport.notify(QUEUE_RECEIVE); } @@ -228,7 +231,7 @@ impl VirtIONet } rx_buf.idx = new_token; self.rx_buffers[new_token as usize] = Some(rx_buf); - if self.recv_queue.should_notify() { + if self.recv_queue.should_notify(false) { self.transport.notify(QUEUE_RECEIVE); } Ok(()) @@ -247,6 +250,7 @@ impl VirtIONet &[header.as_bytes(), tx_buf.packet()], &mut [], &mut self.transport, + false, )?; Ok(()) } @@ -378,7 +382,6 @@ bitflags! { const RSC_INFO = 4; } } - #[repr(transparent)] #[derive(AsBytes, Debug, Copy, Clone, Default, Eq, FromBytes, PartialEq)] struct GsoType(u8); diff --git a/src/device/socket/vsock.rs b/src/device/socket/vsock.rs index 523930e8..acaf5376 100644 --- a/src/device/socket/vsock.rs +++ b/src/device/socket/vsock.rs @@ -4,7 +4,7 @@ use super::error::SocketError; use super::protocol::{Feature, VirtioVsockConfig, VirtioVsockHdr, VirtioVsockOp, VsockAddr}; use crate::hal::Hal; -use crate::queue::VirtQueue; +use crate::queue::split_queue::SplitQueue; use crate::transport::Transport; use crate::volatile::volread; use crate::{Error, Result}; @@ -212,10 +212,10 @@ pub enum VsockEventType { pub struct VirtIOSocket { transport: T, /// Virtqueue to receive packets. - rx: VirtQueue, - tx: VirtQueue, + rx: SplitQueue, + tx: SplitQueue, /// Virtqueue to receive events from the device. - event: VirtQueue, + event: SplitQueue, /// The guest_cid field contains the guest’s context ID, which uniquely identifies /// the device for its lifetime. The upper 32 bits of the CID are reserved and zeroed. guest_cid: u64, @@ -241,6 +241,9 @@ impl Drop for VirtIOSocket { impl VirtIOSocket { /// Create a new VirtIO Vsock driver. 
pub fn new(mut transport: T) -> Result { + // TODO: + let indirect_desc = false; + transport.begin_init(|features| { let features = Feature::from_bits_truncate(features); debug!("Device features: {:?}", features); @@ -257,9 +260,9 @@ impl VirtIOSocket { }; debug!("guest cid: {guest_cid:?}"); - let mut rx = VirtQueue::new(&mut transport, RX_QUEUE_IDX)?; - let tx = VirtQueue::new(&mut transport, TX_QUEUE_IDX)?; - let event = VirtQueue::new(&mut transport, EVENT_QUEUE_IDX)?; + let mut rx = SplitQueue::new(&mut transport, RX_QUEUE_IDX, indirect_desc)?; + let tx = SplitQueue::new(&mut transport, TX_QUEUE_IDX, indirect_desc)?; + let event = SplitQueue::new(&mut transport, EVENT_QUEUE_IDX, indirect_desc)?; // Allocate and add buffers for the RX queue. let mut rx_queue_buffers = [null_mut(); QUEUE_SIZE]; @@ -274,7 +277,7 @@ impl VirtIOSocket { let rx_queue_buffers = rx_queue_buffers.map(|ptr| NonNull::new(ptr).unwrap()); transport.finish_init(); - if rx.should_notify() { + if rx.should_notify(false) { transport.notify(RX_QUEUE_IDX); } @@ -415,6 +418,7 @@ impl VirtIOSocket { &[header.as_bytes(), buffer], &mut [], &mut self.transport, + false, )?; Ok(()) } @@ -440,7 +444,7 @@ impl VirtIOSocket { assert_eq!(new_token, index); } - if self.rx.should_notify() { + if self.rx.should_notify(false) { self.transport.notify(RX_QUEUE_IDX); } diff --git a/src/lib.rs b/src/lib.rs index f2f2f12b..9794fc73 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,8 +51,8 @@ extern crate alloc; pub mod device; mod hal; -mod queue; pub mod transport; +mod queue; mod volatile; use core::{ diff --git a/src/queue/mod.rs b/src/queue/mod.rs new file mode 100644 index 00000000..ff26483e --- /dev/null +++ b/src/queue/mod.rs @@ -0,0 +1,84 @@ +//! VirtIO queues. + +pub mod packed_queue; +pub mod split_queue; + +use crate::transport::Transport; +use crate::Hal; +use crate::Result; + +pub enum VirtQueue { + Packedqueue(packed_queue::PackedQueue), + Splitqueue(split_queue::SplitQueue), +} + +impl VirtQueue { + pub fn add_notify_wait_pop<'a>( + &mut self, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + transport: &mut impl Transport, + support_event: bool, + ) -> Result { + match self { + Self::Packedqueue(packedqueue) => { + packedqueue.add_notify_wait_pop(inputs, outputs, transport, support_event) + } + Self::Splitqueue(splitqueue) => { + splitqueue.add_notify_wait_pop(inputs, outputs, transport, support_event) + } + } + } + + pub unsafe fn add<'a, 'b>( + &mut self, + inputs: &'a [&'b [u8]], + outputs: &'a mut [&'b mut [u8]], + ) -> Result { + match self { + Self::Packedqueue(packedqueue) => packedqueue.add(inputs, outputs), + Self::Splitqueue(splitqueue) => splitqueue.add(inputs, outputs), + } + } + + pub fn should_notify(&self, support_event: bool) -> bool { + match self { + Self::Packedqueue(packedqueue) => packedqueue.should_notify(support_event), + Self::Splitqueue(splitqueue) => splitqueue.should_notify(support_event), + } + } + + /// Returns whether there is a used element that can be popped. 
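    ///
    /// (Illustrative polling loop, assuming interrupts are handled elsewhere; `queue` is a
    /// `VirtQueue<HalImpl, QUEUE_SIZE>` owned by the caller.)
    ///
    ///     while !queue.can_pop() {
    ///         core::hint::spin_loop();
    ///     }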
+ pub fn can_pop(&self) -> bool { + match self { + Self::Packedqueue(packedqueue) => packedqueue.can_pop(), + Self::Splitqueue(splitqueue) => splitqueue.can_pop(), + } + } + + /// pop used + // TODO: will be deleted in the further + pub unsafe fn pop_used_async<'a>( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + match self { + Self::Packedqueue(packedqueue) => packedqueue.pop_used_async(token, inputs, outputs), + Self::Splitqueue(splitqueue) => splitqueue.pop_used_async(token, inputs, outputs), + } + } + + pub unsafe fn pop_used<'a>( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + match self { + Self::Packedqueue(packedqueue) => packedqueue.pop_used(token, inputs, outputs), + Self::Splitqueue(splitqueue) => splitqueue.pop_used(token, inputs, outputs), + } + } +} diff --git a/src/queue/packed_queue.rs b/src/queue/packed_queue.rs new file mode 100644 index 00000000..1d932557 --- /dev/null +++ b/src/queue/packed_queue.rs @@ -0,0 +1,739 @@ +#![deny(unsafe_op_in_unsafe_fn)] + +use crate::hal::{BufferDirection, Dma, Hal, PhysAddr}; +use crate::transport::Transport; +use crate::{nonnull_slice_from_raw_parts, pages, Error, Result}; +use bitflags::bitflags; +use core::array; +use core::hint::spin_loop; +use core::mem::{size_of, take}; +use core::ptr::NonNull; +use core::sync::atomic::{fence, Ordering}; +use log::info; +use zerocopy::FromBytes; + +pub struct PackedQueue { + /// DMA guard + layout: PackedQueueLayout, + + /// Descriptor table + /// + /// The device may be able to modify this, even though it's not supposed to, so we shouldn't + /// trust values read back from it. Use `desc_shadow` instead to keep track of what we wrote to + /// it. + desc: NonNull<[Descriptor]>, + + indirect_desc_vec: [Option>; SIZE], + driver_event_suppression: NonNull, + device_event_suppression: NonNull, + + /// The number of descriptors currently in use. + avail_wrap_count: bool, + used_wrap_count: bool, + avail_used_flags: DescFlags, + + /// The index of queue + queue_idx: u16, + + /// Our trusted copy of `desc` that the device can't access. + desc_shadow: [Descriptor; SIZE], + /// Our trusted copy of `avail.idx`. + last_used_idx: u16, + free_head: u16, + num_used: u16, + + indirect_desc: bool, +} + +impl PackedQueue { + /// Create a new VirtQueue. 
+ /// + /// Ref: 4.2.3.2 Virtqueue Configuration + pub fn new(transport: &mut T, idx: u16, indirect_desc: bool) -> Result { + if transport.queue_used(idx) { + return Err(Error::AlreadyUsed); + } + if !SIZE.is_power_of_two() + || SIZE > u16::MAX.into() + || transport.max_queue_size(idx) < SIZE as u32 + { + return Err(Error::InvalidParam); + } + let size = SIZE as u16; + + let layout = PackedQueueLayout::allocate(size)?; + + transport.queue_set( + idx, + size.into(), + layout.descriptors_paddr(), + layout.driver_area_paddr(), + layout.device_area_paddr(), + ); + + let mut indirect_desc_vec = array::from_fn(|i| None); + + let desc = + nonnull_slice_from_raw_parts(layout.descriptors_vaddr().cast::(), SIZE); + let driver_event_suppression = layout.driver_event_suppression_vaddr().cast(); + let device_event_suppression = layout.device_event_suppression_vaddr().cast(); + + let desc_shadow: [Descriptor; SIZE] = FromBytes::new_zeroed(); + + Ok(PackedQueue { + layout, + desc, + driver_event_suppression, + device_event_suppression, + indirect_desc_vec, + queue_idx: idx, + avail_wrap_count: true, + used_wrap_count: true, + avail_used_flags: DescFlags::VIRTQ_DESC_F_AVAIL, + desc_shadow, + free_head: 0, + num_used: 0, + last_used_idx: 0, + indirect_desc, + }) + } + + /// Unshares buffers in the list starting at descriptor index `head` and adds them to the free + /// list. Unsharing may involve copying data back to the original buffers, so they must be + /// passed in too. + /// + /// Ref: linux virtio_ring.c detach_buf_split + /// This will push all linked descriptors at the front of the free list. + /// + /// # Safety + /// + /// The buffers in `inputs` and `outputs` must match the set of buffers originally added to the + /// queue by `add`. + unsafe fn recycle_descriptors<'a>( + &mut self, + head: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> usize { + // let original_free_head = self.free_head; + let mut next = head; + let mut len = 0; + + if self.indirect_desc { + let desc: &mut Descriptor = &mut self.desc_shadow[usize::from(next)]; + len = desc.len as usize / size_of::(); + desc.unset_buf(); + self.write_desc(next); + self.num_used = self.num_used.wrapping_sub(1); + next += 1; + if next == SIZE as u16 { + next = 0; + self.used_wrap_count ^= true; + } + self.last_used_idx = next; + + // Release indirect descriptors + let size_indirect_descs = size_of::() * (inputs.len() + outputs.len()); + let indirect_descs = nonnull_slice_from_raw_parts( + self.indirect_desc_vec[head as usize] + .as_ref() + .unwrap() + .vaddr(0) + .cast::(), + size_indirect_descs, + ); + + let mut indirect_desc_index = 0 as usize; + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + // let desc = &mut self.indirect_descs[usize::from(indirect_desc_index)]; + let paddr = unsafe { (*indirect_descs.as_ptr())[indirect_desc_index].addr }; + + // Safe because the caller ensures that the buffer is valid and matches the + // descriptor from which we got `paddr`. + unsafe { + // Unshare the buffer (and perhaps copy its contents back to the original buffer). 
+ H::unshare(paddr as usize, buffer, direction); + } + indirect_desc_index += 1; + } + } else { + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + let desc_index = next; + let desc: &mut Descriptor = &mut self.desc_shadow[usize::from(desc_index)]; + + let paddr = desc.addr; + desc.unset_buf(); + self.num_used = self.num_used.wrapping_sub(1); + next += 1; + if next == SIZE as u16 { + next = 0; + self.used_wrap_count ^= true; + } + + self.write_desc(desc_index); + + // Safe because the caller ensures that the buffer is valid and matches the descriptor + // from which we got `paddr`. + unsafe { + // Unshare the buffer (and perhaps copy its contents back to the original buffer). + H::unshare(paddr as usize, buffer, direction); + } + self.last_used_idx = next; + } + } + len + } + + /// Copies the descriptor at the given index from `desc_shadow` to `desc`, so it can be seen by + /// the device. + fn write_desc(&mut self, index: u16) { + let index = usize::from(index); + // Safe because self.desc is properly aligned, dereferenceable and initialised, and nothing + // else reads or writes the descriptor during this block. + unsafe { + (*self.desc.as_ptr())[index] = self.desc_shadow[index].clone(); + } + } +} + +impl PackedQueue { + /// Add buffers to the virtqueue, return a token. + /// + /// Ref: linux virtio_ring.c virtqueue_add + /// Ref: Section 2.7.13 + /// + /// # Safety + /// + /// The input and output buffers must remain valid and not be accessed until a call to + /// `pop_used` with the returned token succeeds. + pub unsafe fn add<'a, 'b>( + &mut self, + inputs: &'a [&'b [u8]], + outputs: &'a mut [&'b mut [u8]], + ) -> Result { + if inputs.is_empty() && outputs.is_empty() { + return Err(Error::InvalidParam); + } + + let desc_nr_needed = inputs.len() + outputs.len(); + if self.indirect_desc { + if (SIZE - self.num_used as usize) < 1 || desc_nr_needed > SIZE { + return Err(Error::QueueFull); + } + } else { + if desc_nr_needed + self.num_used as usize > SIZE { + return Err(Error::QueueFull); + } + } + + // allocate descriptors from free list + let head = self.free_head; + let id = head; + let mut last = 0; + let mut first_flags = DescFlags::empty(); + + if self.indirect_desc && desc_nr_needed > 1 { + let size_indirect_descs = size_of::() * (inputs.len() + outputs.len()); + let indirect_descs_dma = + Dma::::new(pages(size_indirect_descs), BufferDirection::DriverToDevice)?; + let indirect_descs = nonnull_slice_from_raw_parts( + indirect_descs_dma.vaddr(0).cast::(), + size_indirect_descs, + ); + + self.indirect_desc_vec[id as usize] = Some(indirect_descs_dma); + let mut index = 0; + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + // Write to desc_shadow then copy. 
+ let mut desc = Descriptor::default(); + let flag = DescFlags::empty(); + + unsafe { + desc.set_buf::(buffer, direction, flag); + } + unsafe { + (*indirect_descs.as_ptr())[index] = desc; + } + index += 1; + } + let indirect_descs = unsafe { + core::slice::from_raw_parts( + indirect_descs.as_ptr() as *const u8, + size_indirect_descs, + ) + }; + let desc = &mut self.desc_shadow[usize::from(self.free_head)]; + unsafe { + desc.set_buf::( + indirect_descs.into(), + BufferDirection::DriverToDevice, + DescFlags::empty(), + ); + } + last = self.free_head; + self.free_head += 1; + first_flags |= self.avail_used_flags; + first_flags |= DescFlags::VIRTQ_DESC_F_INDIRECT; + + if self.free_head == SIZE as u16 { + self.free_head = 0; + self.avail_wrap_count ^= true; + self.avail_used_flags ^= + DescFlags::VIRTQ_DESC_F_AVAIL | DescFlags::VIRTQ_DESC_F_USED; + } + self.write_desc(last); + self.num_used += 1; + } else { + // A buffer consists of zero or more device-readable physically-contiguous elements followed by zero or more physically-contiguous device-writable elements (each has at least one element). + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + // Write to desc_shadow then copy. + let mut flags = DescFlags::VIRTQ_DESC_F_NEXT; + let desc = &mut self.desc_shadow[usize::from(self.free_head)]; + if head != self.free_head { + flags |= self.avail_used_flags; + } else { + first_flags |= self.avail_used_flags; + } + unsafe { + desc.set_buf::(buffer, direction, flags); + } + + last = self.free_head; + self.free_head += 1; + + if self.free_head == SIZE as u16 { + self.free_head = 0; + self.avail_wrap_count ^= true; + self.avail_used_flags ^= + DescFlags::VIRTQ_DESC_F_AVAIL | DescFlags::VIRTQ_DESC_F_USED; + } + self.write_desc(last); + } + self.num_used += (inputs.len() + outputs.len()) as u16; + } + // set last_elem.next = NULL + self.desc_shadow[usize::from(last)].id = id; + self.desc_shadow[usize::from(last)] + .flags + .remove(DescFlags::VIRTQ_DESC_F_NEXT); + self.write_desc(last); + + // Let the new request visible to the device + self.desc_shadow[usize::from(id)].flags.insert(first_flags); + // Write barrier so that device sees changes to descriptor table and available ring before + // change to available index. + // The driver performs a suitable memory barrier to ensure the device sees the updated descriptor table and available ring before the next step. + // + // Ref: Section 2.7.13 + fence(Ordering::SeqCst); + self.write_desc(id); + + // Write barrier so that device can see change to available index after this method returns. + // The driver performs a suitable memory barrier to ensure that it updates the idx field before checking for notification suppression. + fence(Ordering::SeqCst); + + Ok(id) + } + + /// Add the given buffers to the virtqueue, notifies the device, blocks until the device uses + /// them, then pops them. + /// + /// This assumes that the device isn't processing any other buffers at the same time. + pub fn add_notify_wait_pop<'a>( + &mut self, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + transport: &mut impl Transport, + support_event: bool, + ) -> Result { + // Safe because we don't return until the same token has been popped, so the buffers remain + // valid and are not otherwise accessed until then. + + let token = unsafe { self.add(inputs, outputs) }?; + + // // Notify the queue. + if self.should_notify(support_event) { + transport.notify(self.queue_idx); + } + + // Wait until there is at least one element in the used ring. 
+ while !self.can_pop() { + spin_loop(); + } + + // Safe because these are the same buffers as we passed to `add` above and they are still + // valid. + unsafe { self.pop_used(token, inputs, outputs) } + } + + // TODO: will be deleted in the future (this method does not support event) + /// Add the given buffers to the virtqueue, notifies the device, blocks until the device uses + /// them, then pops them. + /// + /// This assumes that the device isn't processing any other buffers at the same time. + pub fn add_notify_wait_pop_old<'a>( + &mut self, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + transport: &mut impl Transport, + ) -> Result { + // Safe because we don't return until the same token has been popped, so the buffers remain + // valid and are not otherwise accessed until then. + + let token = unsafe { self.add(inputs, outputs) }?; + + // Notify the queue. + if self.should_notify(false) { + transport.notify(self.queue_idx); + } + + // Wait until there is at least one element in the used ring. + while !self.can_pop() { + spin_loop(); + } + + // Safe because these are the same buffers as we passed to `add` above and they are still + // valid. + unsafe { self.pop_used(token, inputs, outputs) } + } + + // TODO: needs further work + /// Returns whether the driver should notify the device after adding a new buffer to the + /// virtqueue. + /// + /// Ref: virtio_ring.c virtqueue_kick_prepare_packed + /// This will be false if the device has suppressed notifications. + pub fn should_notify(&self, _support_event: bool) -> bool { + // Read barrier, so we read a fresh value from the device. + fence(Ordering::SeqCst); + + // Safe because self.device_event_suppression points to a valid, aligned, initialised, + // dereferenceable, readable instance of PvirtqEventSuppress. + let flags = unsafe { (*self.device_event_suppression.as_ptr()).flags }; + // let off_wrap = unsafe { (*self.device_event_suppression.as_ptr()).off_wrap }; + if flags != PackedQueueFlag::RING_EVENT_FLAGS_DESC.bits() { + flags != PackedQueueFlag::RING_EVENT_FLAGS_DISABLE.bits() + } else { + // let event_offset = off_wrap & ((1 << 15) - 1); + // let event_wrap_counter = off_wrap & (1 << 15); + + false + } + } + + /// Returns whether the driver should notify the device after adding a new buffer to the + /// virtqueue. + /// + /// Ref: virtio_ring.c virtqueue_kick_prepare_packed + /// This will be false if the device has suppressed notifications. + pub fn should_notify_old(&self) -> bool { + // Read barrier, so we read a fresh value from the device. + fence(Ordering::SeqCst); + + // Safe because self.device_event_suppression points to a valid, aligned, initialised, + // dereferenceable, readable instance of PvirtqEventSuppress. + unsafe { (*self.device_event_suppression.as_ptr()).flags & 0x0001 == 0 } + } + + /// If the given token is next on the device used queue, pops it and returns the total buffer + /// length which was used (written) by the device. + /// + /// Ref: linux virtio_ring.c virtqueue_get_buf_ctx_packed + /// + /// # Safety + /// + /// The buffers in `inputs` and `outputs` must match the set of buffers originally added to the + /// queue by `add` when it returned the token being passed in here. + pub unsafe fn pop_used<'a>( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + if !self.can_pop() { + return Err(Error::NotReady); + } + // Read barrier not necessary, as can_pop already has one. + + // Get the index of the start of the descriptor chain for the next element in the used ring.
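+ // In the packed layout there is no separate used ring: the device writes the used element back into the descriptor ring itself, and the buffer ID it reports is the same value `add` handed out as the token, so the token is used directly as the head of the chain to recycle (the split queue instead reads `used.ring[slot].id`).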
+ let index = token; + let len; + + // Safe because the caller ensures the buffers are valid and match the descriptor. + len = unsafe { self.recycle_descriptors(index, inputs, outputs) }; + if self.indirect_desc { + self.indirect_desc_vec[index as usize] = None; + } + // self.last_used_idx = self.last_used_idx.wrapping_add(1); + + Ok(len as u32) + } + + /// queue by `add` when it returned the token being passed in here. + pub unsafe fn pop_used_async<'a>( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + if !self.can_pop() { + return Err(Error::NotReady); + } + // Read barrier not necessary, as can_pop already has one. + + // Get the index of the start of the descriptor chain for the next element in the used ring. + let index: u16 = token; + let len; + + // Safe because the caller ensures the buffers are valid and match the descriptor. + len = unsafe { self.recycle_descriptors(index, inputs, outputs) }; + // self.last_used_idx = self.last_used_idx.wrapping_add(1); + + Ok(len as u32) + } + + /// Returns the descriptor index (a.k.a. token) of the next used element without popping it, or + /// `None` if the used ring is empty. + fn peek_used(&self) -> Option { + if self.can_pop() { + let last_used_slot = self.last_used_idx; + // Safe because self.used points to a valid, aligned, initialised, dereferenceable, + // readable instance of UsedRing. + Some(unsafe { (*self.desc.as_ptr())[last_used_slot as usize].id as u16 }) + } else { + None + } + } + + /// Returns whether there is a used element that can be popped. + pub fn can_pop(&self) -> bool { + // Read barrier, so we read a fresh value from the device. + fence(Ordering::SeqCst); + + // Safe because self.used points to a valid, aligned, initialised, dereferenceable, readable + // instance of UsedRing. + let next_used = self.last_used_idx as usize; + let avail = + unsafe { (*self.desc.as_ptr())[next_used].flags & DescFlags::VIRTQ_DESC_F_AVAIL }; + let used = unsafe { (*self.desc.as_ptr())[next_used].flags & DescFlags::VIRTQ_DESC_F_USED }; + (avail.is_empty() ^ self.used_wrap_count) && (used.is_empty() ^ self.used_wrap_count) + } + + fn last_used_idx(&self) -> u16 { + self.last_used_idx & (SIZE as u16 - 1) + } + /// Returns the number of free descriptors. + fn available_desc(&self) -> usize { + SIZE - self.num_used as usize + } +} + +/// Descriptor flags +#[derive(Copy, Clone, Debug, Default, Eq, FromBytes, PartialEq)] +#[repr(transparent)] +struct DescFlags(u16); + +bitflags! { + impl DescFlags: u16 { + /// This marks a buffer as continuing via the next field. + const VIRTQ_DESC_F_NEXT = 1; + /// This marks a buffer as write-only (otherwise read-only). + const VIRTQ_DESC_F_WRITE = 1 << 1; + /// This means the buffer contains a list of buffer descriptors. + const VIRTQ_DESC_F_INDIRECT = 1 << 2; + + /// + /// Mark a descriptor as available or used in packed ring. + /// Notice: they are defined as shifts instead of shifted values. + /// + const VIRTQ_DESC_F_AVAIL = 1 << 7; + const VIRTQ_DESC_F_USED = 1 << 15; + } +} + +#[derive(Clone, Debug, FromBytes, Default)] +pub(crate) struct Descriptor { + /// Buffer Address + addr: u64, + /// Buffer Length + len: u32, + /// Buffer ID + id: u16, + /// The flags depending on descriptor type + flags: DescFlags, +} + +impl Descriptor { + /// Sets the buffer address, length and flags, and shares it with the device. + /// + /// # Safety + /// + /// The caller must ensure that the buffer lives at least as long as the descriptor is active. 
+ /// + /// Ref: Section 2.7.13.1 + unsafe fn set_buf( + &mut self, + buf: NonNull<[u8]>, + direction: BufferDirection, + extra_flags: DescFlags, + ) { + unsafe { + self.addr = H::share(buf, direction) as u64; + } + self.len = buf.len() as u32; + + // If buffer is device-writable, set d.flags to VIRTQ_DESC_F_WRITE, otherwise 0. + self.flags = extra_flags + | match direction { + BufferDirection::DriverToDevice => DescFlags::empty(), + BufferDirection::DeviceToDriver => DescFlags::VIRTQ_DESC_F_WRITE, + BufferDirection::Both => { + panic!("Buffer passed to device should never use BufferDirection::Both.") + } + }; + } + + /// Sets the buffer address and length to 0. + /// + /// This must only be called once the device has finished using the descriptor. + fn unset_buf(&mut self) { + self.addr = 0; + self.len = 0; + self.id = 0; + } +} + +#[derive(Debug)] +struct PvirtqEventSuppress { + /// Descriptor Ring Change Event Offset/Wrap Counter. + off_wrap: u16, + /// Descriptor Ring Change Event Flags. + flags: u16, +} + +/// The inner layout of a packed VirtQueue. +/// +/// Ref: 2.7 Packed Virtqueues +#[derive(Debug)] +struct PackedQueueLayout { + /// The whole region used for the communication between device and driver. + whole_dma: Dma, + /// The offset from the start of `whole_dma` to the driver event suppression structure (the + /// driver area). + device_event_offset: usize, + /// The offset from the start of `whole_dma` to the device event suppression structure (the + /// device area). + driver_event_offset: usize, +} + +impl PackedQueueLayout { + /// Allocates a single contiguous DMA region holding the descriptor ring followed by the + /// driver and device event suppression structures. + fn allocate(queue_size: u16) -> Result { + let (desc, device_event, driver_event) = queue_part_sizes(queue_size); + let whole_dma = Dma::new( + pages(desc + device_event + driver_event), + BufferDirection::DriverToDevice, + )?; + Ok(Self { + whole_dma, + device_event_offset: desc, + driver_event_offset: desc + device_event, + }) + } + + /// Returns the physical address of the descriptor area. + fn descriptors_paddr(&self) -> PhysAddr { + self.whole_dma.paddr() + } + + /// Returns a pointer to the descriptor table (in the descriptor area). + fn descriptors_vaddr(&self) -> NonNull { + self.whole_dma.vaddr(0) + } + + /// Returns the physical address of the driver area. + fn driver_area_paddr(&self) -> PhysAddr { + self.whole_dma.paddr() + self.device_event_offset + } + + /// Returns a pointer to the driver event suppression structure (in the driver area). + fn driver_event_suppression_vaddr(&self) -> NonNull { + self.whole_dma.vaddr(self.device_event_offset) + } + + /// Returns the physical address of the device area. + fn device_area_paddr(&self) -> PhysAddr { + self.whole_dma.paddr() + self.driver_event_offset + } + + /// Returns a pointer to the device event suppression structure (in the device area). + fn device_event_suppression_vaddr(&self) -> NonNull { + self.whole_dma.vaddr(self.driver_event_offset) + } +} + +/// Returns the size in bytes of the descriptor ring, driver event suppression structure and +/// device event suppression structure for a given +/// queue size.
+/// +/// Ref: 2.6 Split Virtqueues +fn queue_part_sizes(queue_size: u16) -> (usize, usize, usize) { + assert!( + queue_size.is_power_of_two(), + "queue size should be a power of 2" + ); + let desc = size_of::() * queue_size as usize; + let device_event = size_of::(); + let driver_event = size_of::(); + (desc, device_event, driver_event) +} + +bitflags! { + struct PackedQueueFlag: u16 { + const RING_EVENT_FLAGS_ENABLE = 0; + const RING_EVENT_FLAGS_DISABLE = 1; + const RING_EVENT_FLAGS_DESC = 2; + } +} + +struct InputOutputIter<'a, 'b> { + inputs: &'a [&'b [u8]], + outputs: &'a mut [&'b mut [u8]], +} + +impl<'a, 'b> InputOutputIter<'a, 'b> { + fn new(inputs: &'a [&'b [u8]], outputs: &'a mut [&'b mut [u8]]) -> Self { + Self { inputs, outputs } + } +} + +impl<'a, 'b> Iterator for InputOutputIter<'a, 'b> { + type Item = (NonNull<[u8]>, BufferDirection); + + fn next(&mut self) -> Option { + if let Some(input) = take_first(&mut self.inputs) { + Some(((*input).into(), BufferDirection::DriverToDevice)) + } else { + let output = take_first_mut(&mut self.outputs)?; + Some(((*output).into(), BufferDirection::DeviceToDriver)) + } + } +} + +// TODO: Use `slice::take_first` once it is stable +// (https://github.com/rust-lang/rust/issues/62280). +fn take_first<'a, T>(slice: &mut &'a [T]) -> Option<&'a T> { + let (first, rem) = slice.split_first()?; + *slice = rem; + Some(first) +} + +// TODO: Use `slice::take_first_mut` once it is stable +// (https://github.com/rust-lang/rust/issues/62280). +fn take_first_mut<'a, T>(slice: &mut &'a mut [T]) -> Option<&'a mut T> { + let (first, rem) = take(slice).split_first_mut()?; + *slice = rem; + Some(first) +} diff --git a/src/queue.rs b/src/queue/split_queue.rs similarity index 69% rename from src/queue.rs rename to src/queue/split_queue.rs index 758c139b..624a2d14 100644 --- a/src/queue.rs +++ b/src/queue/split_queue.rs @@ -1,5 +1,4 @@ #![deny(unsafe_op_in_unsafe_fn)] - use crate::hal::{BufferDirection, Dma, Hal, PhysAddr}; use crate::transport::Transport; use crate::{align_up, nonnull_slice_from_raw_parts, pages, Error, Result, PAGE_SIZE}; @@ -14,14 +13,18 @@ use core::ptr::NonNull; use core::sync::atomic::{fence, Ordering}; use zerocopy::FromBytes; +use core::array; +// use alloc::vec::Vec; + /// The mechanism for bulk data transport on virtio devices. /// /// Each device can have zero or more virtqueues. /// -/// * `SIZE`: The size of the queue. This is both the number of descriptors, and the number of slots -/// in the available and used rings. +/// * `SIZE`: The size of the queue. This is both the number of descriptors, and the number of +/// slots in the available and used rings. Queue Size value is always a power of 2. The maximum +/// Queue Size value is 32768. This value is specified in a bus-specific way. #[derive(Debug)] -pub struct VirtQueue { +pub struct SplitQueue { /// DMA guard layout: VirtQueueLayout, /// Descriptor table @@ -30,13 +33,18 @@ pub struct VirtQueue { /// trust values read back from it. Use `desc_shadow` instead to keep track of what we wrote to /// it. desc: NonNull<[Descriptor]>, - /// Available ring + + indirect_desc_vec: [Option>; SIZE], + /// Available ring: When the driver wants to send a buffer to the device, it fills in a slot + /// in the descriptor table (or chains several together), and writes the descriptor index into + /// the available ring. It then notifies the device. /// /// The device may be able to modify this, even though it's not supposed to, so we shouldn't /// trust values read back from it. 
The only field we need to read currently is `idx`, so we /// have `avail_idx` below to use instead. avail: NonNull>, - /// Used ring + /// Used ring: When the device has finished a buffer, it writes the descriptor index into the + /// used ring, and sends a used buffer notification. used: NonNull>, /// The index of queue @@ -50,11 +58,16 @@ pub struct VirtQueue { /// Our trusted copy of `avail.idx`. avail_idx: u16, last_used_idx: u16, + + // Support indirect descriptors (true) or not (false) + indirect_desc: bool, } -impl VirtQueue { +impl SplitQueue { /// Create a new VirtQueue. - pub fn new(transport: &mut T, idx: u16) -> Result { + /// + /// Ref: 4.2.3.2 Virtqueue Configuration + pub fn new(transport: &mut T, idx: u16, indirect_desc: bool) -> Result { if transport.queue_used(idx) { return Err(Error::AlreadyUsed); } @@ -96,9 +109,12 @@ impl VirtQueue { } } - Ok(VirtQueue { + let mut indirect_desc_vec = array::from_fn(|_i| None); + + Ok(SplitQueue { layout, desc, + indirect_desc_vec, avail, used, queue_idx: idx, @@ -107,12 +123,15 @@ impl VirtQueue { desc_shadow, avail_idx: 0, last_used_idx: 0, + indirect_desc, }) } - +} +impl SplitQueue { /// Add buffers to the virtqueue, return a token. /// /// Ref: linux virtio_ring.c virtqueue_add + /// Ref: Section 2.7.13 /// /// # Safety /// @@ -123,38 +142,102 @@ impl VirtQueue { inputs: &'a [&'b [u8]], outputs: &'a mut [&'b mut [u8]], ) -> Result { if inputs.is_empty() && outputs.is_empty() { return Err(Error::InvalidParam); } - if inputs.len() + outputs.len() + self.num_used as usize > SIZE { - return Err(Error::QueueFull); + + let desc_nr_needed = inputs.len() + outputs.len(); + if self.indirect_desc { + if (SIZE - self.num_used as usize) < 1 || desc_nr_needed > SIZE { + return Err(Error::QueueFull); + } + } else { + if desc_nr_needed + self.num_used as usize > SIZE { + return Err(Error::QueueFull); + } } // allocate descriptors from free list let head = self.free_head; let mut last = self.free_head; - for (buffer, direction) in InputOutputIter::new(inputs, outputs) { - // Write to desc_shadow then copy. - let desc = &mut self.desc_shadow[usize::from(self.free_head)]; - // Safe because our caller promises that the buffers live at least until `pop_used` - // returns them. + if self.indirect_desc { + let size_indirect_descs = size_of::() * (inputs.len() + outputs.len()); + let indirect_descs_dma = + Dma::::new(pages(size_indirect_descs), BufferDirection::DriverToDevice)?; + let indirect_descs = nonnull_slice_from_raw_parts( + indirect_descs_dma.vaddr(0).cast::(), + size_indirect_descs, + ); + + self.indirect_desc_vec[head as usize] = Some(indirect_descs_dma); + + let mut index = 0; + let mut desc = Descriptor::default(); + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + // Build each descriptor of the indirect table.
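+ // Only one slot of the main descriptor ring will be consumed: every buffer gets an entry in this separate table, the entries are chained with NEXT, and the ring descriptor written after the loop carries DescFlags::INDIRECT and points at the whole table. For one request header plus one response buffer the chain is: ring[head] --INDIRECT--> [ header (NEXT) | response (WRITE) ].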
+ desc = Descriptor::default(); + + unsafe { + desc.set_buf::(buffer, direction, DescFlags::NEXT); + } + desc.next = index + 1; + + unsafe { + (*indirect_descs.as_ptr())[index as usize] = desc.clone(); + } + index += 1; + } + desc.flags.remove(DescFlags::NEXT); unsafe { - desc.set_buf::(buffer, direction, DescFlags::NEXT); + (*indirect_descs.as_ptr())[(index - 1) as usize] = desc.clone(); + } + + let indirect_descs = unsafe { + core::slice::from_raw_parts( + indirect_descs.as_ptr() as *const u8, + size_indirect_descs, + ) + }; + + // Push the address of indirect_descs into queue's descriptor table + let desc = &mut self.desc_shadow[usize::from(last)]; + unsafe { + desc.set_buf::( + indirect_descs.into(), + BufferDirection::DriverToDevice, + DescFlags::INDIRECT, + ); } - last = self.free_head; self.free_head = desc.next; self.write_desc(last); - } + } else { + // A buffer consists of zero or more device-readable physically-contiguous elements followed by zero or more physically-contiguous device-writable elements (each has at least one element). + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + // Write to desc_shadow then copy. + let desc = &mut self.desc_shadow[usize::from(self.free_head)]; + unsafe { + desc.set_buf::(buffer, direction, DescFlags::NEXT); + } + last = self.free_head; + self.free_head = desc.next; - // set last_elem.next = NULL - self.desc_shadow[usize::from(last)] - .flags - .remove(DescFlags::NEXT); - self.write_desc(last); + self.write_desc(last); + } - self.num_used += (inputs.len() + outputs.len()) as u16; + // set last_elem.next = NULL + self.desc_shadow[usize::from(last)] + .flags + .remove(DescFlags::NEXT); + self.write_desc(last); + } + + if self.indirect_desc { + self.num_used += 1; + } else { + self.num_used += (inputs.len() + outputs.len()) as u16; + } let avail_slot = self.avail_idx & (SIZE as u16 - 1); // Safe because self.avail is properly aligned, dereferenceable and initialised. @@ -164,6 +247,10 @@ impl VirtQueue { // Write barrier so that device sees changes to descriptor table and available ring before // change to available index. + // The driver performs a suitable memory barrier to ensure the device sees the updated + // descriptor table and available ring before the next step. + // + // Ref: Section 2.7.13 fence(Ordering::SeqCst); // increase head of avail ring @@ -174,6 +261,8 @@ impl VirtQueue { } // Write barrier so that device can see change to available index after this method returns. + // The driver performs a suitable memory barrier to ensure that it updates the idx field + // before checking for notification suppression. fence(Ordering::SeqCst); Ok(head) @@ -188,13 +277,14 @@ impl VirtQueue { inputs: &'a [&'a [u8]], outputs: &'a mut [&'a mut [u8]], transport: &mut impl Transport, + support_event: bool, ) -> Result { // Safe because we don't return until the same token has been popped, so the buffers remain // valid and are not otherwise accessed until then. let token = unsafe { self.add(inputs, outputs) }?; // Notify the queue. - if self.should_notify() { + if self.should_notify(support_event) { transport.notify(self.queue_idx); } @@ -211,18 +301,23 @@ impl VirtQueue { /// Returns whether the driver should notify the device after adding a new buffer to the /// virtqueue. /// + /// Ref: virtio_ring.c virtqueue_kick_prepare_split /// This will be false if the device has supressed notifications. 
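When VIRTIO_F_EVENT_IDX is negotiated, the device publishes an `avail_event` index in the used ring and only wants a notification once the driver makes that entry available. The general test is the one used by Linux's `vring_need_event`, reproduced here as a free-standing sketch with assumed parameter names (not part of the patch):

    /// `new` is `avail_idx` after publishing, `old` its value before the add, and
    /// `event` is the device's `avail_event`; all arithmetic is wrapping on u16.
    fn needs_notification(new: u16, old: u16, event: u16) -> bool {
        new.wrapping_sub(event).wrapping_sub(1) < new.wrapping_sub(old)
    }

Since `add` publishes exactly one descriptor chain per call (`old == new - 1`), this collapses to `new == event + 1`, which is the check the updated `should_notify` below performs when `support_event` is true.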
- pub fn should_notify(&self) -> bool { + pub fn should_notify(&self, support_event: bool) -> bool { // Read barrier, so we read a fresh value from the device. fence(Ordering::SeqCst); // Safe because self.used points to a valid, aligned, initialised, dereferenceable, readable // instance of UsedRing. - unsafe { (*self.used.as_ptr()).flags & 0x0001 == 0 } + if support_event { + unsafe { self.avail_idx == (*self.used.as_ptr()).avail_event.wrapping_add(1) } + } else { + unsafe { ((*self.used.as_ptr()).flags & 0x0001) == 0 } + } } - /// Copies the descriptor at the given index from `desc_shadow` to `desc`, so it can be seen by - /// the device. + /// Copies the descriptor at the given index from `desc_shadow` to `desc`, so it can be seen + /// by the device. fn write_desc(&mut self, index: u16) { let index = usize::from(index); // Safe because self.desc is properly aligned, dereferenceable and initialised, and nothing @@ -237,8 +332,8 @@ impl VirtQueue { // Read barrier, so we read a fresh value from the device. fence(Ordering::SeqCst); - // Safe because self.used points to a valid, aligned, initialised, dereferenceable, readable - // instance of UsedRing. + // Safe because self.used points to a valid, aligned, initialised, dereferenceable, + // readable instance of UsedRing. self.last_used_idx != unsafe { (*self.used.as_ptr()).idx } } @@ -264,7 +359,7 @@ impl VirtQueue { /// list. Unsharing may involve copying data back to the original buffers, so they must be /// passed in too. /// - /// This will push all linked descriptors at the front of the free list. + /// Ref: linux virtio_ring.c detach_buf_split. /// /// # Safety /// @@ -280,11 +375,10 @@ impl VirtQueue { self.free_head = head; let mut next = Some(head); - for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + if self.indirect_desc { + // Recycle the only one descriptor in the ring let desc_index = next.expect("Descriptor chain was shorter than expected."); let desc = &mut self.desc_shadow[usize::from(desc_index)]; - - let paddr = desc.addr; desc.unset_buf(); self.num_used -= 1; next = desc.next(); @@ -294,11 +388,51 @@ impl VirtQueue { self.write_desc(desc_index); - // Safe because the caller ensures that the buffer is valid and matches the descriptor - // from which we got `paddr`. - unsafe { - // Unshare the buffer (and perhaps copy its contents back to the original buffer). - H::unshare(paddr as usize, buffer, direction); + // Release indirect descriptors + let size_indirect_descs = size_of::() * (inputs.len() + outputs.len()); + let indirect_descs = nonnull_slice_from_raw_parts( + self.indirect_desc_vec[desc_index as usize] + .as_ref() + .unwrap() + .vaddr(0) + .cast::(), + size_indirect_descs, + ); + + let mut indirect_desc_index = 0 as usize; + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + // let desc = &mut self.indirect_descs[usize::from(indirect_desc_index)]; + let paddr = unsafe { (*indirect_descs.as_ptr())[indirect_desc_index].addr }; + + // Safe because the caller ensures that the buffer is valid and matches the + // descriptor from which we got `paddr`. + unsafe { + // Unshare the buffer (and perhaps copy its contents back to the original buffer). 
+ H::unshare(paddr as usize, buffer, direction); + } + indirect_desc_index += 1; + } + } else { + for (buffer, direction) in InputOutputIter::new(inputs, outputs) { + let desc_index = next.expect("Descriptor chain was shorter than expected."); + let desc = &mut self.desc_shadow[usize::from(desc_index)]; + + let paddr = desc.addr; + desc.unset_buf(); + self.num_used -= 1; + next = desc.next(); + if next.is_none() { + desc.next = original_free_head; + } + + self.write_desc(desc_index); + + // Safe because the caller ensures that the buffer is valid and matches the + // descriptor from which we got `paddr`. + unsafe { + // Unshare the buffer (and perhaps copy its contents back to the original buffer). + H::unshare(paddr as usize, buffer, direction); + } } } @@ -327,6 +461,56 @@ impl VirtQueue { } // Read barrier not necessary, as can_pop already has one. + // Get the index of the start of the descriptor chain for the next element in the used ring. + let last_used_slot = self.last_used_idx & (SIZE as u16 - 1); + let index; + let len; + // Safe because self.used points to a valid, aligned, initialised, dereferenceable, + // readable instance of UsedRing. + unsafe { + index = (*self.used.as_ptr()).ring[last_used_slot as usize].id as u16; + len = (*self.used.as_ptr()).ring[last_used_slot as usize].len; + } + + if index != token { + // The device used a different descriptor chain to the one we were expecting. + return Err(Error::WrongToken); + } + + // Safe because the caller ensures the buffers are valid and match the descriptor. + unsafe { + self.recycle_descriptors(index, inputs, outputs); + } + + if self.indirect_desc { + self.indirect_desc_vec[index as usize] = None; + } + self.last_used_idx = self.last_used_idx.wrapping_add(1); + + Ok(len) + } + + // TODO: will be deleted in the future + /// If the given token is next on the device used queue, pops it and returns the total buffer + /// length which was used (written) by the device. + /// + /// Ref: linux virtio_ring.c virtqueue_get_buf_ctx + /// + /// # Safety + /// + /// The buffers in `inputs` and `outputs` must match the set of buffers originally added to the + /// queue by `add` when it returned the token being passed in here. + pub unsafe fn pop_used_async<'a>( + &mut self, + token: u16, + inputs: &'a [&'a [u8]], + outputs: &'a mut [&'a mut [u8]], + ) -> Result { + if !self.can_pop() { + return Err(Error::NotReady); + } + // Read barrier not necessary, as can_pop already has one. + + // Get the index of the start of the descriptor chain for the next element in the used ring. + let last_used_slot = self.last_used_idx & (SIZE as u16 - 1); + let index; @@ -342,6 +526,9 @@ impl VirtQueue { // The device used a different descriptor chain to the one we were expecting. return Err(Error::WrongToken); } + if self.indirect_desc { + self.indirect_desc_vec[index as usize] = None; + } // Safe because the caller ensures the buffers are valid and match the descriptor. unsafe { @@ -381,6 +568,7 @@ impl VirtQueueLayout { /// Ref: 2.6.2 Legacy Interfaces: A Note on Virtqueue Layout fn allocate_legacy(queue_size: u16) -> Result { let (desc, avail, used) = queue_part_sizes(queue_size); + // Padding has to be inserted between the available ring and the used ring so that the used ring starts on a suitably aligned boundary. let size = align_up(desc + avail) + align_up(used); // Allocate contiguous pages.
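Concretely, for the 4-entry queues used by the unit tests near the end of this file (assuming the crate's 4 KiB `PAGE_SIZE`; the byte counts follow from the `#[repr(C)]` structs defined further down):

    // queue_size = 4:
    //   desc  = 16 * 4     = 64 bytes  (16-byte descriptors)
    //   avail = 6 + 2 * 4  = 14 bytes  (flags + idx + ring + used_event)
    //   used  = 6 + 8 * 4  = 38 bytes, padded to 40 by the struct's alignment
    // size = align_up(64 + 14) + align_up(40) = 2 * PAGE_SIZE,
    // so the used ring starts on its own page, as the legacy interface requires.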
let dma = Dma::new(size / PAGE_SIZE, BufferDirection::Both)?; @@ -500,12 +688,21 @@ fn queue_part_sizes(queue_size: u16) -> (usize, usize, usize) { (desc, avail, used) } +/// The descriptor table refers to the buffers the driver is using for the device. +/// Each descriptor describes a buffer which is read-only for the device (“device-readable”) or +/// write-only for the device (“device-writable”), +/// +/// Ref: 2.7.5 The Virtqueue Descriptor Table #[repr(C, align(16))] -#[derive(Clone, Debug, FromBytes)] +#[derive(Clone, Debug, FromBytes, Default)] pub(crate) struct Descriptor { + /// a *physical* address addr: u64, + len: u32, + flags: DescFlags, + /// Next field if flags & NEXT next: u16, } @@ -515,17 +712,19 @@ impl Descriptor { /// # Safety /// /// The caller must ensure that the buffer lives at least as long as the descriptor is active. + /// + /// Ref: Section 2.7.13.1 unsafe fn set_buf( &mut self, buf: NonNull<[u8]>, direction: BufferDirection, extra_flags: DescFlags, ) { - // Safe because our caller promises that the buffer is valid. unsafe { self.addr = H::share(buf, direction) as u64; } self.len = buf.len() as u32; + // If buffer is device-writable, set d.flags to VIRTQ_DESC_F_WRITE, otherwise 0. self.flags = extra_flags | match direction { BufferDirection::DeviceToDriver => DescFlags::WRITE, @@ -562,8 +761,11 @@ struct DescFlags(u16); bitflags! { impl DescFlags: u16 { + /// This marks a buffer as continuing via the next field const NEXT = 1; + /// This marks a buffer as device write-only (otherwise device read-only) const WRITE = 2; + /// This means the buffer contains a list of buffer descriptors const INDIRECT = 4; } } @@ -571,31 +773,44 @@ bitflags! { /// The driver uses the available ring to offer buffers to the device: /// each ring entry refers to the head of a descriptor chain. /// It is only written by the driver and read by the device. +/// +/// Ref: Section 2.7.6 The Virtqueue Available Ring #[repr(C)] #[derive(Debug)] struct AvailRing { flags: u16, - /// A driver MUST NOT decrement the idx. + /// idx field indicates where the driver would put the next descriptor entry in the ring (modulo the queue size). This starts at 0, and increases. idx: u16, ring: [u16; SIZE], + /// Only if VIRTIO_F_EVENT_IDX used_event: u16, // unused } /// The used ring is where the device returns buffers once it is done with them: /// it is only written to by the device, and read by the driver. +/// +/// Ref: Section 2.7.8 The Virtqueue Used Ring #[repr(C)] #[derive(Debug)] struct UsedRing { flags: u16, + /// idx field indicates where the device would put the next descriptor entry in the ring + /// (modulo the queue size). This starts at 0, and increases. idx: u16, ring: [UsedElem; SIZE], + + /// Only if VIRTIO_F_EVENT_IDX avail_event: u16, // unused } #[repr(C)] #[derive(Debug)] struct UsedElem { + /// id indicates the head entry of the descriptor chain describing the buffer (this matches + /// an entry placed in the available ring by the guest earlier), id: u32, + /// The number of bytes written into the device writable portion of the buffer described by + /// the descriptor chain. len: u32, } @@ -729,7 +944,7 @@ mod tests { let mut transport = unsafe { MmioTransport::new(NonNull::from(&mut header)) }.unwrap(); // Size not a power of 2. 
assert_eq!( - VirtQueue::::new(&mut transport, 0).unwrap_err(), + SplitQueue::::new(&mut transport, 0, false).unwrap_err(), Error::InvalidParam ); } @@ -739,7 +954,7 @@ mod tests { let mut header = VirtIOHeader::make_fake_header(MODERN_VERSION, 1, 0, 0, 4); let mut transport = unsafe { MmioTransport::new(NonNull::from(&mut header)) }.unwrap(); assert_eq!( - VirtQueue::::new(&mut transport, 0).unwrap_err(), + SplitQueue::::new(&mut transport, 0, false).unwrap_err(), Error::InvalidParam ); } @@ -748,9 +963,9 @@ mod tests { fn queue_already_used() { let mut header = VirtIOHeader::make_fake_header(MODERN_VERSION, 1, 0, 0, 4); let mut transport = unsafe { MmioTransport::new(NonNull::from(&mut header)) }.unwrap(); - VirtQueue::::new(&mut transport, 0).unwrap(); + SplitQueue::::new(&mut transport, 0, false).unwrap(); assert_eq!( - VirtQueue::::new(&mut transport, 0).unwrap_err(), + SplitQueue::::new(&mut transport, 0, false).unwrap_err(), Error::AlreadyUsed ); } @@ -759,7 +974,7 @@ mod tests { fn add_empty() { let mut header = VirtIOHeader::make_fake_header(MODERN_VERSION, 1, 0, 0, 4); let mut transport = unsafe { MmioTransport::new(NonNull::from(&mut header)) }.unwrap(); - let mut queue = VirtQueue::::new(&mut transport, 0).unwrap(); + let mut queue = SplitQueue::::new(&mut transport, 0, false).unwrap(); assert_eq!( unsafe { queue.add(&[], &mut []) }.unwrap_err(), Error::InvalidParam @@ -770,7 +985,7 @@ mod tests { fn add_too_many() { let mut header = VirtIOHeader::make_fake_header(MODERN_VERSION, 1, 0, 0, 4); let mut transport = unsafe { MmioTransport::new(NonNull::from(&mut header)) }.unwrap(); - let mut queue = VirtQueue::::new(&mut transport, 0).unwrap(); + let mut queue = SplitQueue::::new(&mut transport, 0, false).unwrap(); assert_eq!(queue.available_desc(), 4); assert_eq!( unsafe { queue.add(&[&[], &[], &[]], &mut [&mut [], &mut []]) }.unwrap_err(), @@ -782,7 +997,7 @@ mod tests { fn add_buffers() { let mut header = VirtIOHeader::make_fake_header(MODERN_VERSION, 1, 0, 0, 4); let mut transport = unsafe { MmioTransport::new(NonNull::from(&mut header)) }.unwrap(); - let mut queue = VirtQueue::::new(&mut transport, 0).unwrap(); + let mut queue = SplitQueue::::new(&mut transport, 0, false).unwrap(); assert_eq!(queue.available_desc(), 4); // Add a buffer chain consisting of two device-readable parts followed by two diff --git a/src/transport/fake.rs b/src/transport/fake.rs index 6ab61fc9..5288bc43 100644 --- a/src/transport/fake.rs +++ b/src/transport/fake.rs @@ -1,6 +1,6 @@ use super::{DeviceStatus, DeviceType, Transport}; use crate::{ - queue::{fake_read_write_queue, Descriptor}, + queue::split_queue::{fake_read_write_queue, Descriptor}, PhysAddr, Result, }; use alloc::{sync::Arc, vec::Vec}; diff --git a/src/transport/mmio.rs b/src/transport/mmio.rs index d938a97d..8015659f 100644 --- a/src/transport/mmio.rs +++ b/src/transport/mmio.rs @@ -3,7 +3,7 @@ use super::{DeviceStatus, DeviceType, Transport}; use crate::{ align_up, - queue::Descriptor, + queue::split_queue::Descriptor, volatile::{volread, volwrite, ReadOnly, Volatile, WriteOnly}, Error, PhysAddr, PAGE_SIZE, };