Skip to content

Commit

Permalink
feat: define overflow pages and a function for chunking large values
Browse files Browse the repository at this point in the history
  • Loading branch information
rphmeier committed Aug 24, 2024
1 parent 9ff8125 commit be038f1
Show file tree
Hide file tree
Showing 8 changed files with 162 additions and 40 deletions.
2 changes: 1 addition & 1 deletion nomt/src/beatree/allocator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use free_list::FreeList;

mod free_list;

/// The number of a page
/// The number of a page.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct PageNumber(pub u32);

Expand Down
1 change: 1 addition & 0 deletions nomt/src/beatree/leaf/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@

pub mod node;
pub mod store;
pub mod overflow;
60 changes: 36 additions & 24 deletions nomt/src/beatree/leaf/node.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,30 @@
// Here is the layout of a leaf node:
//
// ```rust,ignore
// n: u16
// cell_pointers: [(key ++ offset); n]
// padding: [u8] // empty space between cell_pointers and cells
// cells: [[u8]; n]
// ```
//
// | n | [(key ++ offset); n] | ---- | [[u8]; n] |
//
// Where key is an [u8; 32], and offset is the byte offset in the node
// to the beginning of the value.
//
// Cell pointers are saved in order of the key, and consequently, so are the cells.
// The length of the value is determined by the difference between the start offsets
// of this value and the next.
//
// Cells are left-aligned and thus the last value is always attached to the end.
//
// The offset of the first cell also serves to detect potential overlap
// between the growth of cell_pointers and cells.
//
// Overflow pages: TODO
/// Here is the layout of a leaf node:
///
/// ```rust,ignore
/// n: u16
/// cell_pointers: [(key ++ offset); n]
/// padding: [u8] // empty space between cell_pointers and cells
/// cells: [Cell; n]
/// value cell: [u8]
/// overflow cell: (u64, [NodePointer]) | semantically, (value_size, [NodePointer]).
/// ```
///
/// | n | [(key ++ offset); n] | ---- | [Cell; n] |
///
/// Where key is an [u8; 32], and offset is the byte offset in the node
/// to the beginning of the value.
///
/// Cell pointers are saved in order of the key, and consequently, so are the cells.
/// The length of the value is determined by the difference between the start offsets
/// of this value and the next.
///
/// When a cell is an overflow cell, the high bit in the offset is set to `1`. Only the low
/// 15 bits should count when considering the offset.
///
/// Cells are left-aligned and thus the last value is always attached to the end.
///
/// The offset of the first cell also serves to detect potential overlap
/// between the growth of cell_pointers and cells.
use std::ops::Range;

Expand All @@ -30,8 +33,17 @@ use crate::{
io::{Page, PAGE_SIZE},
};

/// The size of the leaf node body: everything excluding the mandatory header.
pub const LEAF_NODE_BODY_SIZE: usize = PAGE_SIZE - 2;

/// The maximum value size before overflow pages are used.
pub const MAX_LEAF_VALUE_SIZE: usize = (LEAF_NODE_BODY_SIZE / 3) - 32;

/// The maximum number of node pointers which may appear directly in an overflow cell.
///
/// Note that this gives an overflow cell a maximum size of 100 bytes: an 8-byte value size
/// followed by 23 4-byte node pointers.
pub const MAX_OVERFLOW_CELL_NODE_POINTERS: usize = 23;

pub struct LeafNode {
pub inner: Box<Page>,
}
Expand Down
98 changes: 98 additions & 0 deletions nomt/src/beatree/leaf/overflow.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
//! Overflow pages are used to store values which exceed the maximum leaf value size.
//!
//! Large values are chunked into pages in a deterministic way, optimized for parallel fetching.
//!
//! The format of an overflow page is:
//! ```rust,ignore
//! n_pointers: u16
//! n_bytes: u16
//! pointers: [PageNumber; n_pointers]
//! bytes: [u8; n_bytes]
//! ```
use crate::{
beatree::PageNumber,
io::{Page, PAGE_SIZE},
};

use super::{
node::MAX_OVERFLOW_CELL_NODE_POINTERS,
store::LeafStoreWriter,
};

/// Bytes available in an overflow page after the 4-byte header
/// (`n_pointers: u16` followed by `n_bytes: u16`).
const BODY_SIZE: usize = PAGE_SIZE - 4;
/// Upper bound on the number of 4-byte page numbers that fit in a single overflow page body.
const MAX_PNS: usize = BODY_SIZE / 4;

/// Split a large value across freshly allocated overflow pages.
///
/// All required page numbers are preallocated first. The first
/// `MAX_OVERFLOW_CELL_NODE_POINTERS` of them (or fewer, if the value needs fewer pages) are
/// returned to the caller for storage in the leaf's overflow cell; the page numbers of every
/// remaining page are written into the overflow pages themselves, ahead of the value bytes.
///
/// # Panics
///
/// Panics if `value` is empty.
pub fn chunk(value: &[u8], leaf_writer: &mut LeafStoreWriter) -> Vec<PageNumber> {
    assert!(!value.is_empty());

    let total_pages = total_needed_pages(value.len());
    let cell_pages = std::cmp::min(total_pages, MAX_OVERFLOW_CELL_NODE_POINTERS);

    // Reserve page numbers up front: first those referenced from the cell, then the rest.
    let cell: Vec<PageNumber> = (0..cell_pages)
        .map(|_| leaf_writer.preallocate())
        .collect();
    let indirect: Vec<PageNumber> = (cell_pages..total_pages)
        .map(|_| leaf_writer.preallocate())
        .collect();

    // Page numbers which still have to be recorded inside some overflow page.
    let mut pending_pointers = indirect.iter().copied();
    let mut remaining = value;

    // Fill every page in allocation order.
    for target in cell.iter().copied().chain(indirect.iter().copied()) {
        assert!(!remaining.is_empty());

        let mut page = Box::new(Page::zeroed());

        // Pointers go first: pack as many outstanding page numbers as the body can hold.
        let mut pointer_count = 0;
        for (slot, pointer) in pending_pointers.by_ref().take(MAX_PNS).enumerate() {
            let at = 4 + slot * 4;
            page[at..at + 4].copy_from_slice(&pointer.0.to_le_bytes());
            pointer_count = slot + 1;
        }

        // The rest of the body is filled with as many value bytes as fit.
        let data_start = 4 + pointer_count * 4;
        let data_len = (BODY_SIZE - pointer_count * 4).min(remaining.len());
        let (head, tail) = remaining.split_at(data_len);

        // Header: number of pointers, then number of value bytes.
        page[0..2].copy_from_slice(&(pointer_count as u16).to_le_bytes());
        page[2..4].copy_from_slice(&(data_len as u16).to_le_bytes());

        page[data_start..data_start + data_len].copy_from_slice(head);
        remaining = tail;

        leaf_writer.write_preallocated(target, page);
    }

    cell
}

/// Compute how many overflow pages are required to store a value of `value_size` bytes.
///
/// Besides the value itself, the pages must hold a 4-byte page number for every page that is
/// not referenced directly from the overflow cell. Those pointers may in turn push the
/// encoding onto additional pages, so the count is found by iterating to a fixed point.
fn total_needed_pages(value_size: usize) -> usize {
    let mut total_pages = needed_pages(value_size);

    // TODO: there's probably a closed form for this.
    loop {
        // account for the fact that some of the pages are going to be in the cell and not
        // in pages, therefore they don't increase the payload size.
        let pages_in_pages = total_pages.saturating_sub(MAX_OVERFLOW_CELL_NODE_POINTERS);

        // The encoded size is the value size plus the pointers stored in pages. It is
        // recomputed from `value_size` on every round: accumulating into a running total
        // would count the pointer bytes of earlier rounds again and over-estimate the
        // number of pages for very large values.
        let encoded_size = value_size + pages_in_pages * 4;
        let new_total = needed_pages(encoded_size);
        if new_total == total_pages {
            break;
        }
        total_pages = new_total;
    }

    total_pages
}

/// Number of overflow pages needed to hold `size` encoded bytes, i.e. `size` divided by the
/// page body size, rounded up.
fn needed_pages(size: usize) -> usize {
    let rounded_up = size + BODY_SIZE - 1;
    rounded_up / BODY_SIZE
}
21 changes: 15 additions & 6 deletions nomt/src/beatree/leaf/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,29 @@ pub fn create(

impl LeafStoreReader {
/// Returns the leaf page with the specified page number.
pub fn query(&self, pn: PageNumber) -> LeafNode {
let page = self.allocator_reader.query(pn);

LeafNode { inner: page }
pub fn query(&self, pn: PageNumber) -> Box<Page> {
self.allocator_reader.query(pn)
}
}

impl LeafStoreWriter {
pub fn allocate(&mut self, leaf_page: LeafNode) -> PageNumber {
/// Preallocate a page number.
pub fn preallocate(&mut self) -> PageNumber {
self.allocator_writer.allocate()
}

/// Write a leaf node, allocating a page number.
pub fn write(&mut self, leaf_page: LeafNode) -> PageNumber {
let pn = self.allocator_writer.allocate();
self.pending.push((pn, leaf_page.inner));
self.write_preallocated(pn, leaf_page.inner);
pn
}

/// Write a page under a preallocated page number.
pub fn write_preallocated(&mut self, pn: PageNumber, page: Box<Page>) {
self.pending.push((pn, page));
}

pub fn release(&mut self, id: PageNumber) {
self.allocator_writer.release(id)
}
Expand Down
6 changes: 4 additions & 2 deletions nomt/src/beatree/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use super::{
allocator::PageNumber,
branch::{self, BranchId},
index::Index,
leaf, Key,
leaf::{self, node::LeafNode}, Key,
};

mod reconstruction;
Expand Down Expand Up @@ -39,7 +39,9 @@ pub fn lookup(
Some((_, leaf_pn)) => leaf_pn,
};

let leaf = leaf_store.query(leaf_pn);
let leaf = LeafNode { inner: leaf_store.query(leaf_pn) };

// TODO: handle overflow.
Ok(leaf.get(&key).map(|v| v.to_vec()))
}

Expand Down
8 changes: 4 additions & 4 deletions nomt/src/beatree/ops/update/leaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ impl LeafUpdater {
let node = self.build_leaf(&self.ops[last_ops_start..]);
let separator = self.separator();

let pn = leaf_writer.allocate(node);
let pn = leaf_writer.write(node);
branch_updater.ingest(separator, pn);

self.ops.clear();
Expand Down Expand Up @@ -264,7 +264,7 @@ impl LeafUpdater {
}

// write the node and provide it to the branch above.
let pn = leaf_writer.allocate(new_node);
let pn = leaf_writer.write(new_node);
branch_updater.ingest(separator, pn);

start += item_count;
Expand Down Expand Up @@ -319,7 +319,7 @@ impl LeafUpdater {

let left_leaf = self.build_leaf(left_ops);

let left_pn = leaf_writer.allocate(left_leaf);
let left_pn = leaf_writer.write(left_leaf);

branch_updater.ingest(left_separator, left_pn);

Expand All @@ -335,7 +335,7 @@ impl LeafUpdater {

if right_gauge.body_size() >= LEAF_MERGE_THRESHOLD || self.cutoff.is_none() {
let right_leaf = self.build_leaf(right_ops);
let right_pn = leaf_writer.allocate(right_leaf);
let right_pn = leaf_writer.write(right_leaf);
branch_updater.ingest(right_separator, right_pn);

self.ops.clear();
Expand Down
6 changes: 3 additions & 3 deletions nomt/src/beatree/ops/update/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::beatree::{
branch::{BranchNodePool, BRANCH_NODE_BODY_SIZE},
index::Index,
leaf::{
node::LEAF_NODE_BODY_SIZE,
node::{LeafNode, LEAF_NODE_BODY_SIZE},
store::{LeafStoreReader, LeafStoreWriter},
},
Key,
Expand Down Expand Up @@ -118,7 +118,7 @@ impl Updater {
})
.map(|(id, separator)| BaseLeaf {
id,
node: ctx.leaf_reader.query(id),
node: LeafNode { inner: ctx.leaf_reader.query(id) },
iter_pos: 0,
separator,
});
Expand Down Expand Up @@ -210,7 +210,7 @@ impl Updater {
fn reset_leaf_base(&mut self, target: Key, ctx: &Ctx) -> Result<(), ()> {
let branch = self.branch_updater.base().ok_or(())?;
let (i, leaf_pn) = super::search_branch(&branch.node, target).ok_or(())?;
let leaf = ctx.leaf_reader.query(leaf_pn);
let leaf = LeafNode { inner: ctx.leaf_reader.query(leaf_pn) };

let separator = reconstruct_key(branch.node.prefix(), branch.node.separator(i));

Expand Down

0 comments on commit be038f1

Please sign in to comment.