From f54e3e9147bf1dd341e22a0fc01cf5c5d71843e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 17:54:03 +0200 Subject: [PATCH] Also consider offloaded timelines for obtaining retain_lsn (#9308) Also consider offloaded timelines for obtaining `retain_lsn`. This is required for correctness for all timelines that have not been flattened yet: otherwise we GC data that might still be required for reading. This somewhat counteracts the original purpose of timeline offloading of not having to iterate over offloaded timelines, but sadly it's required. In the future, we can improve the way the offloaded timelines are stored. We also make the `retain_lsn` optional so that in the future, when we implement flattening, we can make it None. This also applies to full timeline objects by the way, where it would probably make most sense to add a bool flag whether the timeline is successfully flattened, and if it is, one can exclude it from `retain_lsn` as well. Also, track whether a timeline was offloaded or not in `retain_lsn` so that the `retain_lsn` can be excluded from visibility and size calculation. Part of #8088 --- pageserver/src/tenant.rs | 56 ++++++++++++++++---- pageserver/src/tenant/size.rs | 8 +-- pageserver/src/tenant/timeline.rs | 21 +++++--- pageserver/src/tenant/timeline/compaction.rs | 9 ++-- 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2818d04dc69..397778d4c834 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -493,6 +493,8 @@ pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, // TODO: once we persist offloaded state, make this lazily constructed pub remote_client: Arc, @@ -504,10 +506,14 @@ pub struct OffloadedTimeline { impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { + let ancestor_retain_lsn = timeline + .get_ancestor_timeline_id() + .map(|_timeline_id| timeline.get_ancestor_lsn()); Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_retain_lsn, remote_client: timeline.remote_client.clone(), delete_progress: timeline.delete_progress.clone(), @@ -515,6 +521,12 @@ impl OffloadedTimeline { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub enum MaybeOffloaded { + Yes, + No, +} + #[derive(Clone)] pub enum TimelineOrOffloaded { Timeline(Arc), @@ -2253,12 +2265,13 @@ impl Tenant { if activating { let timelines_accessor = self.timelines.lock().unwrap(); + let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor); + self.initialize_gc_info(&timelines_accessor, &timelines_offloaded_accessor); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. @@ -3298,6 +3311,7 @@ impl Tenant { fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, + timelines_offloaded: &std::sync::MutexGuard>>, ) { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. @@ -3305,20 +3319,37 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + ancestor_children.push(( + timeline_entry.get_ancestor_lsn(), + *timeline_id, + MaybeOffloaded::No, + )); } }); + timelines_offloaded + .iter() + .for_each(|(timeline_id, timeline_entry)| { + let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { + return; + }; + let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { + return; + }; + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); + }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches for timeline in timelines.values() { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints .remove(&timeline.timeline_id) .unwrap_or_default(); @@ -4878,7 +4909,10 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); + assert_eq!( + branchpoints[0], + (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No) + ); } // You can read the key from the child branch even though the parent is @@ -8261,8 +8295,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8489,8 +8523,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8723,7 +8757,7 @@ mod tests { // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x10), space: Lsn(0x10), @@ -8737,7 +8771,7 @@ mod tests { // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x50), space: Lsn(0x50), diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 41d558d3f68a..4a4c698b5655 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::Timeline; +use crate::tenant::{MaybeOffloaded, Timeline}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -264,10 +264,12 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) + .filter(|(lsn, _child_id, is_offloaded)| { + lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No + }) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fd4e699cfbf..8f098d0e8299 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -139,8 +139,10 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + config::TenantConf, + storage_layer::{inmemory_layer, LayerVisibilityHint}, upload_queue::NotInitialized, + MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; @@ -450,7 +452,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. pub(crate) cutoffs: GcCutoffs, @@ -467,8 +469,13 @@ impl GcInfo { self.cutoffs.select_min() } - pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { - self.retain_lsns.push((child_lsn, child_id)); + pub(super) fn insert_child( + &mut self, + child_id: TimelineId, + child_lsn: Lsn, + is_offloaded: MaybeOffloaded, + ) { + self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } @@ -2164,7 +2171,9 @@ impl Timeline { if let Some(ancestor) = &ancestor { let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); - ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + // If we construct an explicit timeline object, it's obviously not offloaded + let is_offloaded = MaybeOffloaded::No; + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } Arc::new_cyclic(|myself| { @@ -4875,7 +4884,7 @@ impl Timeline { let retain_lsns = gc_info .retain_lsns .iter() - .map(|(lsn, _child_id)| *lsn) + .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9f64471432e3..8b9ace1e5bbf 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,7 +42,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::DeltaLayer; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, @@ -639,7 +639,10 @@ impl Timeline { let children = self.gc_info.read().unwrap().retain_lsns.clone(); let mut readable_points = Vec::with_capacity(children.len() + 1); - for (child_lsn, _child_timeline_id) in &children { + for (child_lsn, _child_timeline_id, is_offloaded) in &children { + if *is_offloaded == MaybeOffloaded::Yes { + continue; + } readable_points.push(*child_lsn); } readable_points.push(head_lsn); @@ -1741,7 +1744,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for (lsn, _timeline_id) in &gc_info.retain_lsns { + for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); }