Skip to content

Commit

Permalink
Support syncing historical files without NewFile gossip messages (#269)
Browse files Browse the repository at this point in the history
* Support randomly syncing historical files

* Add name for random file sync batcher

* Remove sync store metrics since multiple random batchers are created

* Optimize logging

* ignore pruned or finalized historical file

* Add python tests for historical file sync
  • Loading branch information
boqiu authored Nov 15, 2024
1 parent 4566ead commit e912522
Show file tree
Hide file tree
Showing 8 changed files with 180 additions and 18 deletions.
1 change: 1 addition & 0 deletions node/storage/src/log_store/tx_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ const LOG_SYNC_PROGRESS_KEY: &str = "log_sync_progress";
const NEXT_TX_KEY: &str = "next_tx_seq";
const LOG_LATEST_BLOCK_NUMBER_KEY: &str = "log_latest_block_number_key";

#[derive(Debug)]
pub enum TxStatus {
Finalized,
Pruned,
Expand Down
17 changes: 10 additions & 7 deletions node/sync/src/auto_sync/batcher.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::{controllers::SyncState, SyncRequest, SyncResponse, SyncSender};
use anyhow::{bail, Result};
use serde::{Deserialize, Serialize};
use shared_types::TxSeqOrRoot;
use std::{collections::HashSet, fmt::Debug, sync::Arc, time::Duration};
use storage_async::Store;
use tokio::sync::RwLock;
Expand Down Expand Up @@ -84,14 +85,16 @@ impl Batcher {
}

async fn poll_tx(&self, tx_seq: u64) -> Result<Option<SyncResult>> {
// file already exists
if self.store.check_tx_completed(tx_seq).await?
|| self.store.check_tx_pruned(tx_seq).await?
// file already finalized or even pruned
if let Some(tx_status) = self
.store
.get_store()
.get_tx_status(TxSeqOrRoot::TxSeq(tx_seq))?
{
// File may be finalized during file sync, e.g. user uploaded file via RPC.
// In this case, just terminate the file sync.
let num_terminated = self.terminate_file_sync(tx_seq, false).await;
info!(%tx_seq, %num_terminated, "Terminate file sync due to file already finalized in db");
let num_terminated: usize = self.terminate_file_sync(tx_seq, false).await;
if num_terminated > 0 {
info!(%tx_seq, %num_terminated, ?tx_status, "Terminate file sync due to file already completed in db");
}
return Ok(Some(SyncResult::Completed));
}

Expand Down
17 changes: 11 additions & 6 deletions node/sync/src/auto_sync/batcher_random.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,30 @@ use tokio::time::sleep;
/// Snapshot of a random batcher's state, as assembled by `RandomBatcher::get_state`.
///
/// Serialized with camelCase field names.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RandomBatcherState {
/// Name of the batcher instance that produced this snapshot.
pub name: String,
/// Result of `Batcher::tasks()` at snapshot time.
/// NOTE(review): presumably the tx sequences currently being synced — confirm against `Batcher`.
pub tasks: Vec<u64>,
/// First element of the tuple returned by `SyncStore::stat()`.
pub pending_txs: usize,
/// Second element of the tuple returned by `SyncStore::stat()`.
pub ready_txs: usize,
}

/// File sync batcher that draws its work from a `SyncStore`.
///
/// Each instance carries a name so that several batchers can be told apart
/// in logs and in `RandomBatcherState`.
#[derive(Clone)]
pub struct RandomBatcher {
// Label identifying this instance; copied into every `RandomBatcherState`.
name: String,
// Sync configuration; `max_random_workers` is handed to the inner `Batcher`.
config: Config,
// Underlying batcher whose tasks are reported by `get_state`.
batcher: Batcher,
// Queue store queried via `stat()` for the pending/ready counters.
sync_store: Arc<SyncStore>,
}

impl RandomBatcher {
pub fn new(
name: String,
config: Config,
store: Store,
sync_send: SyncSender,
sync_store: Arc<SyncStore>,
) -> Self {
Self {
name,
config,
batcher: Batcher::new(
config.max_random_workers,
Expand All @@ -50,14 +54,15 @@ impl RandomBatcher {
let (pending_txs, ready_txs) = self.sync_store.stat().await?;

Ok(RandomBatcherState {
name: self.name.clone(),
tasks: self.batcher.tasks().await,
pending_txs,
ready_txs,
})
}

pub async fn start(mut self, catched_up: Arc<AtomicBool>) {
info!("Start to sync files");
info!("Start to sync files, state = {:?}", self.get_state().await);

// wait for log entry sync catched up
while !catched_up.load(Ordering::Relaxed) {
Expand All @@ -66,11 +71,11 @@ impl RandomBatcher {
}

loop {
if let Ok(state) = self.get_state().await {
metrics::RANDOM_STATE_TXS_SYNCING.update(state.tasks.len() as u64);
metrics::RANDOM_STATE_TXS_READY.update(state.ready_txs as u64);
metrics::RANDOM_STATE_TXS_PENDING.update(state.pending_txs as u64);
}
// if let Ok(state) = self.get_state().await {
// metrics::RANDOM_STATE_TXS_SYNCING.update(state.tasks.len() as u64);
// metrics::RANDOM_STATE_TXS_READY.update(state.ready_txs as u64);
// metrics::RANDOM_STATE_TXS_PENDING.update(state.pending_txs as u64);
// }

match self.sync_once().await {
Ok(true) => {}
Expand Down
108 changes: 108 additions & 0 deletions node/sync/src/auto_sync/historical_tx_writer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
use std::sync::{
atomic::{AtomicU64, Ordering},
Arc,
};

use anyhow::Result;
use serde::{Deserialize, Serialize};
use storage::log_store::log_manager::DATA_DB_KEY;
use storage_async::Store;
use tokio::time::sleep;

use crate::Config;

use super::sync_store::{Queue, SyncStore};

/// Config key under which the historical writer persists the next tx sequence to examine.
/// It is read in `HistoricalTxWriter::new` and rewritten after every step of `write_once`.
const KEY_NEXT_TX_SEQ: &str = "sync.manager.historical.next_tx_seq";

/// Snapshot of the historical tx writer's progress, as returned by
/// `HistoricalTxWriter::get_state`.
///
/// Serialized with camelCase field names.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct HistoricalTxWriterState {
/// Next tx sequence the writer will examine.
pub next_tx_seq: u64,
/// First element of the tuple returned by `SyncStore::stat()`.
pub pending_txs: usize,
/// Second element of the tuple returned by `SyncStore::stat()`.
pub ready_txs: usize,
}

/// Task that walks tx sequences in ascending order and inserts every tx for which
/// the log store reports no status into the `Ready` queue of its sync store.
pub struct HistoricalTxWriter {
// Supplies the idle and error sleep intervals used by `start`.
config: Config,
// Async store used to look up tx status and to persist the scan position.
store: Store,
// Queue store that receives the tx sequences to be synced.
sync_store: Arc<SyncStore>,
// Next tx sequence to examine; mirrors the value persisted under `KEY_NEXT_TX_SEQ`.
next_tx_seq: Arc<AtomicU64>,
}

impl HistoricalTxWriter {
    /// Builds a writer that resumes from the tx sequence persisted under
    /// `KEY_NEXT_TX_SEQ`, starting from 0 when nothing has been persisted yet.
    ///
    /// # Errors
    ///
    /// Returns an error if the persisted position cannot be read from `store`.
    pub async fn new(config: Config, store: Store, sync_store: Arc<SyncStore>) -> Result<Self> {
        let persisted = store
            .get_config_decoded(&KEY_NEXT_TX_SEQ, DATA_DB_KEY)
            .await?;
        let next_tx_seq = Arc::new(AtomicU64::new(persisted.unwrap_or(0)));

        Ok(Self {
            config,
            store,
            sync_store,
            next_tx_seq,
        })
    }

    /// Reports the current scan position together with the sync store counters.
    ///
    /// # Errors
    ///
    /// Returns an error if the sync store statistics cannot be fetched.
    pub async fn get_state(&self) -> Result<HistoricalTxWriterState> {
        let (pending_txs, ready_txs) = self.sync_store.stat().await?;
        let next_tx_seq = self.next_tx_seq.load(Ordering::Relaxed);

        Ok(HistoricalTxWriterState {
            next_tx_seq,
            pending_txs,
            ready_txs,
        })
    }

    /// Runs the writer forever: advances one tx per step, and sleeps for the
    /// configured idle or error interval whenever a step made no progress.
    pub async fn start(mut self) {
        info!(
            "Start to write historical files into sync store, state = {:?}",
            self.get_state().await
        );

        loop {
            // Pick how long to back off; a successful step loops again immediately.
            let pause = match self.write_once().await {
                Ok(true) => continue,
                Ok(false) => {
                    trace!(
                        "There is no tx to write in sync store, state = {:?}",
                        self.get_state().await
                    );
                    self.config.auto_sync_idle_interval
                }
                Err(err) => {
                    warn!(%err, "Failed to write tx once, state = {:?}", self.get_state().await);
                    self.config.auto_sync_error_interval
                }
            };

            sleep(pause).await;
        }
    }

    /// Examines a single tx sequence and advances the scan position.
    ///
    /// Returns `Ok(false)` when the position has reached the store's
    /// `next_tx_seq()` (nothing to examine), and `Ok(true)` after one tx has
    /// been examined and the new position has been persisted.
    async fn write_once(&mut self) -> Result<bool> {
        let tx_seq = self.next_tx_seq.load(Ordering::Relaxed);

        // Caught up with the log store: nothing to write into the sync store.
        if tx_seq >= self.store.get_store().next_tx_seq() {
            return Ok(false);
        }

        // Only txs without a recorded status (i.e. not finalized or pruned) get queued.
        let needs_sync = self
            .store
            .get_store()
            .get_tx_status(shared_types::TxSeqOrRoot::TxSeq(tx_seq))?
            .is_none();
        if needs_sync {
            self.sync_store.insert(tx_seq, Queue::Ready).await?;
        }

        // Persist the new position before publishing it in memory.
        let following = tx_seq + 1;
        self.store
            .set_config_encoded(&KEY_NEXT_TX_SEQ, &following, DATA_DB_KEY)
            .await?;
        self.next_tx_seq.store(following, Ordering::Relaxed);

        Ok(true)
    }
}
35 changes: 34 additions & 1 deletion node/sync/src/auto_sync/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::{Config, SyncSender};
use super::{
batcher_random::RandomBatcher,
batcher_serial::SerialBatcher,
historical_tx_writer::HistoricalTxWriter,
sync_store::{Queue, SyncStore},
};

Expand Down Expand Up @@ -76,7 +77,13 @@ impl AutoSyncManager {
};

// sync randomly
let random = RandomBatcher::new(config, store, sync_send, sync_store);
let random = RandomBatcher::new(
"random".into(),
config,
store.clone(),
sync_send.clone(),
sync_store,
);
executor.spawn(random.clone().start(catched_up.clone()), "auto_sync_random");

// handle on catched up notification
Expand All @@ -85,6 +92,32 @@ impl AutoSyncManager {
"auto_sync_wait_for_catchup",
);

// sync randomly for files without NewFile announcement
if config.neighbors_only {
let historical_sync_store = Arc::new(SyncStore::new_with_name(
store.clone(),
"pendingv2_historical",
"readyv2_historical",
));

let writer =
HistoricalTxWriter::new(config, store.clone(), historical_sync_store.clone())
.await?;
executor.spawn(writer.start(), "auto_sync_historical_writer");

let random_historical = RandomBatcher::new(
"random_historical".into(),
config,
store,
sync_send,
historical_sync_store,
);
executor.spawn(
random_historical.start(catched_up.clone()),
"auto_sync_random_historical",
);
}

Ok(Self {
serial,
random,
Expand Down
6 changes: 3 additions & 3 deletions node/sync/src/auto_sync/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ lazy_static::lazy_static! {
pub static ref SEQUENTIAL_SYNC_RESULT_TIMEOUT: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_sequential_sync_result_timeout");

// random auto sync
pub static ref RANDOM_STATE_TXS_SYNCING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_syncing", 1024);
pub static ref RANDOM_STATE_TXS_READY: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_ready", 1024);
pub static ref RANDOM_STATE_TXS_PENDING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_pending", 1024);
// pub static ref RANDOM_STATE_TXS_SYNCING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_syncing", 1024);
// pub static ref RANDOM_STATE_TXS_READY: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_ready", 1024);
// pub static ref RANDOM_STATE_TXS_PENDING: Arc<dyn Histogram> = Sample::ExpDecay(0.015).register("sync_auto_random_state_txs_pending", 1024);

pub static ref RANDOM_SYNC_RESULT_COMPLETED: Arc<dyn Meter> = register_meter("sync_auto_random_sync_result_completed");
pub static ref RANDOM_SYNC_RESULT_FAILED: Arc<dyn Counter<usize>> = CounterUsize::register("sync_auto_random_sync_result_failed");
Expand Down
1 change: 1 addition & 0 deletions node/sync/src/auto_sync/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod batcher;
pub mod batcher_random;
pub mod batcher_serial;
mod historical_tx_writer;
pub mod manager;
mod metrics;
pub mod sync_store;
Expand Down
13 changes: 12 additions & 1 deletion tests/sync_auto_random_v2_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,27 @@ def setup_params(self):
}

def run_test(self):
# Stop the last node to verify historical file sync
self.stop_storage_node(self.num_nodes - 1)

# Submit and upload files on node 0
data_root_1 = self.__upload_file__(0, 256 * 1024)
data_root_2 = self.__upload_file__(0, 256 * 1024)

# Files should be available on other nodes via auto sync
for i in range(1, self.num_nodes):
for i in range(1, self.num_nodes - 1):
wait_until(lambda: self.nodes[i].zgs_get_file_info(data_root_1) is not None)
wait_until(lambda: self.nodes[i].zgs_get_file_info(data_root_1)["finalized"])
wait_until(lambda: self.nodes[i].zgs_get_file_info(data_root_2) is not None)
wait_until(lambda: self.nodes[i].zgs_get_file_info(data_root_2)["finalized"])

# Start the last node to verify historical file sync
self.start_storage_node(self.num_nodes - 1)
self.nodes[self.num_nodes - 1].wait_for_rpc_connection()
wait_until(lambda: self.nodes[self.num_nodes - 1].zgs_get_file_info(data_root_1) is not None)
wait_until(lambda: self.nodes[self.num_nodes - 1].zgs_get_file_info(data_root_1)["finalized"])
wait_until(lambda: self.nodes[self.num_nodes - 1].zgs_get_file_info(data_root_2) is not None)
wait_until(lambda: self.nodes[self.num_nodes - 1].zgs_get_file_info(data_root_2)["finalized"])

if __name__ == "__main__":
AutoRandomSyncV2Test().main()

0 comments on commit e912522

Please sign in to comment.