diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs index 5f185f5f0d10..8987e780a609 100644 --- a/src/meta-srv/src/error.rs +++ b/src/meta-srv/src/error.rs @@ -537,8 +537,18 @@ pub enum Error { location: Location, }, + #[snafu(display("Expected to retry later"))] + RetryLater { + location: Location, + source: BoxedError, + }, + #[snafu(display("Expected to retry later, reason: {}", reason))] - RetryLater { reason: String, location: Location }, + RetryLaterWithReason { + reason: String, + location: Location, + source: BoxedError, + }, #[snafu(display("Failed to update table metadata, err_msg: {}", err_msg))] UpdateTableMetadata { err_msg: String, location: Location }, @@ -628,6 +638,7 @@ impl Error { /// Returns `true` if the error is retryable. pub fn is_retryable(&self) -> bool { matches!(self, Error::RetryLater { .. }) + || matches!(self, Error::RetryLaterWithReason { .. }) } } @@ -665,6 +676,7 @@ impl ErrorExt for Error { | Error::MailboxTimeout { .. } | Error::MailboxReceiver { .. } | Error::RetryLater { .. } + | Error::RetryLaterWithReason { .. } | Error::StartGrpc { .. } | Error::UpdateTableMetadata { .. } | Error::NoEnoughAvailableDatanode { .. } diff --git a/src/meta-srv/src/procedure/region_failover/activate_region.rs b/src/meta-srv/src/procedure/region_failover/activate_region.rs index a2b1c8fd9303..4be5bb782930 100644 --- a/src/meta-srv/src/procedure/region_failover/activate_region.rs +++ b/src/meta-srv/src/procedure/region_failover/activate_region.rs @@ -17,6 +17,7 @@ use std::time::Duration; use api::v1::meta::MailboxMessage; use async_trait::async_trait; +use common_error::ext::BoxedError; use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply}; use common_meta::key::datanode_table::{DatanodeTableKey, RegionInfo}; use common_meta::peer::Peer; @@ -30,6 +31,7 @@ use super::update_metadata::UpdateRegionMetadata; use super::{RegionFailoverContext, State}; use crate::error::{ self, Error, Result, RetryLaterSnafu, SerializeToJsonSnafu, UnexpectedInstructionReplySnafu, + UnexpectedSnafu, }; use crate::handler::HeartbeatMailbox; use crate::procedure::region_failover::OPEN_REGION_MESSAGE_TIMEOUT; @@ -162,19 +164,26 @@ impl ActivateRegion { // would be in vain. Then why not just end the failover procedure? Because we // currently lack the methods or any maintenance tools to manage the whole // procedures things, it would be easier to let the procedure keep running. - let reason = format!( + let violated = format!( "Region {failed_region:?} is not opened by Datanode {:?}, error: {error:?}", self.candidate, ); - RetryLaterSnafu { reason }.fail() + + UnexpectedSnafu { violated } + .fail() + .map_err(BoxedError::new) + .context(RetryLaterSnafu) } } Err(Error::MailboxTimeout { .. }) => { - let reason = format!( + let violated = format!( "Mailbox received timeout for activate failed region {failed_region:?} on Datanode {:?}", self.candidate, ); - RetryLaterSnafu { reason }.fail() + UnexpectedSnafu { violated } + .fail() + .map_err(BoxedError::new) + .context(RetryLaterSnafu) } Err(e) => Err(e), } diff --git a/src/meta-srv/src/procedure/region_failover/deactivate_region.rs b/src/meta-srv/src/procedure/region_failover/deactivate_region.rs index 650c794126a6..a8a09c38cb5c 100644 --- a/src/meta-srv/src/procedure/region_failover/deactivate_region.rs +++ b/src/meta-srv/src/procedure/region_failover/deactivate_region.rs @@ -16,6 +16,7 @@ use std::time::Duration; use api::v1::meta::MailboxMessage; use async_trait::async_trait; +use common_error::ext::BoxedError; use common_meta::instruction::{Instruction, InstructionReply, SimpleReply}; use common_meta::peer::Peer; use common_meta::rpc::router::RegionStatus; @@ -28,6 +29,7 @@ use super::activate_region::ActivateRegion; use super::{RegionFailoverContext, State}; use crate::error::{ self, Error, Result, RetryLaterSnafu, SerializeToJsonSnafu, UnexpectedInstructionReplySnafu, + UnexpectedSnafu, }; use crate::handler::HeartbeatMailbox; use crate::service::mailbox::{Channel, MailboxReceiver}; @@ -117,11 +119,15 @@ impl DeactivateRegion { } else { // Under rare circumstances would a Datanode fail to close a Region. // So simply retry. - let reason = format!( + UnexpectedSnafu { + violated: format!( "Region {failed_region:?} is not closed by Datanode {}, error: {error:?}", failed_region.datanode_id, - ); - RetryLaterSnafu { reason }.fail() + ), + } + .fail() + .map_err(BoxedError::new) + .context(RetryLaterSnafu) } } Err(Error::MailboxTimeout { .. }) => { diff --git a/src/meta-srv/src/procedure/region_failover/failover_start.rs b/src/meta-srv/src/procedure/region_failover/failover_start.rs index 003cd7e4b9c8..d88de13d6206 100644 --- a/src/meta-srv/src/procedure/region_failover/failover_start.rs +++ b/src/meta-srv/src/procedure/region_failover/failover_start.rs @@ -13,17 +13,17 @@ // limitations under the License. use async_trait::async_trait; -use common_error::ext::ErrorExt; +use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_meta::peer::Peer; use common_meta::RegionIdent; use common_telemetry::info; use serde::{Deserialize, Serialize}; -use snafu::ensure; +use snafu::{ensure, location, Location}; use super::deactivate_region::DeactivateRegion; use super::{RegionFailoverContext, State}; -use crate::error::{RegionFailoverCandidatesNotFoundSnafu, Result, RetryLaterSnafu}; +use crate::error::{self, RegionFailoverCandidatesNotFoundSnafu, Result}; use crate::selector::SelectorOptions; #[derive(Serialize, Deserialize, Debug)] @@ -93,10 +93,10 @@ impl State for RegionFailoverStart { .await .map_err(|e| { if e.status_code() == StatusCode::RuntimeResourcesExhausted { - RetryLaterSnafu { - reason: format!("{e}"), + error::Error::RetryLater { + source: BoxedError::new(e), + location: location!(), } - .build() } else { e } diff --git a/src/meta-srv/src/procedure/region_failover/update_metadata.rs b/src/meta-srv/src/procedure/region_failover/update_metadata.rs index 2548f03763f0..42c5489d3f8c 100644 --- a/src/meta-srv/src/procedure/region_failover/update_metadata.rs +++ b/src/meta-srv/src/procedure/region_failover/update_metadata.rs @@ -15,6 +15,7 @@ use std::collections::HashMap; use async_trait::async_trait; +use common_error::ext::BoxedError; use common_meta::key::datanode_table::RegionInfo; use common_meta::key::table_route::TableRouteKey; use common_meta::peer::Peer; @@ -27,7 +28,7 @@ use store_api::storage::RegionNumber; use super::invalidate_cache::InvalidateCache; use super::{RegionFailoverContext, State}; -use crate::error::{self, Result, RetryLaterSnafu, TableRouteNotFoundSnafu}; +use crate::error::{self, Result, TableRouteNotFoundSnafu, UnexpectedSnafu}; use crate::lock::keys::table_metadata_lock_key; use crate::lock::Opts; @@ -173,14 +174,17 @@ impl State for UpdateRegionMetadata { self.update_metadata(ctx, failed_region) .await .map_err(|e| { - RetryLaterSnafu { - reason: format!( - "Failed to update metadata for failed region: {}, error: {}", - failed_region, e - ), - } - .build() - })?; + BoxedError::new( + UnexpectedSnafu { + violated: format!( + "Failed to update metadata for failed region: {}, error: {}", + failed_region, e + ), + } + .build(), + ) + }) + .context(error::RetryLaterSnafu)?; Ok(Box::new(InvalidateCache)) } } diff --git a/src/meta-srv/src/procedure/region_migration.rs b/src/meta-srv/src/procedure/region_migration.rs index ff5d0ecc45f7..15202c18a3e9 100644 --- a/src/meta-srv/src/procedure/region_migration.rs +++ b/src/meta-srv/src/procedure/region_migration.rs @@ -30,6 +30,7 @@ use std::fmt::Debug; use std::time::Duration; use api::v1::meta::MailboxMessage; +use common_error::ext::BoxedError; use common_meta::instruction::Instruction; use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue}; use common_meta::key::table_info::TableInfoValue; @@ -45,7 +46,7 @@ use common_procedure::error::{ use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status, StringKey}; pub use manager::RegionMigrationProcedureTask; use serde::{Deserialize, Serialize}; -use snafu::{location, Location, OptionExt, ResultExt}; +use snafu::{OptionExt, ResultExt}; use store_api::storage::RegionId; use tokio::time::Instant; @@ -220,10 +221,8 @@ impl Context { .get(table_id) .await .context(error::TableMetadataManagerSnafu) - .map_err(|e| error::Error::RetryLater { - reason: e.to_string(), - location: location!(), - })? + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu)? .context(error::TableRouteNotFoundSnafu { table_id })?; *table_route_value = Some(table_route); @@ -256,10 +255,8 @@ impl Context { .get(table_id) .await .context(error::TableMetadataManagerSnafu) - .map_err(|e| error::Error::RetryLater { - reason: e.to_string(), - location: location!(), - })? + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu)? .context(error::TableInfoNotFoundSnafu { table_id })?; *table_info_value = Some(table_info); @@ -289,10 +286,8 @@ impl Context { }) .await .context(error::TableMetadataManagerSnafu) - .map_err(|e| error::Error::RetryLater { - reason: e.to_string(), - location: location!(), - })? + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu)? .context(error::DatanodeTableNotFoundSnafu { table_id, datanode_id, diff --git a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs index 73ded293657e..6b8901e22568 100644 --- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs @@ -16,6 +16,7 @@ use std::any::Any; use std::time::Duration; use api::v1::meta::MailboxMessage; +use common_error::ext::BoxedError; use common_meta::distributed_time_constants::{MAILBOX_RTT_SECS, REGION_LEASE_SECS}; use common_meta::instruction::{ DowngradeRegion, DowngradeRegionReply, Instruction, InstructionReply, @@ -146,13 +147,15 @@ impl DowngradeLeaderRegion { }; if error.is_some() { - return error::RetryLaterSnafu { - reason: format!( + return error::UnexpectedSnafu { + violated: format!( "Failed to downgrade the region {} on Datanode {:?}, error: {:?}", region_id, leader, error ), } - .fail(); + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu); } if !exists { @@ -169,11 +172,15 @@ impl DowngradeLeaderRegion { Ok(()) } Err(error::Error::MailboxTimeout { .. }) => { - let reason = format!( - "Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}", - leader, - ); - error::RetryLaterSnafu { reason }.fail() + error::UnexpectedSnafu { + violated: format!( + "Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}", + leader, + ), + } + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu)? } Err(err) => Err(err), } @@ -374,7 +381,7 @@ mod tests { assert_matches!(err, Error::RetryLater { .. }); assert!(err.is_retryable()); - assert!(err.to_string().contains("test mocked")); + assert!(format!("{err:?}").contains("test mocked"), "err: {err:?}",); } #[tokio::test] diff --git a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs index 0335a447c7e7..8de31e98cda0 100644 --- a/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/open_candidate_region.rs @@ -16,6 +16,7 @@ use std::any::Any; use std::time::Duration; use api::v1::meta::MailboxMessage; +use common_error::ext::BoxedError; use common_meta::distributed_time_constants::MAILBOX_RTT_SECS; use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply}; use common_meta::key::datanode_table::RegionInfo; @@ -155,21 +156,24 @@ impl OpenCandidateRegion { if result { Ok(()) } else { - error::RetryLaterSnafu { - reason: format!( - "Region {region_id} is not opened by datanode {:?}, error: {error:?}", - candidate, - ), - } - .fail() + + error::UnexpectedSnafu { violated:format!( + "Region {region_id} is not opened by datanode {:?}, error: {error:?}", + candidate, + ) } + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu)? } } Err(error::Error::MailboxTimeout { .. }) => { - let reason = format!( + error::UnexpectedSnafu { violated:format!( "Mailbox received timeout for open candidate region {region_id} on datanode {:?}", candidate, - ); - error::RetryLaterSnafu { reason }.fail() + ) } + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu) } Err(e) => Err(e), } @@ -383,7 +387,7 @@ mod tests { assert_matches!(err, Error::RetryLater { .. }); assert!(err.is_retryable()); - assert!(err.to_string().contains("test mocked")); + assert!(format!("{err:?}").contains("test mocked")); } #[tokio::test] diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs index f764a424e864..bcd5a82b4573 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/downgrade_leader_region.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_error::ext::BoxedError; use common_meta::rpc::router::RegionStatus; use snafu::ResultExt; @@ -61,9 +62,12 @@ impl UpdateMetadata { .context(error::TableMetadataManagerSnafu) { ctx.remove_table_route_value(); - return error::RetryLaterSnafu { - reason: format!("Failed to update the table route during the downgrading leader region, error: {err}") - }.fail(); + return error::UnexpectedSnafu { + violated: format!("Failed to update the table route during the downgrading leader region, error: {err}"), + } + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu); } ctx.remove_table_route_value(); @@ -163,13 +167,9 @@ mod tests { ctx.volatile_ctx.table_route = Some(original_table_route); let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err(); - assert!(ctx.volatile_ctx.table_route.is_none()); - - assert_matches!(err, Error::RetryLater { .. }); - assert!(err.is_retryable()); - assert!(err.to_string().contains("Failed to update the table route")); + assert!(format!("{err:?}").contains("Failed to update the table route")); } #[tokio::test] diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs index 6d066a1b863f..75a3c3a5f8e2 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/rollback_downgraded_region.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_error::ext::BoxedError; use snafu::ResultExt; use crate::error::{self, Result}; @@ -45,9 +46,10 @@ impl UpdateMetadata { .context(error::TableMetadataManagerSnafu) { ctx.remove_table_route_value(); - return error::RetryLaterSnafu { - reason: format!("Failed to update the table route during the rollback downgraded leader region, error: {err}") - }.fail(); + return Err(BoxedError::new(err)). + context(error::RetryLaterWithReasonSnafu{ + reason: format!("Failed to update the table route during the rollback downgraded leader region: {region_id}") + }); } ctx.remove_table_route_value(); @@ -157,9 +159,8 @@ mod tests { .await .unwrap_err(); assert!(ctx.volatile_ctx.table_route.is_none()); - assert_matches!(err, Error::RetryLater { .. }); assert!(err.is_retryable()); - assert!(err.to_string().contains("Failed to update the table route")); + assert!(format!("{err:?}").contains("Failed to update the table route")); state.rollback_downgraded_region(&mut ctx).await.unwrap(); diff --git a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs index e69d0cd714e1..227a5bcbf89a 100644 --- a/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/update_metadata/upgrade_candidate_region.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use common_error::ext::BoxedError; use common_meta::key::datanode_table::RegionInfo; use common_meta::rpc::router::{region_distribution, RegionRoute}; use common_telemetry::{info, warn}; @@ -168,9 +169,10 @@ impl UpdateMetadata { .context(error::TableMetadataManagerSnafu) { ctx.remove_table_route_value(); - return error::RetryLaterSnafu { - reason: format!("Failed to update the table route during the upgrading candidate region, error: {err}") - }.fail(); + return Err(BoxedError::new(err)). + context(error::RetryLaterWithReasonSnafu{ + reason: format!("Failed to update the table route during the upgrading candidate region: {region_id}")}, + ); }; ctx.remove_table_route_value(); @@ -354,15 +356,12 @@ mod tests { .register(2, RegionId::new(table_id, 1)) .unwrap(); ctx.volatile_ctx.opening_region_guard = Some(guard); - let err = state.upgrade_candidate_region(&mut ctx).await.unwrap_err(); assert!(ctx.volatile_ctx.table_route.is_none()); assert!(ctx.volatile_ctx.opening_region_guard.is_some()); - assert_matches!(err, Error::RetryLater { .. }); - assert!(err.is_retryable()); - assert!(err.to_string().contains("Failed to update the table route")); + assert!(format!("{err:?}").contains("Failed to update the table route")); } #[tokio::test] diff --git a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs index 514be461a8f0..2e9a84e397b8 100644 --- a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs +++ b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs @@ -16,6 +16,7 @@ use std::any::Any; use std::time::Duration; use api::v1::meta::MailboxMessage; +use common_error::ext::BoxedError; use common_meta::distributed_time_constants::MAILBOX_RTT_SECS; use common_meta::instruction::{Instruction, InstructionReply, UpgradeRegion, UpgradeRegionReply}; use common_procedure::Status; @@ -146,13 +147,15 @@ impl UpgradeCandidateRegion { // Notes: The order of handling is important. if error.is_some() { - return error::RetryLaterSnafu { - reason: format!( + error::UnexpectedSnafu { + violated: format!( "Failed to upgrade the region {} on datanode {:?}, error: {:?}", region_id, candidate, error ), } - .fail(); + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu)? } ensure!( @@ -166,23 +169,29 @@ impl UpgradeCandidateRegion { ); if self.require_ready && !ready { - return error::RetryLaterSnafu { - reason: format!( + return error::UnexpectedSnafu { + violated: format!( "Candidate region {} still replaying the wal on datanode {:?}", region_id, candidate ), } - .fail(); + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu); } Ok(()) } Err(error::Error::MailboxTimeout { .. }) => { - let reason = format!( + error::UnexpectedSnafu { + violated: format!( "Mailbox received timeout for upgrade candidate region {region_id} on datanode {:?}", candidate, - ); - error::RetryLaterSnafu { reason }.fail() + ), + } + .fail() + .map_err(BoxedError::new) + .context(error::RetryLaterSnafu) } Err(err) => Err(err), } @@ -336,7 +345,7 @@ mod tests { assert_matches!(err, Error::RetryLater { .. }); assert!(err.is_retryable()); - assert!(err.to_string().contains("test mocked")); + assert!(format!("{err:?}").contains("test mocked")); } #[tokio::test] @@ -398,7 +407,7 @@ mod tests { assert_matches!(err, Error::RetryLater { .. }); assert!(err.is_retryable()); - assert!(err.to_string().contains("still replaying the wal")); + assert!(format!("{err:?}").contains("still replaying the wal")); // Sets the `require_ready` to false. state.require_ready = false; diff --git a/src/meta-srv/src/procedure/utils.rs b/src/meta-srv/src/procedure/utils.rs index 8cf2a34c612d..094ab4d14c0e 100644 --- a/src/meta-srv/src/procedure/utils.rs +++ b/src/meta-srv/src/procedure/utils.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#[cfg(feature = "mock")] +#[cfg(any(test, feature = "mock"))] pub mod mock { use std::io::Error; use std::sync::Arc;