Skip to content

Commit

Permalink
fix: print detailed error
Browse files Browse the repository at this point in the history
  • Loading branch information
WenyXu committed Jan 11, 2024
1 parent 8ec1e42 commit 8e99354
Show file tree
Hide file tree
Showing 13 changed files with 134 additions and 88 deletions.
14 changes: 13 additions & 1 deletion src/meta-srv/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,8 +537,18 @@ pub enum Error {
location: Location,
},

#[snafu(display("Expected to retry later"))]
RetryLater {
location: Location,
source: BoxedError,
},

#[snafu(display("Expected to retry later, reason: {}", reason))]
RetryLater { reason: String, location: Location },
RetryLaterWithReason {
reason: String,
location: Location,
source: BoxedError,
},

#[snafu(display("Failed to update table metadata, err_msg: {}", err_msg))]
UpdateTableMetadata { err_msg: String, location: Location },
Expand Down Expand Up @@ -628,6 +638,7 @@ impl Error {
/// Returns `true` if the error is retryable.
pub fn is_retryable(&self) -> bool {
matches!(self, Error::RetryLater { .. })
|| matches!(self, Error::RetryLaterWithReason { .. })
}
}

Expand Down Expand Up @@ -665,6 +676,7 @@ impl ErrorExt for Error {
| Error::MailboxTimeout { .. }
| Error::MailboxReceiver { .. }
| Error::RetryLater { .. }
| Error::RetryLaterWithReason { .. }
| Error::StartGrpc { .. }
| Error::UpdateTableMetadata { .. }
| Error::NoEnoughAvailableDatanode { .. }
Expand Down
17 changes: 13 additions & 4 deletions src/meta-srv/src/procedure/region_failover/activate_region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use std::time::Duration;

use api::v1::meta::MailboxMessage;
use async_trait::async_trait;
use common_error::ext::BoxedError;
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
use common_meta::key::datanode_table::{DatanodeTableKey, RegionInfo};
use common_meta::peer::Peer;
Expand All @@ -30,6 +31,7 @@ use super::update_metadata::UpdateRegionMetadata;
use super::{RegionFailoverContext, State};
use crate::error::{
self, Error, Result, RetryLaterSnafu, SerializeToJsonSnafu, UnexpectedInstructionReplySnafu,
UnexpectedSnafu,
};
use crate::handler::HeartbeatMailbox;
use crate::procedure::region_failover::OPEN_REGION_MESSAGE_TIMEOUT;
Expand Down Expand Up @@ -162,19 +164,26 @@ impl ActivateRegion {
// would be in vain. Then why not just end the failover procedure? Because we
// currently lack the methods or any maintenance tools to manage the whole
// procedures things, it would be easier to let the procedure keep running.
let reason = format!(
let violated = format!(
"Region {failed_region:?} is not opened by Datanode {:?}, error: {error:?}",
self.candidate,
);
RetryLaterSnafu { reason }.fail()

UnexpectedSnafu { violated }
.fail()
.map_err(BoxedError::new)
.context(RetryLaterSnafu)
}
}
Err(Error::MailboxTimeout { .. }) => {
let reason = format!(
let violated = format!(
"Mailbox received timeout for activate failed region {failed_region:?} on Datanode {:?}",
self.candidate,
);
RetryLaterSnafu { reason }.fail()
UnexpectedSnafu { violated }
.fail()
.map_err(BoxedError::new)
.context(RetryLaterSnafu)
}
Err(e) => Err(e),
}
Expand Down
12 changes: 9 additions & 3 deletions src/meta-srv/src/procedure/region_failover/deactivate_region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::time::Duration;

use api::v1::meta::MailboxMessage;
use async_trait::async_trait;
use common_error::ext::BoxedError;
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_meta::peer::Peer;
use common_meta::rpc::router::RegionStatus;
Expand All @@ -28,6 +29,7 @@ use super::activate_region::ActivateRegion;
use super::{RegionFailoverContext, State};
use crate::error::{
self, Error, Result, RetryLaterSnafu, SerializeToJsonSnafu, UnexpectedInstructionReplySnafu,
UnexpectedSnafu,
};
use crate::handler::HeartbeatMailbox;
use crate::service::mailbox::{Channel, MailboxReceiver};
Expand Down Expand Up @@ -117,11 +119,15 @@ impl DeactivateRegion {
} else {
// Under rare circumstances would a Datanode fail to close a Region.
// So simply retry.
let reason = format!(
UnexpectedSnafu {
violated: format!(
"Region {failed_region:?} is not closed by Datanode {}, error: {error:?}",
failed_region.datanode_id,
);
RetryLaterSnafu { reason }.fail()
),
}
.fail()
.map_err(BoxedError::new)
.context(RetryLaterSnafu)
}
}
Err(Error::MailboxTimeout { .. }) => {
Expand Down
12 changes: 6 additions & 6 deletions src/meta-srv/src/procedure/region_failover/failover_start.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@
// limitations under the License.

use async_trait::async_trait;
use common_error::ext::ErrorExt;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_meta::peer::Peer;
use common_meta::RegionIdent;
use common_telemetry::info;
use serde::{Deserialize, Serialize};
use snafu::ensure;
use snafu::{ensure, location, Location};

use super::deactivate_region::DeactivateRegion;
use super::{RegionFailoverContext, State};
use crate::error::{RegionFailoverCandidatesNotFoundSnafu, Result, RetryLaterSnafu};
use crate::error::{self, RegionFailoverCandidatesNotFoundSnafu, Result};
use crate::selector::SelectorOptions;

#[derive(Serialize, Deserialize, Debug)]
Expand Down Expand Up @@ -93,10 +93,10 @@ impl State for RegionFailoverStart {
.await
.map_err(|e| {
if e.status_code() == StatusCode::RuntimeResourcesExhausted {
RetryLaterSnafu {
reason: format!("{e}"),
error::Error::RetryLater {
source: BoxedError::new(e),
location: location!(),
}
.build()
} else {
e
}
Expand Down
22 changes: 13 additions & 9 deletions src/meta-srv/src/procedure/region_failover/update_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use std::collections::HashMap;

use async_trait::async_trait;
use common_error::ext::BoxedError;
use common_meta::key::datanode_table::RegionInfo;
use common_meta::key::table_route::TableRouteKey;
use common_meta::peer::Peer;
Expand All @@ -27,7 +28,7 @@ use store_api::storage::RegionNumber;

use super::invalidate_cache::InvalidateCache;
use super::{RegionFailoverContext, State};
use crate::error::{self, Result, RetryLaterSnafu, TableRouteNotFoundSnafu};
use crate::error::{self, Result, TableRouteNotFoundSnafu, UnexpectedSnafu};
use crate::lock::keys::table_metadata_lock_key;
use crate::lock::Opts;

Expand Down Expand Up @@ -173,14 +174,17 @@ impl State for UpdateRegionMetadata {
self.update_metadata(ctx, failed_region)
.await
.map_err(|e| {
RetryLaterSnafu {
reason: format!(
"Failed to update metadata for failed region: {}, error: {}",
failed_region, e
),
}
.build()
})?;
BoxedError::new(
UnexpectedSnafu {
violated: format!(
"Failed to update metadata for failed region: {}, error: {}",
failed_region, e
),
}
.build(),
)
})
.context(error::RetryLaterSnafu)?;
Ok(Box::new(InvalidateCache))
}
}
Expand Down
21 changes: 8 additions & 13 deletions src/meta-srv/src/procedure/region_migration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ use std::fmt::Debug;
use std::time::Duration;

use api::v1::meta::MailboxMessage;
use common_error::ext::BoxedError;
use common_meta::instruction::Instruction;
use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue};
use common_meta::key::table_info::TableInfoValue;
Expand All @@ -45,7 +46,7 @@ use common_procedure::error::{
use common_procedure::{Context as ProcedureContext, LockKey, Procedure, Status, StringKey};
pub use manager::RegionMigrationProcedureTask;
use serde::{Deserialize, Serialize};
use snafu::{location, Location, OptionExt, ResultExt};
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
use tokio::time::Instant;

Expand Down Expand Up @@ -220,10 +221,8 @@ impl Context {
.get(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(|e| error::Error::RetryLater {
reason: e.to_string(),
location: location!(),
})?
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu)?
.context(error::TableRouteNotFoundSnafu { table_id })?;

*table_route_value = Some(table_route);
Expand Down Expand Up @@ -256,10 +255,8 @@ impl Context {
.get(table_id)
.await
.context(error::TableMetadataManagerSnafu)
.map_err(|e| error::Error::RetryLater {
reason: e.to_string(),
location: location!(),
})?
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu)?
.context(error::TableInfoNotFoundSnafu { table_id })?;

*table_info_value = Some(table_info);
Expand Down Expand Up @@ -289,10 +286,8 @@ impl Context {
})
.await
.context(error::TableMetadataManagerSnafu)
.map_err(|e| error::Error::RetryLater {
reason: e.to_string(),
location: location!(),
})?
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu)?
.context(error::DatanodeTableNotFoundSnafu {
table_id,
datanode_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::any::Any;
use std::time::Duration;

use api::v1::meta::MailboxMessage;
use common_error::ext::BoxedError;
use common_meta::distributed_time_constants::{MAILBOX_RTT_SECS, REGION_LEASE_SECS};
use common_meta::instruction::{
DowngradeRegion, DowngradeRegionReply, Instruction, InstructionReply,
Expand Down Expand Up @@ -146,13 +147,15 @@ impl DowngradeLeaderRegion {
};

if error.is_some() {
return error::RetryLaterSnafu {
reason: format!(
return error::UnexpectedSnafu {
violated: format!(
"Failed to downgrade the region {} on Datanode {:?}, error: {:?}",
region_id, leader, error
),
}
.fail();
.fail()
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu);
}

if !exists {
Expand All @@ -169,11 +172,15 @@ impl DowngradeLeaderRegion {
Ok(())
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
"Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}",
leader,
);
error::RetryLaterSnafu { reason }.fail()
error::UnexpectedSnafu {
violated: format!(
"Mailbox received timeout for downgrade leader region {region_id} on datanode {:?}",
leader,
),
}
.fail()
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu)?
}
Err(err) => Err(err),
}
Expand Down Expand Up @@ -374,7 +381,7 @@ mod tests {

assert_matches!(err, Error::RetryLater { .. });
assert!(err.is_retryable());
assert!(err.to_string().contains("test mocked"));
assert!(format!("{err:?}").contains("test mocked"), "err: {err:?}",);
}

#[tokio::test]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::any::Any;
use std::time::Duration;

use api::v1::meta::MailboxMessage;
use common_error::ext::BoxedError;
use common_meta::distributed_time_constants::MAILBOX_RTT_SECS;
use common_meta::instruction::{Instruction, InstructionReply, OpenRegion, SimpleReply};
use common_meta::key::datanode_table::RegionInfo;
Expand Down Expand Up @@ -155,21 +156,24 @@ impl OpenCandidateRegion {
if result {
Ok(())
} else {
error::RetryLaterSnafu {
reason: format!(
"Region {region_id} is not opened by datanode {:?}, error: {error:?}",
candidate,
),
}
.fail()

error::UnexpectedSnafu { violated:format!(
"Region {region_id} is not opened by datanode {:?}, error: {error:?}",
candidate,
) }
.fail()
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu)?
}
}
Err(error::Error::MailboxTimeout { .. }) => {
let reason = format!(
error::UnexpectedSnafu { violated:format!(
"Mailbox received timeout for open candidate region {region_id} on datanode {:?}",
candidate,
);
error::RetryLaterSnafu { reason }.fail()
) }
.fail()
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu)
}
Err(e) => Err(e),
}
Expand Down Expand Up @@ -383,7 +387,7 @@ mod tests {

assert_matches!(err, Error::RetryLater { .. });
assert!(err.is_retryable());
assert!(err.to_string().contains("test mocked"));
assert!(format!("{err:?}").contains("test mocked"));
}

#[tokio::test]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use common_error::ext::BoxedError;
use common_meta::rpc::router::RegionStatus;
use snafu::ResultExt;

Expand Down Expand Up @@ -61,9 +62,12 @@ impl UpdateMetadata {
.context(error::TableMetadataManagerSnafu)
{
ctx.remove_table_route_value();
return error::RetryLaterSnafu {
reason: format!("Failed to update the table route during the downgrading leader region, error: {err}")
}.fail();
return error::UnexpectedSnafu {
violated: format!("Failed to update the table route during the downgrading leader region, error: {err}"),
}
.fail()
.map_err(BoxedError::new)
.context(error::RetryLaterSnafu);
}

ctx.remove_table_route_value();
Expand Down Expand Up @@ -163,13 +167,9 @@ mod tests {
ctx.volatile_ctx.table_route = Some(original_table_route);

let err = state.downgrade_leader_region(&mut ctx).await.unwrap_err();

assert!(ctx.volatile_ctx.table_route.is_none());

assert_matches!(err, Error::RetryLater { .. });

assert!(err.is_retryable());
assert!(err.to_string().contains("Failed to update the table route"));
assert!(format!("{err:?}").contains("Failed to update the table route"));
}

#[tokio::test]
Expand Down
Loading

0 comments on commit 8e99354

Please sign in to comment.