Skip to content

Commit

Permalink
Merge branch 'main' into refine-runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
waynexia authored Jul 25, 2024
2 parents d7c3d76 + b81d3a2 commit 9eb42df
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 11 deletions.
14 changes: 7 additions & 7 deletions config/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,15 +266,15 @@
| `procedure.retry_delay` | String | `500ms` | Initial retry delay of procedures, increases exponentially |
| `procedure.max_metadata_value_size` | String | `1500KiB` | Auto split large value<br/>GreptimeDB procedure uses etcd as the default metadata storage backend.<br/>The etcd the maximum size of any request is 1.5 MiB<br/>1500KiB = 1536KiB (1.5MiB) - 36KiB (reserved size of key)<br/>Comments out the `max_metadata_value_size`, for don't split large value (no limit). |
| `failure_detector` | -- | -- | -- |
| `failure_detector.threshold` | Float | `8.0` | -- |
| `failure_detector.min_std_deviation` | String | `100ms` | -- |
| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | -- |
| `failure_detector.first_heartbeat_estimate` | String | `1000ms` | -- |
| `failure_detector.threshold` | Float | `8.0` | The threshold value used by the failure detector to determine failure conditions. |
| `failure_detector.min_std_deviation` | String | `100ms` | The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations. |
| `failure_detector.acceptable_heartbeat_pause` | String | `10000ms` | The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable. |
| `failure_detector.first_heartbeat_estimate` | String | `1000ms` | The initial estimate of the heartbeat interval used by the failure detector. |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.timeout` | String | `10s` | -- |
| `datanode.client.connect_timeout` | String | `10s` | -- |
| `datanode.client.tcp_nodelay` | Bool | `true` | -- |
| `datanode.client.timeout` | String | `10s` | Operation timeout. |
| `datanode.client.connect_timeout` | String | `10s` | Connect server timeout. |
| `datanode.client.tcp_nodelay` | Bool | `true` | `TCP_NODELAY` option for accepted connections. |
| `wal` | -- | -- | -- |
| `wal.provider` | String | `raft_engine` | -- |
| `wal.broker_endpoints` | Array | -- | The broker endpoints of the Kafka cluster. |
Expand Down
15 changes: 15 additions & 0 deletions config/metasrv.example.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,32 @@ max_metadata_value_size = "1500KiB"

# Failure detectors options.
[failure_detector]

## The threshold value used by the failure detector to determine failure conditions.
threshold = 8.0

## The minimum standard deviation of the heartbeat intervals, used to calculate acceptable variations.
min_std_deviation = "100ms"

## The acceptable pause duration between heartbeats, used to determine if a heartbeat interval is acceptable.
acceptable_heartbeat_pause = "10000ms"

## The initial estimate of the heartbeat interval used by the failure detector.
first_heartbeat_estimate = "1000ms"

## Datanode options.
[datanode]

## Datanode client options.
[datanode.client]

## Operation timeout.
timeout = "10s"

## Connect server timeout.
connect_timeout = "10s"

## `TCP_NODELAY` option for accepted connections.
tcp_nodelay = true

[wal]
Expand Down
21 changes: 17 additions & 4 deletions src/datanode/src/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ use std::time::Duration;
use std::{env, path};

use common_base::readable_size::ReadableSize;
use common_telemetry::info;
use object_store::layers::{LruCacheLayer, RetryLayer};
use common_telemetry::{info, warn};
use object_store::layers::{LruCacheLayer, RetryInterceptor, RetryLayer};
use object_store::services::Fs;
use object_store::util::{join_dir, normalize_dir, with_instrument_layers};
use object_store::{HttpClient, ObjectStore, ObjectStoreBuilder};
use object_store::{Error, HttpClient, ObjectStore, ObjectStoreBuilder};
use snafu::prelude::*;

use crate::config::{ObjectStoreConfig, DEFAULT_OBJECT_STORE_CACHE_SIZE};
Expand All @@ -55,7 +55,11 @@ pub(crate) async fn new_object_store(
// Enable retry layer and cache layer for non-fs object storages
let object_store = if !matches!(store, ObjectStoreConfig::File(..)) {
let object_store = create_object_store_with_cache(object_store, &store).await?;
object_store.layer(RetryLayer::new().with_jitter())
object_store.layer(
RetryLayer::new()
.with_jitter()
.with_notify(PrintDetailedError),
)
} else {
object_store
};
Expand Down Expand Up @@ -171,3 +175,12 @@ pub(crate) fn build_http_client() -> Result<HttpClient> {

HttpClient::build(http_builder).context(error::InitBackendSnafu)
}

struct PrintDetailedError;

// PrintDetailedError is a retry interceptor that prints error in Debug format in retrying.
impl RetryInterceptor for PrintDetailedError {
fn intercept(&self, err: &Error, dur: Duration) {
warn!("Retry after {}s, error: {:#?}", dur.as_secs_f64(), err);
}
}

0 comments on commit 9eb42df

Please sign in to comment.