Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] use tower::retry::Retry for retries #12614

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/rust/engine/fs/store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ use double_checked_cell_async::DoubleCheckedCell;
use fs::{default_cache_path, FileContent, RelativePath};
use futures::future::{self, BoxFuture, Either, FutureExt, TryFutureExt};
use grpc_util::prost::MessageExt;
use grpc_util::retry::{retry_call, status_is_retryable};
use grpc_util::retry::{retry_call, status_code_is_retryable};
use grpc_util::status_to_str;
use hashing::Digest;
use parking_lot::Mutex;
Expand Down Expand Up @@ -557,7 +557,7 @@ impl Store {
remote,
|remote| async move { remote.load_bytes_with(digest, Ok).await },
|err| match err {
ByteStoreError::Grpc(status) => status_is_retryable(status),
ByteStoreError::Grpc(status) => status_code_is_retryable(status.code()),
_ => false,
},
)
Expand Down Expand Up @@ -846,7 +846,7 @@ impl Store {
.await
},
|err| match err {
ByteStoreError::Grpc(status) => status_is_retryable(status),
ByteStoreError::Grpc(status) => status_code_is_retryable(status.code()),
_ => false,
},
)
Expand Down
12 changes: 6 additions & 6 deletions src/rust/engine/fs/store/src/remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use bytes::{Bytes, BytesMut};
use double_checked_cell_async::DoubleCheckedCell;
use futures::Future;
use futures::StreamExt;
use grpc_util::retry::{retry_call, status_is_retryable};
use grpc_util::retry::{retry_call, status_code_is_retryable};
use grpc_util::{headers_to_http_header_map, layered_service, status_to_str, LayeredService};
use hashing::Digest;
use log::Level;
Expand Down Expand Up @@ -174,7 +174,7 @@ impl ByteStore {
mmap,
|mmap| self.store_bytes_source(digest, move |range| Bytes::copy_from_slice(&mmap[range])),
|err| match err {
ByteStoreError::Grpc(status) => status_is_retryable(status),
ByteStoreError::Grpc(status) => status_code_is_retryable(status.code()),
_ => false,
},
)
Expand All @@ -191,7 +191,7 @@ impl ByteStore {
bytes,
|bytes| self.store_bytes_source(digest, move |range| bytes.slice(range)),
|err| match err {
ByteStoreError::Grpc(status) => status_is_retryable(status),
ByteStoreError::Grpc(status) => status_code_is_retryable(status.code()),
_ => false,
},
)
Expand Down Expand Up @@ -480,12 +480,12 @@ impl ByteStore {
let store2 = store.clone();
let client = store2.cas_client.as_ref().clone();
let response = retry_call(
client,
move |mut client| {
client,
move |mut client| {
let request = request.clone();
async move { client.find_missing_blobs(request).await }
},
status_is_retryable,
|s| status_code_is_retryable(s.code()),
)
.await
.map_err(status_to_str)?;
Expand Down
2 changes: 1 addition & 1 deletion src/rust/engine/grpc_util/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ tokio = { version = "1.4", features = ["net", "process", "rt-multi-thread", "syn
tokio-rustls = "0.22"
tokio-util = { version = "0.6", features = ["codec"] }
tonic = { version = "0.5", features = ["transport", "codegen", "tls", "tls-roots", "prost"] }
tower = { version = "0.4", features = ["limit"] }
tower = { version = "0.4", features = ["limit", "retry"] }
tower-layer = "0.3"
tower-service = "0.3"
webpki = "0.21"
Expand Down
11 changes: 8 additions & 3 deletions src/rust/engine/grpc_util/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ use itertools::Itertools;
use tokio_rustls::rustls::ClientConfig;
use tonic::transport::{Channel, ClientTlsConfig, Endpoint};
use tower::limit::ConcurrencyLimit;
use tower::retry::{Retry, RetryLayer};
use tower::ServiceBuilder;

pub mod headers;
Expand All @@ -49,17 +50,21 @@ pub mod prost;
pub mod retry;
pub mod tls;

use crate::retry::ExponentialBackoffPolicy;

// NB: Rather than boxing our tower/tonic services, we define a type alias that fully defines the
// Service layers that we use universally. If this type becomes unwieldy, or our various Services
// diverge in which layers they use, we should instead use a Box<dyn Service<..>>.
pub type LayeredService = SetRequestHeaders<ConcurrencyLimit<Channel>>;
pub type LayeredService<'a, Req, Res> =
Retry<ExponentialBackoffPolicy<'a, Req, Res>, SetRequestHeaders<ConcurrencyLimit<Channel>>>;

pub fn layered_service(
pub fn layered_service<'a, Req, Res>(
channel: Channel,
concurrency_limit: usize,
http_headers: HeaderMap,
) -> LayeredService {
) -> LayeredService<'a, Req, Res> {
ServiceBuilder::new()
.layer(RetryLayer::new(ExponentialBackoffPolicy::new()))
.layer(SetRequestHeadersLayer::new(http_headers))
.concurrency_limit(concurrency_limit)
.service(channel)
Expand Down
80 changes: 72 additions & 8 deletions src/rust/engine/grpc_util/src/retry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,20 @@

use std::time::Duration;

use futures::Future;
use futures::future::BoxFuture;
use futures::{Future, FutureExt};
use rand::{thread_rng, Rng};
use std::marker::PhantomData;
use tonic::{Code, Status};
use tower::retry::Policy;

pub fn status_is_retryable(status: &Status) -> bool {
const INTERVAL_DURATION: Duration = Duration::from_millis(20);
const MAX_RETRIES: usize = 3;
const MAX_BACKOFF_DURATION: Duration = Duration::from_secs(5);

pub fn status_code_is_retryable(code: &Code) -> bool {
matches!(
status.code(),
code,
Code::Aborted
| Code::Cancelled
| Code::Internal
Expand All @@ -19,6 +26,67 @@ pub fn status_is_retryable(status: &Status) -> bool {
)
}

/// A retry policy that tracks how many retry attempts are still permitted.
///
/// `Req` and `Res` are the request/response types the policy is applied to. They are only
/// recorded via `PhantomData`: no value of either type is stored in the policy.
pub struct ExponentialBackoffPolicy<'a, Req, Res> {
  /// Number of retries that may still be attempted before giving up.
  retries_remaining: usize,
  /// Ties the policy to `'a`, `Req` and `Res` without owning any data of those types.
  _marker: PhantomData<&'a (Req, Res)>,
}

// NB: `Clone` is implemented by hand rather than derived because `#[derive(Clone)]` would add
// `Req: Clone` and `Res: Clone` bounds, even though no `Req`/`Res` value is ever stored here.
// `tower::retry::Retry` only implements `Service` when its policy is `Clone`.
impl<'a, Req, Res> Clone for ExponentialBackoffPolicy<'a, Req, Res> {
  fn clone(&self) -> Self {
    ExponentialBackoffPolicy {
      retries_remaining: self.retries_remaining,
      _marker: PhantomData,
    }
  }
}

impl<'a, Req, Res> ExponentialBackoffPolicy<'a, Req, Res> {
  /// Creates a policy whose retry budget starts at `MAX_RETRIES`.
  pub fn new() -> Self {
    ExponentialBackoffPolicy {
      retries_remaining: MAX_RETRIES,
      _marker: PhantomData,
    }
  }
}

/// Equivalent to [`ExponentialBackoffPolicy::new`]; provided because a public zero-argument
/// constructor is conventionally paired with a `Default` implementation.
impl<'a, Req, Res> Default for ExponentialBackoffPolicy<'a, Req, Res> {
  fn default() -> Self {
    Self::new()
  }
}

/// `tower` retry policy for gRPC calls: a call is retried only when it failed with a status code
/// accepted by `status_code_is_retryable` and the retry budget is not yet exhausted. Each retry
/// is preceded by a randomized delay whose upper bound doubles with every attempt already made,
/// capped at `MAX_BACKOFF_DURATION`.
impl<'a, Req, Res> Policy<Req, Res, Status> for ExponentialBackoffPolicy<'a, Req, Res>
where
  Req: Clone + Send + Sync + 'static,
  Res: Send + Sync + 'static,
{
  type Future = BoxFuture<'a, Self>;

  /// Decides whether the call that produced `result` should be retried.
  ///
  /// Returns `None` (do not retry) when the call succeeded, when the error status is not
  /// retryable, or when no retries remain. Otherwise returns a future that sleeps for the
  /// back-off delay and then resolves to the policy to use for the next attempt, which has one
  /// fewer retry remaining.
  fn retry(&self, _req: &Req, result: Result<&Res, &Status>) -> Option<Self::Future> {
    // Request was successful, so do not retry.
    let status = result.err()?;

    let code = status.code();
    if !status_code_is_retryable(&code) {
      // This error is not retryable so do not bother retrying.
      return None;
    }

    // Copy the counter out of `self`: the returned future must not borrow `self`, because the
    // `&self` borrow is not guaranteed to live as long as `'a`.
    let retries_remaining = self.retries_remaining;
    if retries_remaining == 0 {
      // No more retries left.
      return None;
    }

    // Pick the delay eagerly so that the (non-`Send`) thread-local RNG handle never becomes part
    // of the boxed future. `MAX_RETRIES - retries_remaining` is the number of retries already
    // made; it is bounded by `MAX_RETRIES`, so the cast to `u32` cannot truncate.
    let retries_made = (MAX_RETRIES - retries_remaining) as u32;
    let multiplier = thread_rng().gen_range(0..2_u32.pow(retries_made) + 1);
    let sleep_time = (INTERVAL_DURATION * multiplier).min(MAX_BACKOFF_DURATION);

    Some(
      async move {
        tokio::time::sleep(sleep_time).await;

        ExponentialBackoffPolicy {
          retries_remaining: retries_remaining - 1,
          _marker: PhantomData,
        }
      }
      .boxed(),
    )
  }

  /// Clones the request so that it can be re-sent if the call needs to be retried.
  fn clone_request(&self, req: &Req) -> Option<Req> {
    Some(req.clone())
  }
}

/// Retry a gRPC client operation using exponential back-off to delay between attempts.
#[inline]
pub async fn retry_call<T, E, C, F, G, Fut>(client: C, f: F, is_retryable: G) -> Result<T, E>
Expand All @@ -28,10 +96,6 @@ where
G: Fn(&E) -> bool,
Fut: Future<Output = Result<T, E>>,
{
const INTERVAL_DURATION: Duration = Duration::from_millis(20);
const MAX_RETRIES: u32 = 3;
const MAX_BACKOFF_DURATION: Duration = Duration::from_secs(5);

let mut num_retries = 0;
let last_error = loop {
// Delay before the next send attempt if this is a retry.
Expand All @@ -57,7 +121,7 @@ where

num_retries += 1;

if num_retries >= MAX_RETRIES {
if num_retries >= MAX_RETRIES as u32 {
break last_error;
}
};
Expand Down
4 changes: 2 additions & 2 deletions src/rust/engine/process_execution/src/remote.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use futures::FutureExt;
use futures::{Stream, StreamExt};
use grpc_util::headers_to_http_header_map;
use grpc_util::prost::MessageExt;
use grpc_util::retry::{retry_call, status_is_retryable};
use grpc_util::retry::{retry_call, status_code_is_retryable};
use grpc_util::{layered_service, status_to_str, LayeredService};
use hashing::{Digest, Fingerprint};
use log::{debug, trace, warn, Level};
Expand Down Expand Up @@ -1375,7 +1375,7 @@ pub async fn check_action_cache(
let request = apply_headers(Request::new(request), &context.build_id);
async move { client.get_action_result(request).await }
},
status_is_retryable,
|s| status_code_is_retryable(s.code()),
)
.await;

Expand Down
4 changes: 2 additions & 2 deletions src/rust/engine/process_execution/src/remote_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use bazel_protos::require_digest;
use fs::RelativePath;
use futures::future::BoxFuture;
use futures::FutureExt;
use grpc_util::retry::status_is_retryable;
use grpc_util::retry::status_code_is_retryable;
use grpc_util::{
headers_to_http_header_map, layered_service, retry::retry_call, status_to_str, LayeredService,
};
Expand Down Expand Up @@ -489,7 +489,7 @@ impl CommandRunner {
.await
}
},
status_is_retryable,
|s| status_code_is_retryable(s.code()),
)
.await
.map_err(status_to_str)?;
Expand Down