fix: resume connectors on network errors (#1956)
* fix: retry on network errors in MySQL connector

  Resume database queries on network errors. Select queries with multiple rows resume from the last row received. CDC continues from its last position.

* fix: retry on network errors in Postgres connector

  Similarly to the MySQL connector, select queries resume from the last row received. The CDC resumes from the position where it was stopped.

* fix: retry on network errors in Kafka connector

  Detect network failures, reconnect, and resume.

* fix: retry on network errors in Object Store connector

  This is not a complete solution; we should instead use the retry infrastructure provided by the object_store crate.

* chore: add sleep between retries

---------

Co-authored-by: chubei <[email protected]>
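Below is a minimal, self-contained sketch of the resume pattern the first two bullets describe, assuming a keyset-paginated select (resume via `WHERE id > last_id`). `fetch_page`, `NetworkError`, and the row-id scheme are illustrative stand-ins, not the actual connector API:

use std::time::Duration;

#[derive(Debug)]
struct NetworkError;

// Stand-in for "SELECT ... WHERE id > last_id ORDER BY id LIMIT page".
// A real connector would run this against MySQL or Postgres; this stub
// pretends the table holds 250 rows and never fails.
fn fetch_page(last_id: u64, page: u64) -> Result<Vec<u64>, NetworkError> {
    const TOTAL_ROWS: u64 = 250;
    Ok((last_id + 1..=TOTAL_ROWS.min(last_id + page)).collect())
}

fn read_all() -> Vec<u64> {
    const RETRY_INTERVAL: Duration = Duration::from_secs(5);
    let mut rows = Vec::new();
    let mut last_id = 0; // resume point: survives any number of retries
    loop {
        match fetch_page(last_id, 100) {
            Ok(page) if page.is_empty() => return rows, // query exhausted
            Ok(page) => {
                last_id = *page.last().unwrap(); // advance the resume point
                rows.extend(page);
            }
            Err(err) => {
                // Transient network error: keep `last_id` and retry after a
                // pause, mirroring the sleep-between-retries in the commit.
                eprintln!("network error {err:?}, retrying in {RETRY_INTERVAL:?}...");
                std::thread::sleep(RETRY_INTERVAL);
            }
        }
    }
}

The design point is that the resume position advances only after a page arrives intact, so a retry repeats at most the in-flight page rather than restarting the whole query.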
Showing 34 changed files with 970 additions and 202 deletions.
dozer-ingestion/src/connectors/kafka/stream_consumer_helper.rs (114 additions & 0 deletions)
use crate::errors::ConnectorError;
use crate::errors::KafkaError::KafkaConnectionError;
use rdkafka::{
    consumer::{BaseConsumer, Consumer},
    message::BorrowedMessage,
    util::Timeout,
    ClientConfig, Message, Offset,
};
use std::collections::HashMap;

pub struct StreamConsumerHelper;

pub type OffsetsMap = HashMap<String, (i32, i64)>; // key: topic, value: (partition, offset)

impl StreamConsumerHelper {
    /// Creates a consumer subscribed to `topics`, retrying on network failures.
    pub async fn start(
        client_config: &ClientConfig,
        topics: &[&str],
    ) -> Result<BaseConsumer, ConnectorError> {
        Self::resume_impl(client_config, topics, None).await
    }

    /// Re-creates the consumer and seeks each topic partition back to the
    /// offsets recorded in `offsets`, so consumption resumes where it stopped.
    pub async fn resume(
        client_config: &ClientConfig,
        topics: &[&str],
        offsets: &OffsetsMap,
    ) -> Result<BaseConsumer, ConnectorError> {
        Self::resume_impl(client_config, topics, Some(offsets)).await
    }

    /// Records the partition and offset of `message` so a later `resume` can
    /// continue from this position.
    pub fn update_offsets(offsets: &mut OffsetsMap, message: &BorrowedMessage<'_>) {
        let _ = offsets.insert(
            message.topic().into(),
            (message.partition(), message.offset()),
        );
    }

    async fn resume_impl(
        client_config: &ClientConfig,
        topics: &[&str],
        offsets: Option<&OffsetsMap>,
    ) -> Result<BaseConsumer, ConnectorError> {
        loop {
            match Self::try_resume(client_config, topics, offsets).await {
                Ok(con) => return Ok(con),
                Err(err) if is_network_failure(&err) => {
                    const RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_secs(5);
                    dozer_types::log::error!(
                        "stream resume error {err}. retrying in {RETRY_INTERVAL:?}..."
                    );
                    tokio::time::sleep(RETRY_INTERVAL).await;
                    continue;
                }
                Err(err) => Err(KafkaConnectionError(err))?,
            }
        }
    }

    async fn try_resume(
        client_config: &ClientConfig,
        topics: &[&str],
        offsets: Option<&OffsetsMap>,
    ) -> Result<BaseConsumer, rdkafka::error::KafkaError> {
        let con: BaseConsumer = client_config.create()?;
        con.subscribe(topics.iter().as_slice())?;

        if let Some(offsets) = offsets {
            for (topic, &(partition, offset)) in offsets.iter() {
                con.seek(topic, partition, Offset::Offset(offset), Timeout::Never)?;
            }
        }

        Ok(con)
    }
}

/// Classifies `err` as a transient network failure that is worth retrying.
pub fn is_network_failure(err: &rdkafka::error::KafkaError) -> bool {
    use rdkafka::error::KafkaError::*;
    let error_code = match err {
        ConsumerCommit(error_code) => error_code,
        Flush(error_code) => error_code,
        Global(error_code) => error_code,
        GroupListFetch(error_code) => error_code,
        MessageConsumption(error_code) => error_code,
        MessageProduction(error_code) => error_code,
        MetadataFetch(error_code) => error_code,
        OffsetFetch(error_code) => error_code,
        Rebalance(error_code) => error_code,
        SetPartitionOffset(error_code) => error_code,
        StoreOffset(error_code) => error_code,
        MockCluster(error_code) => error_code,
        Transaction(rdkafka_err) => return rdkafka_err.is_retriable(),
        other => {
            dozer_types::log::warn!(
                "unrecognized kafka error: {other}. treating as non-network error."
            );
            return false;
        }
    };
    use rdkafka::types::RDKafkaErrorCode::*;
    matches!(
        error_code,
        Fail | BrokerTransportFailure
            | Resolve
            | MessageTimedOut
            | AllBrokersDown
            | OperationTimedOut
            | TimedOutQueue
            | Retry
            | PollExceeded
            | RequestTimedOut
            | NetworkException
    )
}
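For context, here is a hedged sketch of how this helper is meant to be driven (not code from this commit): start once, record offsets per message, and on a network failure rebuild the consumer with `resume` so consumption continues where it left off. The `consume` function is hypothetical; it assumes the module above is in scope and that `ConnectorError` converts from the crate's `KafkaError`, as the `?` usage above implies.

use std::time::Duration;

async fn consume(client_config: &ClientConfig, topics: &[&str]) -> Result<(), ConnectorError> {
    let mut offsets = OffsetsMap::new();
    let mut consumer = StreamConsumerHelper::start(client_config, topics).await?;
    loop {
        match consumer.poll(Duration::from_millis(100)) {
            Some(Ok(message)) => {
                // Remember the position before handing the message downstream.
                StreamConsumerHelper::update_offsets(&mut offsets, &message);
                // ... process `message` ...
            }
            Some(Err(err)) if is_network_failure(&err) => {
                // Transient failure: rebuild the consumer and seek back to
                // the last recorded offsets.
                consumer = StreamConsumerHelper::resume(client_config, topics, &offsets).await?;
            }
            Some(Err(err)) => return Err(KafkaConnectionError(err).into()),
            None => {} // no message within the poll timeout; poll again
        }
    }
}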