From 30bae9dc3bf48405b53a6ded4d0d1fd1029809b0 Mon Sep 17 00:00:00 2001 From: Zhe Wu Date: Sun, 5 May 2024 21:33:43 -0700 Subject: [PATCH] Quorum driver take server requested retry interval into consideration --- crates/sui-core/src/authority_aggregator.rs | 32 +++++++++++++++---- crates/sui-core/src/quorum_driver/mod.rs | 19 ++++++++--- crates/sui-core/src/quorum_driver/tests.rs | 2 +- crates/sui-core/src/test_authority_clients.rs | 6 ++-- .../unit_tests/authority_aggregator_tests.rs | 16 +++++++--- crates/sui-types/src/error.rs | 7 ++++ 6 files changed, 62 insertions(+), 20 deletions(-) diff --git a/crates/sui-core/src/authority_aggregator.rs b/crates/sui-core/src/authority_aggregator.rs index e27ed8486598b4..e6034da9953bc1 100644 --- a/crates/sui-core/src/authority_aggregator.rs +++ b/crates/sui-core/src/authority_aggregator.rs @@ -295,6 +295,21 @@ pub fn group_errors(errors: Vec<(SuiError, Vec, StakeUnit)>) -> G .collect() } +#[derive(Debug)] +struct RetryableOverloadInfo { + stake: StakeUnit, + requested_retry_after: Duration, +} + +impl Default for RetryableOverloadInfo { + fn default() -> Self { + Self { + stake: 0, + requested_retry_after: Duration::from_secs(1), + } + } +} + #[derive(Debug)] struct ProcessTransactionState { // The list of signatures gathered at any point @@ -309,7 +324,7 @@ struct ProcessTransactionState { // Validators that are overloaded with txns pending execution. overloaded_stake: StakeUnit, // Validators that are overloaded and request client to retry. - retryable_overloaded_stake: StakeUnit, + retryable_overload_info: RetryableOverloadInfo, // If there are conflicting transactions, we note them down and may attempt to retry conflicting_tx_digests: BTreeMap, StakeUnit)>, @@ -1046,7 +1061,7 @@ where object_or_package_not_found_stake: 0, non_retryable_stake: 0, overloaded_stake: 0, - retryable_overloaded_stake: 0, + retryable_overload_info: Default::default(), retryable: true, conflicting_tx_digests: Default::default(), conflicting_tx_total_stake: 0, @@ -1119,7 +1134,8 @@ where // // TODO: currently retryable overload and above overload error look redundant. We want to have a unified // code path to handle both overload scenarios. - state.retryable_overloaded_stake += weight; + state.retryable_overload_info.stake += weight; + state.retryable_overload_info.requested_retry_after = state.retryable_overload_info.requested_retry_after.max(Duration::from_secs(err.retry_after_secs())); } else if !retryable && !state.record_conflicting_transaction_if_any(name, weight, &err) { // We don't count conflicting transactions as non-retryable errors here @@ -1264,13 +1280,17 @@ where // we have heard from *all* validators. Check if any SystemOverloadRetryAfter error caused the txn // to fail. If so, return explicit SystemOverloadRetryAfter error for continuous retry (since objects) // are locked in validators. If not, retry regular RetryableTransaction error. - if state.tx_signatures.total_votes() + state.retryable_overloaded_stake >= quorum_threshold + if state.tx_signatures.total_votes() + state.retryable_overload_info.stake + >= quorum_threshold { // TODO: make use of retry_after_secs, which is currently not used. return AggregatorProcessTransactionError::SystemOverloadRetryAfter { - overload_stake: state.retryable_overloaded_stake, + overload_stake: state.retryable_overload_info.stake, errors: group_errors(state.errors), - retry_after_secs: 0, + retry_after_secs: state + .retryable_overload_info + .requested_retry_after + .as_secs(), }; } diff --git a/crates/sui-core/src/quorum_driver/mod.rs b/crates/sui-core/src/quorum_driver/mod.rs index 449247c6b79ec2..2a0a8430edaf7b 100644 --- a/crates/sui-core/src/quorum_driver/mod.rs +++ b/crates/sui-core/src/quorum_driver/mod.rs @@ -157,7 +157,7 @@ impl QuorumDriver { ); return Ok(()); } - self.backoff_and_enqueue(request, tx_cert, old_retry_times, client_addr) + self.backoff_and_enqueue(request, tx_cert, old_retry_times, client_addr, None) .await } @@ -168,9 +168,11 @@ impl QuorumDriver { tx_cert: Option, old_retry_times: u32, client_addr: Option, + retry_after: Option, ) -> SuiResult<()> { - let next_retry_after = - Instant::now() + Duration::from_millis(200 * u64::pow(2, old_retry_times)); + let next_retry_after = Instant::now() + + Duration::from_millis(200 * u64::pow(2, old_retry_times)) + .max(retry_after.unwrap_or(Duration::from_secs(0))); sleep_until(next_retry_after).await; let tx_cert = match tx_cert { @@ -373,7 +375,11 @@ where retry_after_secs, }) => { self.metrics.total_retryable_overload_errors.inc(); - debug!(?tx_digest, ?errors, "System overload and retry"); + debug!( + ?tx_digest, + ?errors, + "System overload and retry after secs {retry_after_secs}", + ); Err(Some(QuorumDriverError::SystemOverloadRetryAfter { overload_stake, errors, @@ -875,7 +881,9 @@ where client_addr, )); } - Some(QuorumDriverError::SystemOverloadRetryAfter { .. }) => { + Some(QuorumDriverError::SystemOverloadRetryAfter { + retry_after_secs, .. + }) => { // Special case for SystemOverloadRetryAfter error. In this case, due to that objects are already // locked inside validators, we need to perform continuous retry and ignore `max_retry_times`. // TODO: the txn can potentially be retried unlimited times, therefore, we need to bound the number @@ -887,6 +895,7 @@ where tx_cert, old_retry_times, client_addr, + Some(Duration::from_secs(retry_after_secs)), )); } Some(qd_error) => { diff --git a/crates/sui-core/src/quorum_driver/tests.rs b/crates/sui-core/src/quorum_driver/tests.rs index 3dce375ce0dd3e..1fab4a5a82fae9 100644 --- a/crates/sui-core/src/quorum_driver/tests.rs +++ b/crates/sui-core/src/quorum_driver/tests.rs @@ -547,7 +547,7 @@ async fn test_quorum_driver_handling_overload_and_retry() { // Make local authority client to always return SystemOverloadedRetryAfter error. let fault_config = LocalAuthorityClientFaultConfig { - overload_retry_after_handle_transaction: true, + overload_retry_after_handle_transaction: Some(Duration::from_secs(30)), ..Default::default() }; let mut clients = aggregator.clone_inner_clients_test_only(); diff --git a/crates/sui-core/src/test_authority_clients.rs b/crates/sui-core/src/test_authority_clients.rs index bf08091c5a8a7a..5e3a6b50821d63 100644 --- a/crates/sui-core/src/test_authority_clients.rs +++ b/crates/sui-core/src/test_authority_clients.rs @@ -39,7 +39,7 @@ pub struct LocalAuthorityClientFaultConfig { pub fail_after_handle_transaction: bool, pub fail_before_handle_confirmation: bool, pub fail_after_handle_confirmation: bool, - pub overload_retry_after_handle_transaction: bool, + pub overload_retry_after_handle_transaction: Option, } impl LocalAuthorityClientFaultConfig { @@ -76,9 +76,9 @@ impl AuthorityAPI for LocalAuthorityClient { error: "Mock error after handle_transaction".to_owned(), }); } - if self.fault_config.overload_retry_after_handle_transaction { + if let Some(duration) = self.fault_config.overload_retry_after_handle_transaction { return Err(SuiError::ValidatorOverloadedRetryAfter { - retry_after_secs: 0, + retry_after_secs: duration.as_secs(), }); } result diff --git a/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs b/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs index bc2e7291e4e439..24e0f00d831e78 100644 --- a/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs +++ b/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs @@ -1958,14 +1958,17 @@ async fn test_handle_overload_retry_response() { 666, // this is a dummy value which does not matter ); - let overload_error = SuiError::ValidatorOverloadedRetryAfter { - retry_after_secs: 0, - }; let rpc_error = SuiError::RpcError("RPC".into(), "Error".into()); // Have 2f + 1 validators return the overload error and we should get the `SystemOverload` error. set_retryable_tx_info_response_error(&mut clients, &authority_keys); - set_tx_info_response_with_error(&mut clients, authority_keys.iter().skip(1), overload_error); + for (index, (name, _)) in authority_keys.iter().skip(1).enumerate() { + clients.get_mut(name).unwrap().set_tx_info_response_error( + SuiError::ValidatorOverloadedRetryAfter { + retry_after_secs: index as u64, + }, + ); + } let agg = get_genesis_agg(authorities.clone(), clients.clone()); assert_resp_err( @@ -1974,7 +1977,10 @@ async fn test_handle_overload_retry_response() { |e| { matches!( e, - AggregatorProcessTransactionError::SystemOverloadRetryAfter { .. } + AggregatorProcessTransactionError::SystemOverloadRetryAfter { + retry_after_secs, + .. + } if *retry_after_secs == (authority_keys.len() as u64 - 2) ) }, |e| { diff --git a/crates/sui-types/src/error.rs b/crates/sui-types/src/error.rs index 52def41456b8a5..e62a2a902b5129 100644 --- a/crates/sui-types/src/error.rs +++ b/crates/sui-types/src/error.rs @@ -820,6 +820,13 @@ impl SuiError { pub fn is_retryable_overload(&self) -> bool { matches!(self, SuiError::ValidatorOverloadedRetryAfter { .. }) } + + pub fn retry_after_secs(&self) -> u64 { + match self { + SuiError::ValidatorOverloadedRetryAfter { retry_after_secs } => *retry_after_secs, + _ => 0, + } + } } impl Ord for SuiError {