From 19df79309064c2f79855d0f5a722cf27daf1f4b6 Mon Sep 17 00:00:00 2001 From: Zhe Wu Date: Sun, 5 May 2024 21:33:43 -0700 Subject: [PATCH] Quorum driver take server requested retry interval into consideration --- crates/sui-core/src/authority_aggregator.rs | 5 ++++- crates/sui-core/src/quorum_driver/mod.rs | 19 ++++++++++++++----- crates/sui-core/src/quorum_driver/tests.rs | 2 +- crates/sui-core/src/test_authority_clients.rs | 6 +++--- .../unit_tests/authority_aggregator_tests.rs | 16 +++++++++++----- crates/sui-types/src/error.rs | 7 +++++++ 6 files changed, 40 insertions(+), 15 deletions(-) diff --git a/crates/sui-core/src/authority_aggregator.rs b/crates/sui-core/src/authority_aggregator.rs index e27ed8486598b4..6ac4a1d89053dd 100644 --- a/crates/sui-core/src/authority_aggregator.rs +++ b/crates/sui-core/src/authority_aggregator.rs @@ -310,6 +310,7 @@ struct ProcessTransactionState { overloaded_stake: StakeUnit, // Validators that are overloaded and request client to retry. retryable_overloaded_stake: StakeUnit, + server_requested_retry_after: Duration, // If there are conflicting transactions, we note them down and may attempt to retry conflicting_tx_digests: BTreeMap, StakeUnit)>, @@ -1047,6 +1048,7 @@ where non_retryable_stake: 0, overloaded_stake: 0, retryable_overloaded_stake: 0, + server_requested_retry_after: Duration::from_secs(1), // Must be in second granularity retryable: true, conflicting_tx_digests: Default::default(), conflicting_tx_total_stake: 0, @@ -1120,6 +1122,7 @@ where // TODO: currently retryable overload and above overload error look redundant. We want to have a unified // code path to handle both overload scenarios. state.retryable_overloaded_stake += weight; + state.server_requested_retry_after = state.server_requested_retry_after.max(Duration::from_secs(err.retry_after_secs())); } else if !retryable && !state.record_conflicting_transaction_if_any(name, weight, &err) { // We don't count conflicting transactions as non-retryable errors here @@ -1270,7 +1273,7 @@ where return AggregatorProcessTransactionError::SystemOverloadRetryAfter { overload_stake: state.retryable_overloaded_stake, errors: group_errors(state.errors), - retry_after_secs: 0, + retry_after_secs: state.server_requested_retry_after.as_secs(), }; } diff --git a/crates/sui-core/src/quorum_driver/mod.rs b/crates/sui-core/src/quorum_driver/mod.rs index 449247c6b79ec2..2a0a8430edaf7b 100644 --- a/crates/sui-core/src/quorum_driver/mod.rs +++ b/crates/sui-core/src/quorum_driver/mod.rs @@ -157,7 +157,7 @@ impl QuorumDriver { ); return Ok(()); } - self.backoff_and_enqueue(request, tx_cert, old_retry_times, client_addr) + self.backoff_and_enqueue(request, tx_cert, old_retry_times, client_addr, None) .await } @@ -168,9 +168,11 @@ impl QuorumDriver { tx_cert: Option, old_retry_times: u32, client_addr: Option, + retry_after: Option, ) -> SuiResult<()> { - let next_retry_after = - Instant::now() + Duration::from_millis(200 * u64::pow(2, old_retry_times)); + let next_retry_after = Instant::now() + + Duration::from_millis(200 * u64::pow(2, old_retry_times)) + .max(retry_after.unwrap_or(Duration::from_secs(0))); sleep_until(next_retry_after).await; let tx_cert = match tx_cert { @@ -373,7 +375,11 @@ where retry_after_secs, }) => { self.metrics.total_retryable_overload_errors.inc(); - debug!(?tx_digest, ?errors, "System overload and retry"); + debug!( + ?tx_digest, + ?errors, + "System overload and retry after secs {retry_after_secs}", + ); Err(Some(QuorumDriverError::SystemOverloadRetryAfter { overload_stake, errors, @@ -875,7 +881,9 @@ where client_addr, )); } - Some(QuorumDriverError::SystemOverloadRetryAfter { .. }) => { + Some(QuorumDriverError::SystemOverloadRetryAfter { + retry_after_secs, .. + }) => { // Special case for SystemOverloadRetryAfter error. In this case, due to that objects are already // locked inside validators, we need to perform continuous retry and ignore `max_retry_times`. // TODO: the txn can potentially be retried unlimited times, therefore, we need to bound the number @@ -887,6 +895,7 @@ where tx_cert, old_retry_times, client_addr, + Some(Duration::from_secs(retry_after_secs)), )); } Some(qd_error) => { diff --git a/crates/sui-core/src/quorum_driver/tests.rs b/crates/sui-core/src/quorum_driver/tests.rs index 3dce375ce0dd3e..1fab4a5a82fae9 100644 --- a/crates/sui-core/src/quorum_driver/tests.rs +++ b/crates/sui-core/src/quorum_driver/tests.rs @@ -547,7 +547,7 @@ async fn test_quorum_driver_handling_overload_and_retry() { // Make local authority client to always return SystemOverloadedRetryAfter error. let fault_config = LocalAuthorityClientFaultConfig { - overload_retry_after_handle_transaction: true, + overload_retry_after_handle_transaction: Some(Duration::from_secs(30)), ..Default::default() }; let mut clients = aggregator.clone_inner_clients_test_only(); diff --git a/crates/sui-core/src/test_authority_clients.rs b/crates/sui-core/src/test_authority_clients.rs index bf08091c5a8a7a..5e3a6b50821d63 100644 --- a/crates/sui-core/src/test_authority_clients.rs +++ b/crates/sui-core/src/test_authority_clients.rs @@ -39,7 +39,7 @@ pub struct LocalAuthorityClientFaultConfig { pub fail_after_handle_transaction: bool, pub fail_before_handle_confirmation: bool, pub fail_after_handle_confirmation: bool, - pub overload_retry_after_handle_transaction: bool, + pub overload_retry_after_handle_transaction: Option, } impl LocalAuthorityClientFaultConfig { @@ -76,9 +76,9 @@ impl AuthorityAPI for LocalAuthorityClient { error: "Mock error after handle_transaction".to_owned(), }); } - if self.fault_config.overload_retry_after_handle_transaction { + if let Some(duration) = self.fault_config.overload_retry_after_handle_transaction { return Err(SuiError::ValidatorOverloadedRetryAfter { - retry_after_secs: 0, + retry_after_secs: duration.as_secs(), }); } result diff --git a/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs b/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs index bc2e7291e4e439..24e0f00d831e78 100644 --- a/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs +++ b/crates/sui-core/src/unit_tests/authority_aggregator_tests.rs @@ -1958,14 +1958,17 @@ async fn test_handle_overload_retry_response() { 666, // this is a dummy value which does not matter ); - let overload_error = SuiError::ValidatorOverloadedRetryAfter { - retry_after_secs: 0, - }; let rpc_error = SuiError::RpcError("RPC".into(), "Error".into()); // Have 2f + 1 validators return the overload error and we should get the `SystemOverload` error. set_retryable_tx_info_response_error(&mut clients, &authority_keys); - set_tx_info_response_with_error(&mut clients, authority_keys.iter().skip(1), overload_error); + for (index, (name, _)) in authority_keys.iter().skip(1).enumerate() { + clients.get_mut(name).unwrap().set_tx_info_response_error( + SuiError::ValidatorOverloadedRetryAfter { + retry_after_secs: index as u64, + }, + ); + } let agg = get_genesis_agg(authorities.clone(), clients.clone()); assert_resp_err( @@ -1974,7 +1977,10 @@ async fn test_handle_overload_retry_response() { |e| { matches!( e, - AggregatorProcessTransactionError::SystemOverloadRetryAfter { .. } + AggregatorProcessTransactionError::SystemOverloadRetryAfter { + retry_after_secs, + .. + } if *retry_after_secs == (authority_keys.len() as u64 - 2) ) }, |e| { diff --git a/crates/sui-types/src/error.rs b/crates/sui-types/src/error.rs index 52def41456b8a5..e62a2a902b5129 100644 --- a/crates/sui-types/src/error.rs +++ b/crates/sui-types/src/error.rs @@ -820,6 +820,13 @@ impl SuiError { pub fn is_retryable_overload(&self) -> bool { matches!(self, SuiError::ValidatorOverloadedRetryAfter { .. }) } + + pub fn retry_after_secs(&self) -> u64 { + match self { + SuiError::ValidatorOverloadedRetryAfter { retry_after_secs } => *retry_after_secs, + _ => 0, + } + } } impl Ord for SuiError {