paritytech · lexnv · Aug 5, 2024 · Jul 15, 2024 · Jul 15, 2024 · Jul 15, 2024
diff --git a/prdoc/pr_5029.prdoc b/prdoc/pr_5029.prdoc
@@ -0,0 +1,16 @@
+# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0
+# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json
+
+title: Backoff slow peers to avoid duplicate requests
+
+doc:
+  - audience: Node Dev
+    description: |
+      This PR introduces a backoff strategy mechanism. Whenever a peer disconnects with an inflight
+      block (or state) request, the peer is backed off for a period of time before receiving requests.
+      After several attempts, the peer is disconnected and banned. The strategy aims to offload
+      the pressure from peers that are slow to respond or overloaded.
+
+crates:
+- name: sc-network-sync
+  bump: minor
diff --git a/substrate/client/network/sync/src/engine.rs b/substrate/client/network/sync/src/engine.rs
@@ -536,6 +536,10 @@ where
 			},
 			BlockAnnounceValidationResult::Failure { peer_id, disconnect } => {
 				if disconnect {
+					log::debug!(
+						target: LOG_TARGET,
+						"Disconnecting peer {peer_id} due to block announce validation failure",
+					);
 					self.network_service
 						.disconnect_peer(peer_id, self.block_announce_protocol_name.clone());
 				}

diff --git a/substrate/client/network/sync/src/strategy.rs b/substrate/client/network/sync/src/strategy.rs
@@ -20,6 +20,7 @@
 //! and specific syncing algorithms.
 
 pub mod chain_sync;
+mod disconnected_peers;
 mod state;
 pub mod state_sync;
 pub mod warp;

diff --git a/substrate/client/network/sync/src/strategy/chain_sync.rs b/substrate/client/network/sync/src/strategy/chain_sync.rs
@@ -33,6 +33,7 @@ use crate::{
 	justification_requests::ExtraRequests,
 	schema::v1::StateResponse,
 	strategy::{
+		disconnected_peers::DisconnectedPeers,
 		state_sync::{ImportResult, StateSync, StateSyncProvider},
 		warp::{WarpSyncPhase, WarpSyncProgress},
 	},
@@ -250,6 +251,7 @@ pub struct ChainSync<B: BlockT, Client> {
 	client: Arc<Client>,
 	/// The active peers that we are using to sync and their PeerSync status
 	peers: HashMap<PeerId, PeerSync<B>>,
+	disconnected_peers: DisconnectedPeers,
 	/// A `BlockCollection` of blocks that are being downloaded from peers
 	blocks: BlockCollection<B>,
 	/// The best block number in our queue of blocks to import
@@ -378,6 +380,7 @@ where
 		let mut sync = Self {
 			client,
 			peers: HashMap::new(),
+			disconnected_peers: DisconnectedPeers::new(),
 			blocks: BlockCollection::new(),
 			best_queued_hash: Default::default(),
 			best_queued_number: Zero::zero(),
@@ -1141,7 +1144,17 @@ where
 		if let Some(gap_sync) = &mut self.gap_sync {
 			gap_sync.blocks.clear_peer_download(peer_id)
 		}
-		self.peers.remove(peer_id);
+
+		if let Some(state) = self.peers.remove(peer_id) {
+			if !state.state.is_available() {
+				if let Some(bad_peer) =
+					self.disconnected_peers.on_disconnect_during_request(*peer_id)
+				{
+					self.actions.push(ChainSyncAction::DropPeer(bad_peer));
+				}
+			}
+		}
+
 		self.extra_justifications.peer_disconnected(peer_id);
 		self.allowed_requests.set_all();
 		self.fork_targets.retain(|_, target| {
@@ -1541,10 +1554,14 @@ where
 		let max_parallel = if is_major_syncing { 1 } else { self.max_parallel_downloads };
 		let max_blocks_per_request = self.max_blocks_per_request;
 		let gap_sync = &mut self.gap_sync;
+		let disconnected_peers = &mut self.disconnected_peers;
 		self.peers
 			.iter_mut()
 			.filter_map(move |(&id, peer)| {
-				if !peer.state.is_available() || !allowed_requests.contains(&id) {
+				if !peer.state.is_available() ||
+					!allowed_requests.contains(&id) ||
+					!disconnected_peers.is_peer_available(&id)
+				{
 					return None
 				}
 
@@ -1656,7 +1673,10 @@ where
 			}
 
 			for (id, peer) in self.peers.iter_mut() {
-				if peer.state.is_available() && peer.common_number >= sync.target_number() {
+				if peer.state.is_available() &&
+					peer.common_number >= sync.target_number() &&
+					self.disconnected_peers.is_peer_available(&id)
+				{
 					peer.state = PeerSyncState::DownloadingState;
 					let request = sync.next_request();
 					trace!(target: LOG_TARGET, "New StateRequest for {}: {:?}", id, request);

diff --git a/substrate/client/network/sync/src/strategy/disconnected_peers.rs b/substrate/client/network/sync/src/strategy/disconnected_peers.rs
@@ -0,0 +1,196 @@
+// This file is part of Substrate.
+
+// Copyright (C) Parity Technologies (UK) Ltd.
+// SPDX-License-Identifier: GPL-3.0-or-later WITH Classpath-exception-2.0
+
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+use crate::types::BadPeer;
+use sc_network::ReputationChange as Rep;
+use sc_network_types::PeerId;
+use schnellru::{ByLength, LruMap};
+
+const LOG_TARGET: &str = "sync::disconnected_peers";
+
+/// The maximum number of disconnected peers to keep track of.
+///
+/// When a peer disconnects, we must keep track if it was in the middle of a request.
+/// The peer may disconnect because it cannot keep up with the number of requests
+/// (ie not having enough resources available to handle the requests); or because it is malicious.
+const MAX_DISCONNECTED_PEERS_STATE: u32 = 512;
+
+/// The time we are going to backoff a peer that has disconnected with an inflight request.
+///
+/// The backoff time is calculated as `num_disconnects * DISCONNECTED_PEER_BACKOFF_SECONDS`.
+/// This is to prevent submitting a request to a peer that has disconnected because it could not
+/// keep up with the number of requests.
+///
+/// The peer may disconnect due to the keep-alive timeout, however disconnections without
+/// an inflight request are not tracked.
+const DISCONNECTED_PEER_BACKOFF_SECONDS: u64 = 60;
+
+/// Maximum number of disconnects with a request in flight before a peer is banned.
+const MAX_NUM_DISCONNECTS: u64 = 3;
+
+/// Peer disconnected with a request in flight after backoffs.
+///
+/// The peer may be slow to respond to the request after backoffs, or it refuses to respond.
+/// Report the peer and let the reputation system handle disconnecting the peer.
+pub const REPUTATION_REPORT: Rep = Rep::new_fatal("Peer disconnected with inflight after backoffs");
+
+/// The state of a disconnected peer with a request in flight.
+#[derive(Debug)]
+struct DisconnectedState {
+	/// The total number of disconnects.
+	num_disconnects: u64,
+	/// The time at the last disconnect.
+	last_disconnect: std::time::Instant,
+}
+
+impl DisconnectedState {
+	/// Create a new `DisconnectedState`.
+	pub fn new() -> Self {
+		Self { num_disconnects: 1, last_disconnect: std::time::Instant::now() }
+	}
+
+	/// Increment the number of disconnects.
+	pub fn increment(&mut self) {
+		self.num_disconnects = self.num_disconnects.saturating_add(1);
+		self.last_disconnect = std::time::Instant::now();
+	}
+
+	/// Get the number of disconnects.
+	pub fn num_disconnects(&self) -> u64 {
+		self.num_disconnects
+	}
+
+	/// Get the time of the last disconnect.
+	pub fn last_disconnect(&self) -> std::time::Instant {
+		self.last_disconnect
+	}
+}
+
+/// Tracks the state of disconnected peers with a request in flight.
+///
+/// This helps to prevent submitting requests to peers that have disconnected
+/// before responding to the request to offload the peer.
+pub struct DisconnectedPeers {
+	/// The state of disconnected peers.
+	disconnected_peers: LruMap<PeerId, DisconnectedState>,
+	/// Backoff duration in seconds.
+	backoff_seconds: u64,
+}
+
+impl DisconnectedPeers {
+	/// Create a new `DisconnectedPeers`.
+	pub fn new() -> Self {
+		Self {
+			disconnected_peers: LruMap::new(ByLength::new(MAX_DISCONNECTED_PEERS_STATE)),
+			backoff_seconds: DISCONNECTED_PEER_BACKOFF_SECONDS,
+		}
+	}
+
+	/// Insert a new peer to the persistent state if not seen before, or update the state if seen.
+	///
+	/// Returns true if the peer should be disconnected.
+	pub fn on_disconnect_during_request(&mut self, peer: PeerId) -> Option<BadPeer> {
+		if let Some(state) = self.disconnected_peers.get(&peer) {
+			state.increment();
+
+			let should_ban = state.num_disconnects() >= MAX_NUM_DISCONNECTS;
+			log::debug!(
+				target: LOG_TARGET,
+				"Disconnected known peer {peer} state: {state:?}, should ban: {should_ban}",
+			);
+
+			should_ban.then(|| {
+				// We can lose track of the peer state and let the banning mechanism handle
+				// the peer backoff.
+				//
+				// After the peer banning expires, if the peer continues to misbehave, it will be
+				// backed off again.
+				self.disconnected_peers.remove(&peer);
+				BadPeer(peer, REPUTATION_REPORT)
+			})
+		} else {
+			log::debug!(
+				target: LOG_TARGET,
+				"Added peer {peer} for the first time"
+			);
+			// First time we see this peer.
+			self.disconnected_peers.insert(peer, DisconnectedState::new());
+			None
+		}
+	}
+
+	/// Check if a peer is available for queries.
+	pub fn is_peer_available(&mut self, peer_id: &PeerId) -> bool {
+		let Some(state) = self.disconnected_peers.get(peer_id) else {
+			return true;
+		};
+
+		let elapsed = state.last_disconnect().elapsed();
+		if elapsed.as_secs() >= self.backoff_seconds * state.num_disconnects {
+			log::debug!(target: LOG_TARGET, "Peer {peer_id} is available for queries");
+			self.disconnected_peers.remove(peer_id);
+			true
+		} else {
+			log::debug!(target: LOG_TARGET,"Peer {peer_id} is backedoff");
+			false
+		}
+	}
+}
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+	use std::time::Duration;
+
+	#[test]
+	fn test_disconnected_peer_state() {
+		let mut state = DisconnectedPeers::new();
+		let peer = PeerId::random();
+
+		// Is not part of the disconnected peers yet.
+		assert_eq!(state.is_peer_available(&peer), true);
+
+		for _ in 0..MAX_NUM_DISCONNECTS - 1 {
+			assert!(state.on_disconnect_during_request(peer).is_none());
+			assert_eq!(state.is_peer_available(&peer), false);
+		}
+
+		assert!(state.on_disconnect_during_request(peer).is_some());
+		// Peer is supposed to get banned and disconnected.
+		// The state ownership moves to the PeerStore.
+		assert!(state.disconnected_peers.get(&peer).is_none());
+	}
+
+	#[test]
+	fn ensure_backoff_time() {
+		const TEST_BACKOFF_SECONDS: u64 = 2;
+		let mut state = DisconnectedPeers {
+			disconnected_peers: LruMap::new(ByLength::new(1)),
+			backoff_seconds: TEST_BACKOFF_SECONDS,
+		};
+		let peer = PeerId::random();
+
+		assert!(state.on_disconnect_during_request(peer).is_none());
+		assert_eq!(state.is_peer_available(&peer), false);
+
+		// Wait until the backoff time has passed
+		std::thread::sleep(Duration::from_secs(TEST_BACKOFF_SECONDS + 1));
+
+		assert_eq!(state.is_peer_available(&peer), true);
+	}
+}