From 05a8ba662f0afdd4a4e6e2f4e61e4ca2458d666c Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Wed, 14 Aug 2024 16:55:29 +0300 Subject: [PATCH] Fix OurViewChange small race (#5356) Always queue OurViewChange event before we send view changes to our peers, because otherwise we risk the peers sending us a message that can be processed by our subsystems before OurViewChange. Normally, this is not really a problem because the latency of the ViewChange we send to our peers is way higher that our subsystem processing OurViewChange, however on testnets like versi where CPU is sometimes overcommitted this race gets triggered occasionally, so let's fix it by sending the messages in the right order. --------- Signed-off-by: Alexandru Gheorghe --- polkadot/node/network/bridge/src/rx/mod.rs | 30 +++++++++++----------- prdoc/pr_5356.prdoc | 18 +++++++++++++ 2 files changed, 33 insertions(+), 15 deletions(-) create mode 100644 prdoc/pr_5356.prdoc diff --git a/polkadot/node/network/bridge/src/rx/mod.rs b/polkadot/node/network/bridge/src/rx/mod.rs index 56965ce6ba40..7745c42f78a1 100644 --- a/polkadot/node/network/bridge/src/rx/mod.rs +++ b/polkadot/node/network/bridge/src/rx/mod.rs @@ -962,6 +962,21 @@ fn update_our_view( ) }; + let our_view = OurView::new( + live_heads.iter().take(MAX_VIEW_HEADS).cloned().map(|a| (a.hash, a.span)), + finalized_number, + ); + + dispatch_validation_event_to_all_unbounded( + NetworkBridgeEvent::OurViewChange(our_view.clone()), + ctx.sender(), + ); + + dispatch_collation_event_to_all_unbounded( + NetworkBridgeEvent::OurViewChange(our_view), + ctx.sender(), + ); + let v1_validation_peers = filter_by_peer_version(&validation_peers, ValidationVersion::V1.into()); let v1_collation_peers = filter_by_peer_version(&collation_peers, CollationVersion::V1.into()); @@ -1007,21 +1022,6 @@ fn update_our_view( metrics, notification_sinks, ); - - let our_view = OurView::new( - live_heads.iter().take(MAX_VIEW_HEADS).cloned().map(|a| (a.hash, a.span)), - finalized_number, - ); - - dispatch_validation_event_to_all_unbounded( - NetworkBridgeEvent::OurViewChange(our_view.clone()), - ctx.sender(), - ); - - dispatch_collation_event_to_all_unbounded( - NetworkBridgeEvent::OurViewChange(our_view), - ctx.sender(), - ); } // Handle messages on a specific v1 peer-set. The peer is expected to be connected on that diff --git a/prdoc/pr_5356.prdoc b/prdoc/pr_5356.prdoc new file mode 100644 index 000000000000..a306be335440 --- /dev/null +++ b/prdoc/pr_5356.prdoc @@ -0,0 +1,18 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Fix OurViewChange small race + +doc: + - audience: Node Dev + description: | + Always queue OurViewChange event before we send view changes to our peers, because otherwise we risk + the peers sending us a message that can be processed by our subsystems before OurViewChange. + Normally, this is not really a problem because the latency of the ViewChange we send to our peers + is way higher than that of our subsystem processing OurViewChange, however on testnets like versi + where CPUs are sometimes overcommitted this race gets triggered occasionally, so let's fix it by + sending the messages in the right order. + +crates: + - name: polkadot-network-bridge + bump: minor