From df828ac1560e6532df472dc552339d7914ae66e6 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 18 Mar 2021 09:11:07 +0100 Subject: [PATCH 01/30] Indentation fix. --- node/network/bridge/src/network.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/node/network/bridge/src/network.rs b/node/network/bridge/src/network.rs index 28a32a5d33f7..5e87f27b2516 100644 --- a/node/network/bridge/src/network.rs +++ b/node/network/bridge/src/network.rs @@ -218,14 +218,14 @@ impl Network for Arc> { Recipient::Peer(peer_id) => Some(peer_id), Recipient::Authority(authority) => authority_discovery - .get_addresses_by_authority_id(authority) - .await - .and_then(|addrs| { - addrs - .into_iter() - .find_map(|addr| peer_id_from_multiaddr(&addr)) - }), - }; + .get_addresses_by_authority_id(authority) + .await + .and_then(|addrs| { + addrs + .into_iter() + .find_map(|addr| peer_id_from_multiaddr(&addr)) + }), + }; let peer_id = match peer_id { None => { From 06d4d90f870b570fa850338a6f28f4c2c14a84e2 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 18 Mar 2021 09:18:53 +0100 Subject: [PATCH 02/30] Prepare request-response for PoV fetching. --- .../availability-distribution/src/error.rs | 5 +- .../availability-distribution/src/lib.rs | 9 +- .../availability-distribution/src/metrics.rs | 27 +++++- .../src/responder.rs | 97 +++++++++++++++++-- node/network/bridge/src/multiplexer.rs | 5 + node/network/protocol/Cargo.toml | 1 + .../protocol/src/request_response/mod.rs | 15 ++- .../protocol/src/request_response/request.rs | 15 ++- .../protocol/src/request_response/v1.rs | 28 +++++- node/subsystem/src/messages.rs | 45 ++++++--- 10 files changed, 218 insertions(+), 29 deletions(-) diff --git a/node/network/availability-distribution/src/error.rs b/node/network/availability-distribution/src/error.rs index 9960e7f1d7d0..47c556bd8532 100644 --- a/node/network/availability-distribution/src/error.rs +++ b/node/network/availability-distribution/src/error.rs @@ -28,8 +28,11 @@ use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError}; /// Errors of this subsystem. #[derive(Debug, Error)] pub enum Error { - #[error("Response channel to obtain QueryChunk failed")] + #[error("Response channel to obtain chunk failed")] QueryChunkResponseChannel(#[source] oneshot::Canceled), + + #[error("Response channel to obtain available data failed")] + QueryAvailableDataResponseChannel(#[source] oneshot::Canceled), #[error("Receive channel closed")] IncomingMessageChannel(#[source] SubsystemError), diff --git a/node/network/availability-distribution/src/lib.rs b/node/network/availability-distribution/src/lib.rs index 07f3db51b5a2..db34ba84527c 100644 --- a/node/network/availability-distribution/src/lib.rs +++ b/node/network/availability-distribution/src/lib.rs @@ -34,7 +34,7 @@ use requester::Requester; /// Responding to erasure chunk requests: mod responder; -use responder::answer_request_log; +use responder::{answer_chunk_request_log, answer_pov_request_log}; /// Cache for session information. 
mod session_cache; @@ -123,7 +123,12 @@ impl AvailabilityDistributionSubsystem { FromOverseer::Communication { msg: AvailabilityDistributionMessage::AvailabilityFetchingRequest(req), } => { - answer_request_log(&mut ctx, req, &self.metrics).await + answer_chunk_request_log(&mut ctx, req, &self.metrics).await + } + FromOverseer::Communication { + msg: AvailabilityDistributionMessage::PoVFetchingRequest(req), + } => { + answer_pov_request_log(&mut ctx, req, &self.metrics).await } } } diff --git a/node/network/availability-distribution/src/metrics.rs b/node/network/availability-distribution/src/metrics.rs index c07500996fa2..925bbc8fe430 100644 --- a/node/network/availability-distribution/src/metrics.rs +++ b/node/network/availability-distribution/src/metrics.rs @@ -24,7 +24,7 @@ pub const SUCCEEDED: &'static str = "succeeded"; /// Label for fail counters. pub const FAILED: &'static str = "failed"; -/// Label for chunks that could not be served, because they were not available. +/// Label for chunks/PoVs that could not be served, because they were not available. pub const NOT_FOUND: &'static str = "not-found"; /// Availability Distribution metrics. @@ -47,6 +47,12 @@ struct MetricsInner { /// to a chunk request. This includes `NoSuchChunk` responses. served_chunks: CounterVec, + /// Number of PoVs served. + /// + /// Note: Right now, `Succeeded` gets incremented whenever we were able to successfully respond + /// to a PoV request. This includes `NoSuchPoV` responses. + served_povs: CounterVec, + /// Number of times our first set of validators did not provide the needed chunk and we had to /// query further validators. retries: Counter, @@ -66,12 +72,19 @@ impl Metrics { } /// Increment counter on served chunks. - pub fn on_served(&self, label: &'static str) { + pub fn on_served_chunk(&self, label: &'static str) { if let Some(metrics) = &self.0 { metrics.served_chunks.with_label_values(&[label]).inc() } } + /// Increment counter on served PoVs. + pub fn on_served_pov(&self, label: &'static str) { + if let Some(metrics) = &self.0 { + metrics.served_povs.with_label_values(&[label]).inc() + } + } + /// Increment retry counter. pub fn on_retry(&self) { if let Some(metrics) = &self.0 { @@ -103,6 +116,16 @@ impl metrics::Metrics for Metrics { )?, registry, )?, + served_povs: prometheus::register( + CounterVec::new( + Opts::new( + "parachain_served_povs_total", + "Total number of povs served by this backer.", + ), + &["success"] + )?, + registry, + )?, retries: prometheus::register( Counter::new( "parachain_fetch_retries_total", diff --git a/node/network/availability-distribution/src/responder.rs b/node/network/availability-distribution/src/responder.rs index 6704de72a8b6..07ed4ee9d971 100644 --- a/node/network/availability-distribution/src/responder.rs +++ b/node/network/availability-distribution/src/responder.rs @@ -28,10 +28,36 @@ use polkadot_subsystem::{ use crate::error::{Error, Result}; use crate::{LOG_TARGET, metrics::{Metrics, SUCCEEDED, FAILED, NOT_FOUND}}; -/// Variant of `answer_request` that does Prometheus metric and logging on errors. +/// Variant of `answer_pov_request` that does Prometheus metric and logging on errors. +/// +/// Any errors of `answer_pov_request` will simply be logged. 
+pub async fn answer_pov_request_log<Context>(
+	ctx: &mut Context,
+	req: IncomingRequest<v1::PoVFetchingRequest>,
+	metrics: &Metrics,
+) -> ()
+where
+	Context: SubsystemContext,
+{
+	let res = answer_pov_request(ctx, req).await;
+	match res {
+		Ok(result) =>
+			metrics.on_served_pov(if result {SUCCEEDED} else {NOT_FOUND}),
+		Err(err) => {
+			tracing::warn!(
+				target: LOG_TARGET,
+				err= ?err,
+				"Serving PoV failed with error"
+			);
+			metrics.on_served_pov(FAILED);
+		}
+	}
+}
+
+/// Variant of `answer_chunk_request` that does Prometheus metric and logging on errors.
 ///
 /// Any errors of `answer_request` will simply be logged.
-pub async fn answer_request_log<Context>(
+pub async fn answer_chunk_request_log<Context>(
 	ctx: &mut Context,
 	req: IncomingRequest<v1::AvailabilityFetchingRequest>,
 	metrics: &Metrics,
@@ -39,10 +65,10 @@
 where
 	Context: SubsystemContext,
 {
-	let res = answer_request(ctx, req).await;
+	let res = answer_chunk_request(ctx, req).await;
 	match res {
 		Ok(result) =>
-			metrics.on_served(if result {SUCCEEDED} else {NOT_FOUND}),
+			metrics.on_served_chunk(if result {SUCCEEDED} else {NOT_FOUND}),
 		Err(err) => {
 			tracing::warn!(
 				target: LOG_TARGET,
@@ -54,10 +80,52 @@ where
 	}
 }
 
+/// Answer an incoming PoV fetch request by querying the av store.
+///
+/// Returns: Ok(true) if the PoV was found and served.
+pub async fn answer_pov_request<Context>(
+	ctx: &mut Context,
+	req: IncomingRequest<v1::PoVFetchingRequest>,
+) -> Result<bool>
+where
+	Context: SubsystemContext,
+{
+	let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request");
+	let _child_span = span.child_builder("answer-chunk-request")
+		.with_chunk_index(req.payload.index.0)
+		.build();
+
+	let av_data = query_available_data(ctx, req.payload.candidate_hash, req.payload.index).await?;
+
+	let result = av_data.is_some();
+
+	let response = match av_data {
+		None => v1::PoVFetchingResponse::NoSuchPoV,
+		Some(av_data) => {
+			let pov = match CompressedPoV::compress(&av_data.pov) {
+				Ok(pov) => pov,
+				Err(error) => {
+					tracing::error!(
+						target: LOG_TARGET,
+						error = ?error,
+						"Failed to create `CompressedPov`",
+					);
+					// this should really not happen, let this request time out:
+					return Ok(false)
+				}
+			};
+			v1::PoVFetchingResponse::PoV(pov)
+		}
+	};
+
+	req.send_response(response).map_err(|_| Error::SendResponse)?;
+	Ok(result)
+}
+
 /// Answer an incoming chunk request by querying the av store.
 ///
 /// Returns: Ok(true) if chunk was found and served.
-pub async fn answer_request<Context>(
+pub async fn answer_chunk_request<Context>(
 	ctx: &mut Context,
 	req: IncomingRequest<v1::AvailabilityFetchingRequest>,
 ) -> Result<bool>
 where
 	Context: SubsystemContext,
 {
 	let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request");
-	span.add_stage(jaeger::Stage::AvailabilityDistribution);
 	let _child_span = span.child_builder("answer-chunk-request")
 		.with_chunk_index(req.payload.index.0)
 		.build();
@@ -101,3 +168,21 @@ where
 
 	rx.await.map_err(|e| Error::QueryChunkResponseChannel(e))
 }
+
+/// Query PoV from the availability store.
+#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))]
+async fn query_available_data<Context>(
+	ctx: &mut Context,
+	candidate_hash: CandidateHash,
+) -> Result<Option<AvailableData>>
+where
+	Context: SubsystemContext,
+{
+	let (tx, rx) = oneshot::channel();
+	ctx.send_message(AllMessages::AvailabilityStore(
+		AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx),
+	))
+	.await;
+
+	rx.await.map_err(|e| Error::QueryAvailableDataResponseChannel(e))
+}
diff --git a/node/network/bridge/src/multiplexer.rs b/node/network/bridge/src/multiplexer.rs
index 3901a91078ea..b8c5eb9cc49b 100644
--- a/node/network/bridge/src/multiplexer.rs
+++ b/node/network/bridge/src/multiplexer.rs
@@ -141,6 +141,11 @@ fn multiplex_single(
 			decode_with_peer::<v1::CollationFetchingRequest>(peer, payload)?,
 			pending_response,
 		)),
+		Protocol::PoVFetching => From::from(IncomingRequest::new(
+			peer,
+			decode_with_peer::<v1::PoVFetchingRequest>(peer, payload)?,
+			pending_response,
+		)),
 	};
 	Ok(r)
 }
diff --git a/node/network/protocol/Cargo.toml b/node/network/protocol/Cargo.toml
index f7a0c72b2a08..6f32c346e53b 100644
--- a/node/network/protocol/Cargo.toml
+++ b/node/network/protocol/Cargo.toml
@@ -13,3 +13,4 @@ parity-scale-codec = { version = "2.0.0", default-features = false, features = [
 sc-network = { git = "https://github.com/paritytech/substrate", branch = "master" }
 strum = { version = "0.20", features = ["derive"] }
 futures = "0.3.12"
+thiserror = "1.0.23"
diff --git a/node/network/protocol/src/request_response/mod.rs b/node/network/protocol/src/request_response/mod.rs
index 53d12926c401..5f1030cab82a 100644
--- a/node/network/protocol/src/request_response/mod.rs
+++ b/node/network/protocol/src/request_response/mod.rs
@@ -59,6 +59,8 @@ pub enum Protocol {
 	AvailabilityFetching,
 	/// Protocol for fetching collations from collators.
 	CollationFetching,
+	/// Protocol for fetching seconded PoVs from validators of the same group.
+	PoVFetching,
 }
 
 /// Default request timeout in seconds.
@@ -66,7 +68,7 @@ pub enum Protocol {
 /// When decreasing this value, take into account that the very first request might need to open a
 /// connection, which can be slow. If this causes problems, we should ensure connectivity via peer
 /// sets.
-const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3);
+const DEFAULT_REQUEST_TIMEOUT: Duration = Duration::from_secs(3);
 
 /// Request timeout where we can assume the connection is already open (e.g. we have peers in a
 /// peer set as well).
@@ -107,6 +109,14 @@ impl Protocol {
 				request_timeout: DEFAULT_REQUEST_TIMEOUT_CONNECTED,
 				inbound_queue: Some(tx),
 			},
+			Protocol::PoVFetching => RequestResponseConfig {
+				name: p_name,
+				max_request_size: 1_000,
+				// Same consideration as for `CollationFetching`.
+				max_response_size: 10_000_000,
+				request_timeout: DEFAULT_REQUEST_TIMEOUT,
+				inbound_queue: Some(tx),
+			},
 		};
 		(rx, cfg)
 	}
@@ -122,6 +132,8 @@ impl Protocol {
 			Protocol::AvailabilityFetching => 100,
 			// 10 seems reasonable, considering group sizes of max 10 validators.
 			Protocol::CollationFetching => 10,
+			// 10 seems reasonable, considering group sizes of max 10 validators.
+			Protocol::PoVFetching => 10,
 		}
 	}
 
@@ -135,6 +147,7 @@ impl Protocol {
 		match self {
 			Protocol::AvailabilityFetching => "/polkadot/req_availability/1",
 			Protocol::CollationFetching => "/polkadot/req_collation/1",
+			Protocol::PoVFetching => "/polkadot/req_pov/1",
 		}
 	}
 }
diff --git a/node/network/protocol/src/request_response/request.rs b/node/network/protocol/src/request_response/request.rs
index da0d4cd53b3f..3ef25c38073c 100644
--- a/node/network/protocol/src/request_response/request.rs
+++ b/node/network/protocol/src/request_response/request.rs
@@ -17,6 +17,7 @@
 
 use futures::channel::oneshot;
 use futures::prelude::Future;
+use thiserror::Error;
 
 use parity_scale_codec::{Decode, Encode, Error as DecodingError};
 use sc_network as network;
 use sc_network::config as netconfig;
@@ -42,6 +43,8 @@ pub enum Requests {
 	AvailabilityFetching(OutgoingRequest<v1::AvailabilityFetchingRequest>),
 	/// Fetch a collation from a collator which previously announced it.
 	CollationFetching(OutgoingRequest<v1::CollationFetchingRequest>),
+	/// Fetch a PoV from a validator which previously sent out a seconded statement.
+	PoVFetching(OutgoingRequest<v1::PoVFetchingRequest>),
 }
 
 impl Requests {
@@ -50,6 +53,7 @@ impl Requests {
 		match self {
 			Self::AvailabilityFetching(_) => Protocol::AvailabilityFetching,
 			Self::CollationFetching(_) => Protocol::CollationFetching,
+			Self::PoVFetching(_) => Protocol::PoVFetching,
 		}
 	}
 
@@ -64,6 +68,7 @@ impl Requests {
 		match self {
 			Self::AvailabilityFetching(r) => r.encode_request(),
 			Self::CollationFetching(r) => r.encode_request(),
+			Self::PoVFetching(r) => r.encode_request(),
 		}
 	}
 }
@@ -92,15 +97,19 @@ pub struct OutgoingRequest<Req> {
 }
 
 /// Any error that can occur when sending a request.
+#[derive(Debug, Error)]
 pub enum RequestError {
 	/// Response could not be decoded.
-	InvalidResponse(DecodingError),
+	#[error("Response could not be decoded")]
+	InvalidResponse(#[source] DecodingError),
 
 	/// Some error in substrate/libp2p happened.
-	NetworkError(network::RequestFailure),
+	#[error("Some network error occurred")]
+	NetworkError(#[source] network::RequestFailure),
 
 	/// Response got canceled by networking.
-	Canceled(oneshot::Canceled),
+	#[error("Response channel got canceled")]
+	Canceled(#[source] oneshot::Canceled),
 }
 
 /// Responses received for an `OutgoingRequest`.
diff --git a/node/network/protocol/src/request_response/v1.rs b/node/network/protocol/src/request_response/v1.rs
index 269b55424b6a..3e4e7adcbb73 100644
--- a/node/network/protocol/src/request_response/v1.rs
+++ b/node/network/protocol/src/request_response/v1.rs
@@ -18,7 +18,7 @@
 
 use parity_scale_codec::{Decode, Encode};
 
-use polkadot_primitives::v1::{CandidateHash, CandidateReceipt, ErasureChunk, ValidatorIndex, CompressedPoV, Hash};
+use polkadot_primitives::v1::{CandidateHash, CandidateReceipt, ErasureChunk, ValidatorIndex, CompressedPoV, Hash, CandidateDescriptor};
 use polkadot_primitives::v1::Id as ParaId;
 
 use super::request::IsRequest;
@@ -101,3 +101,29 @@ impl IsRequest for CollationFetchingRequest {
 	type Response = CollationFetchingResponse;
 	const PROTOCOL: Protocol = Protocol::CollationFetching;
 }
+
+/// Request a seconded PoV at that relay-parent.
+#[derive(Debug, Clone, Encode, Decode)]
+pub struct PoVFetchingRequest {
+	/// Relay parent we want a PoV for.
+	pub relay_parent: Hash,
+	/// Candidate descriptor.
+	pub descriptor: CandidateDescriptor,
+}
+
+/// Responses to `PoVFetchingRequest`.
+#[derive(Debug, Clone, Encode, Decode)]
+pub enum PoVFetchingResponse {
+	/// Deliver requested PoV.
+	#[codec(index = 0)]
+	PoV(CompressedPoV),
+	/// PoV was not found in store.
+ #[codec(index = 1)] + NoSuchPoV, +} + +impl IsRequest for PoVFetchingRequest { + type Response = PoVFetchingResponse; + const PROTOCOL: Protocol = Protocol::PoVFetching; +} + diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index 0df612bd37f7..c5c2b091a4cd 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -260,10 +260,12 @@ impl NetworkBridgeMessage { } /// Availability Distribution Message. -#[derive(Debug, derive_more::From)] +#[derive(Debug)] pub enum AvailabilityDistributionMessage { /// Incoming network request for an availability chunk. - AvailabilityFetchingRequest(IncomingRequest) + AvailabilityFetchingRequest(IncomingRequest), + /// Incoming network request for a seconded PoV. + PoVFetchingRequest(IncomingRequest), } /// Availability Recovery Message. @@ -281,14 +283,14 @@ pub enum AvailabilityRecoveryMessage { NetworkBridgeUpdateV1(NetworkBridgeEvent), } -impl AvailabilityDistributionMessage { - /// If the current variant contains the relay parent hash, return it. - pub fn relay_parent(&self) -> Option { - match self { - Self::AvailabilityFetchingRequest(_) => None, - } - } -} +// impl AvailabilityDistributionMessage { +// /// If the current variant contains the relay parent hash, return it. +// pub fn relay_parent(&self) -> Option { +// match self { +// Self::AvailabilityFetchingRequest(_) => None, +// } +// } +// } /// Bitfield distribution message. #[derive(Debug, derive_more::From)] @@ -736,18 +738,35 @@ pub enum AllMessages { GossipSupport(GossipSupportMessage), } +impl From> for AvailabilityDistributionMessage { + fn from(req: IncomingRequest) -> Self { + Self::PoVFetchingRequest(req) + } +} +impl From> for AvailabilityDistributionMessage { + fn from(req: IncomingRequest) -> Self { + Self::AvailabilityFetchingRequest(req) + } +} impl From> for AllMessages { fn from(req: IncomingRequest) -> Self { From::::from(From::from(req)) } } + +impl From> for CollatorProtocolMessage { + fn from(req: IncomingRequest) -> Self { + Self::CollationFetchingRequest(req) + } +} impl From> for AllMessages { fn from(req: IncomingRequest) -> Self { From::::from(From::from(req)) } } -impl From> for CollatorProtocolMessage { - fn from(req: IncomingRequest) -> Self { - Self::CollationFetchingRequest(req) + +impl From> for AllMessages { + fn from(req: IncomingRequest) -> Self { + From::::from(From::from(req)) } } From 6a940eb0c6d13039dfa543a12d6ecbf91a5fc857 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 18 Mar 2021 10:46:33 +0100 Subject: [PATCH 03/30] Drop old PoV distribution. 
--- Cargo.lock | 20 - Cargo.toml | 1 - node/network/pov-distribution/Cargo.toml | 25 - node/network/pov-distribution/src/error.rs | 33 - node/network/pov-distribution/src/lib.rs | 887 ----------- node/network/pov-distribution/src/tests.rs | 1570 -------------------- node/service/Cargo.toml | 2 - 7 files changed, 2538 deletions(-) delete mode 100644 node/network/pov-distribution/Cargo.toml delete mode 100644 node/network/pov-distribution/src/error.rs delete mode 100644 node/network/pov-distribution/src/lib.rs delete mode 100644 node/network/pov-distribution/src/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 6b53e9089cad..3b7ccf112e49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5803,25 +5803,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "polkadot-pov-distribution" -version = "0.1.0" -dependencies = [ - "assert_matches", - "env_logger 0.8.2", - "futures 0.3.12", - "log", - "polkadot-node-network-protocol", - "polkadot-node-subsystem", - "polkadot-node-subsystem-test-helpers", - "polkadot-node-subsystem-util", - "polkadot-primitives", - "sp-core", - "sp-keyring", - "thiserror", - "tracing", -] - [[package]] name = "polkadot-primitives" version = "0.8.29" @@ -6107,7 +6088,6 @@ dependencies = [ "polkadot-node-subsystem-util", "polkadot-overseer", "polkadot-parachain", - "polkadot-pov-distribution", "polkadot-primitives", "polkadot-rpc", "polkadot-runtime", diff --git a/Cargo.toml b/Cargo.toml index b1b5068fb67b..5948a18ee905 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,6 @@ members = [ "node/core/runtime-api", "node/network/approval-distribution", "node/network/bridge", - "node/network/pov-distribution", "node/network/protocol", "node/network/statement-distribution", "node/network/bitfield-distribution", diff --git a/node/network/pov-distribution/Cargo.toml b/node/network/pov-distribution/Cargo.toml deleted file mode 100644 index ae405638c4c6..000000000000 --- a/node/network/pov-distribution/Cargo.toml +++ /dev/null @@ -1,25 +0,0 @@ -[package] -name = "polkadot-pov-distribution" -version = "0.1.0" -authors = ["Parity Technologies "] -edition = "2018" - -[dependencies] -futures = "0.3.12" -thiserror = "1.0.23" -tracing = "0.1.25" - -polkadot-primitives = { path = "../../../primitives" } -polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" } -polkadot-node-subsystem-util = { path = "../../subsystem-util" } -polkadot-node-network-protocol = { path = "../../network/protocol" } - -[dev-dependencies] -assert_matches = "1.4.0" -env_logger = "0.8.1" -log = "0.4.13" - -sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } -sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" } - -polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } diff --git a/node/network/pov-distribution/src/error.rs b/node/network/pov-distribution/src/error.rs deleted file mode 100644 index 754c1e56c523..000000000000 --- a/node/network/pov-distribution/src/error.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. 
- -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! The `Error` and `Result` types used by the subsystem. - -use thiserror::Error; - -#[derive(Debug, Error)] -pub enum Error { - #[error(transparent)] - Subsystem(#[from] polkadot_subsystem::SubsystemError), - #[error(transparent)] - OneshotRecv(#[from] futures::channel::oneshot::Canceled), - #[error(transparent)] - Runtime(#[from] polkadot_subsystem::errors::RuntimeApiError), - #[error(transparent)] - Util(#[from] polkadot_node_subsystem_util::Error), -} - -pub type Result = std::result::Result; diff --git a/node/network/pov-distribution/src/lib.rs b/node/network/pov-distribution/src/lib.rs deleted file mode 100644 index 585cdcf9e873..000000000000 --- a/node/network/pov-distribution/src/lib.rs +++ /dev/null @@ -1,887 +0,0 @@ -// Copyright 2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! PoV Distribution Subsystem of Polkadot. -//! -//! This is a gossip implementation of code that is responsible for distributing PoVs -//! among validators. 
- -#![deny(unused_crate_dependencies)] -#![warn(missing_docs)] - -use polkadot_primitives::v1::{CandidateDescriptor, CompressedPoV, CoreIndex, CoreState, Hash, Id as ParaId, PoV, ValidatorId}; -use polkadot_subsystem::{ - ActiveLeavesUpdate, OverseerSignal, SubsystemContext, SubsystemResult, SubsystemError, Subsystem, - FromOverseer, SpawnedSubsystem, - jaeger, - messages::{ - PoVDistributionMessage, AllMessages, NetworkBridgeMessage, NetworkBridgeEvent, - }, -}; -use polkadot_node_subsystem_util::{ - validator_discovery, - request_validators_ctx, - request_validator_groups_ctx, - request_availability_cores_ctx, - metrics::{self, prometheus}, -}; -use polkadot_node_network_protocol::{ - peer_set::PeerSet, v1 as protocol_v1, PeerId, OurView, UnifiedReputationChange as Rep, -}; - -use futures::prelude::*; -use futures::channel::oneshot; - -use std::collections::{hash_map::{Entry, HashMap}, HashSet}; -use std::sync::Arc; - -mod error; - -#[cfg(test)] -mod tests; - -const COST_APPARENT_FLOOD: Rep = Rep::CostMajor("Peer appears to be flooding us with PoV requests"); -const COST_UNEXPECTED_POV: Rep = Rep::CostMajor("Peer sent us an unexpected PoV"); -const COST_AWAITED_NOT_IN_VIEW: Rep - = Rep::CostMinor("Peer claims to be awaiting something outside of its view"); - -const BENEFIT_FRESH_POV: Rep = Rep::BenefitMinorFirst("Peer supplied us with an awaited PoV"); -const BENEFIT_LATE_POV: Rep = Rep::BenefitMinor("Peer supplied us with an awaited PoV, \ - but was not the first to do so"); - -const LOG_TARGET: &str = "parachain::pov-distribution"; - -/// The PoV Distribution Subsystem. -pub struct PoVDistribution { - // Prometheus metrics - metrics: Metrics, -} - -impl Subsystem for PoVDistribution - where C: SubsystemContext -{ - fn start(self, ctx: C) -> SpawnedSubsystem { - // Swallow error because failure is fatal to the node and we log with more precision - // within `run`. - let future = self.run(ctx) - .map_err(|e| SubsystemError::with_origin("pov-distribution", e)) - .boxed(); - SpawnedSubsystem { - name: "pov-distribution-subsystem", - future, - } - } -} - -#[derive(Default)] -struct State { - /// A state of things going on on a per-relay-parent basis. - relay_parent_state: HashMap, - - /// Info on peers. - peer_state: HashMap, - - /// Our own view. - our_view: OurView, - - /// Connect to relevant groups of validators at different relay parents. - connection_requests: validator_discovery::ConnectionRequests, - - /// Metrics. - metrics: Metrics, -} - -struct BlockBasedState { - known: HashMap, CompressedPoV)>, - - /// All the PoVs we are or were fetching, coupled with channels expecting the data. - /// - /// This may be an empty list, which indicates that we were once awaiting this PoV but have - /// received it already. - fetching: HashMap>>>, - - n_validators: usize, -} - -#[derive(Default)] -struct PeerState { - /// A set of awaited PoV-hashes for each relay-parent in the peer's view. - awaited: HashMap>, -} - -fn awaiting_message(relay_parent: Hash, awaiting: Vec) - -> protocol_v1::ValidationProtocol -{ - protocol_v1::ValidationProtocol::PoVDistribution( - protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, awaiting) - ) -} - -fn send_pov_message( - relay_parent: Hash, - pov_hash: Hash, - pov: &CompressedPoV, -) -> protocol_v1::ValidationProtocol { - protocol_v1::ValidationProtocol::PoVDistribution( - protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov.clone()) - ) -} - -/// Handles the signal. 
If successful, returns `true` if the subsystem should conclude, -/// `false` otherwise. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_signal( - state: &mut State, - ctx: &mut impl SubsystemContext, - signal: OverseerSignal, -) -> SubsystemResult { - match signal { - OverseerSignal::Conclude => Ok(true), - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated }) => { - let _timer = state.metrics.time_handle_signal(); - - for (relay_parent, span) in activated { - let _span = span.child_builder("pov-dist") - .with_stage(jaeger::Stage::PoVDistribution) - .build(); - - match request_validators_ctx(relay_parent, ctx).await { - Ok(vals_rx) => { - let n_validators = match vals_rx.await? { - Ok(v) => v.len(), - Err(e) => { - tracing::warn!( - target: LOG_TARGET, - err = ?e, - "Error fetching validators from runtime API for active leaf", - ); - - // Not adding bookkeeping here might make us behave funny, but we - // shouldn't take down the node on spurious runtime API errors. - // - // and this is "behave funny" as in be bad at our job, but not in any - // slashable or security-related way. - continue; - } - }; - - state.relay_parent_state.insert(relay_parent, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators, - }); - } - Err(e) => { - // continue here also as above. - tracing::warn!( - target: LOG_TARGET, - err = ?e, - "Error fetching validators from runtime API for active leaf", - ); - } - } - } - - for relay_parent in deactivated { - state.connection_requests.remove_all(&relay_parent); - state.relay_parent_state.remove(&relay_parent); - } - - Ok(false) - } - OverseerSignal::BlockFinalized(..) => Ok(false), - } -} - -/// Notify peers that we are awaiting a given PoV hash. -/// -/// This only notifies peers who have the relay parent in their view. -#[tracing::instrument(level = "trace", skip(peers, ctx), fields(subsystem = LOG_TARGET))] -async fn notify_all_we_are_awaiting( - peers: &mut HashMap, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - pov_hash: Hash, -) { - // We use `awaited` as a proxy for which heads are in the peer's view. - let peers_to_send: Vec<_> = peers.iter() - .filter_map(|(peer, state)| if state.awaited.contains_key(&relay_parent) { - Some(peer.clone()) - } else { - None - }) - .collect(); - - if peers_to_send.is_empty() { - return; - } - - let payload = awaiting_message(relay_parent, vec![pov_hash]); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - peers_to_send, - payload, - ))).await; -} - -/// Notify one peer about everything we're awaiting at a given relay-parent. -#[tracing::instrument(level = "trace", skip(ctx, relay_parent_state), fields(subsystem = LOG_TARGET))] -async fn notify_one_we_are_awaiting_many( - peer: &PeerId, - ctx: &mut impl SubsystemContext, - relay_parent_state: &HashMap, - relay_parent: Hash, -) { - let awaiting_hashes = relay_parent_state.get(&relay_parent).into_iter().flat_map(|s| { - // Send the peer everything we are fetching at this relay-parent - s.fetching.iter() - .filter(|(_, senders)| !senders.is_empty()) // that has not been completed already. 
- .map(|(pov_hash, _)| *pov_hash) - }).collect::>(); - - if awaiting_hashes.is_empty() { - return; - } - - let payload = awaiting_message(relay_parent, awaiting_hashes); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - vec![peer.clone()], - payload, - ))).await; -} - -/// Distribute a PoV to peers who are awaiting it. -#[tracing::instrument(level = "trace", skip(peers, ctx, metrics, pov), fields(subsystem = LOG_TARGET))] -async fn distribute_to_awaiting( - peers: &mut HashMap, - ctx: &mut impl SubsystemContext, - metrics: &Metrics, - relay_parent: Hash, - pov_hash: Hash, - pov: &CompressedPoV, -) { - // Send to all peers who are awaiting the PoV and have that relay-parent in their view. - // - // Also removes it from their awaiting set. - let peers_to_send: Vec<_> = peers.iter_mut() - .filter_map(|(peer, state)| state.awaited.get_mut(&relay_parent).and_then(|awaited| { - if awaited.remove(&pov_hash) { - Some(peer.clone()) - } else { - None - } - })) - .collect(); - - if peers_to_send.is_empty() { return; } - - let payload = send_pov_message(relay_parent, pov_hash, pov); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - peers_to_send, - payload, - ))).await; - - metrics.on_pov_distributed(); -} - -/// Connect to relevant validators in case we are not already. -async fn connect_to_relevant_validators( - connection_requests: &mut validator_discovery::ConnectionRequests, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: &CandidateDescriptor, -) { - let para_id = descriptor.para_id; - if let Ok(Some(relevant_validators)) = - determine_relevant_validators(ctx, relay_parent, para_id).await - { - // We only need one connection request per (relay_parent, para_id) - // so here we take this shortcut to avoid calling `connect_to_validators` - // more than once. - if !connection_requests.contains_request(&relay_parent, para_id) { - tracing::debug!(target: LOG_TARGET, validators=?relevant_validators, "connecting to validators"); - match validator_discovery::connect_to_validators( - ctx, - relay_parent, - relevant_validators, - PeerSet::Validation, - ).await { - Ok(new_connection_request) => { - connection_requests.put(relay_parent, para_id, new_connection_request); - } - Err(e) => { - tracing::debug!( - target: LOG_TARGET, - "Failed to create a validator connection request {:?}", - e, - ); - } - } - } - } -} - -/// Get the Id of the Core that is assigned to the para being collated on if any -/// and the total number of cores. -async fn determine_core( - ctx: &mut impl SubsystemContext, - para_id: ParaId, - relay_parent: Hash, -) -> error::Result> { - let cores = request_availability_cores_ctx(relay_parent, ctx).await?.await??; - - for (idx, core) in cores.iter().enumerate() { - if let CoreState::Scheduled(occupied) = core { - if occupied.para_id == para_id { - return Ok(Some(((idx as u32).into(), cores.len()))); - } - } - } - - Ok(None) -} - -/// Figure out a group of validators assigned to a given `ParaId`. 
-async fn determine_validators_for_core( - ctx: &mut impl SubsystemContext, - core_index: CoreIndex, - num_cores: usize, - relay_parent: Hash, -) -> error::Result>> { - let groups = request_validator_groups_ctx(relay_parent, ctx).await?.await??; - - let group_index = groups.1.group_for_core(core_index, num_cores); - - let connect_to_validators = match groups.0.get(group_index.0 as usize) { - Some(group) => group.clone(), - None => return Ok(None), - }; - - let validators = request_validators_ctx(relay_parent, ctx).await?.await??; - - let validators = connect_to_validators - .into_iter() - .map(|idx| validators[idx.0 as usize].clone()) - .collect(); - - Ok(Some(validators)) -} - -async fn determine_relevant_validators( - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - para_id: ParaId, -) -> error::Result>> { - // Determine which core the para_id is assigned to. - let (core, num_cores) = match determine_core(ctx, para_id, relay_parent).await? { - Some(core) => core, - None => { - tracing::warn!( - target: LOG_TARGET, - "Looks like no core is assigned to {:?} at {:?}", - para_id, - relay_parent, - ); - - return Ok(None); - } - }; - - determine_validators_for_core(ctx, core, num_cores, relay_parent).await -} - -/// Handles a `FetchPoV` message. -#[tracing::instrument(level = "trace", skip(ctx, state, response_sender), fields(subsystem = LOG_TARGET))] -async fn handle_fetch( - state: &mut State, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: CandidateDescriptor, - response_sender: oneshot::Sender>, -) { - let _timer = state.metrics.time_handle_fetch(); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - Some(s) => s, - None => return, - }; - - if let Some((pov, _)) = relay_parent_state.known.get(&descriptor.pov_hash) { - let _ = response_sender.send(pov.clone()); - return; - } - - { - match relay_parent_state.fetching.entry(descriptor.pov_hash) { - Entry::Occupied(mut e) => { - // we are already awaiting this PoV if there is an entry. - e.get_mut().push(response_sender); - return; - } - Entry::Vacant(e) => { - connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await; - e.insert(vec![response_sender]); - } - } - } - - if relay_parent_state.fetching.len() > 2 * relay_parent_state.n_validators { - tracing::warn!( - relay_parent_state.fetching.len = relay_parent_state.fetching.len(), - "other subsystems have requested PoV distribution to fetch more PoVs than reasonably expected", - ); - return; - } - - // Issue an `Awaiting` message to all peers with this in their view. - notify_all_we_are_awaiting( - &mut state.peer_state, - ctx, - relay_parent, - descriptor.pov_hash - ).await -} - -/// Handles a `DistributePoV` message. -#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))] -async fn handle_distribute( - state: &mut State, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: CandidateDescriptor, - pov: Arc, -) { - let _timer = state.metrics.time_handle_distribute(); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - Some(s) => s, - None => return, - }; - - connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await; - - if let Some(our_awaited) = relay_parent_state.fetching.get_mut(&descriptor.pov_hash) { - // Drain all the senders, but keep the entry in the map around intentionally. 
- // - // It signals that we were at one point awaiting this, so we will be able to tell - // why peers are sending it to us. - for response_sender in our_awaited.drain(..) { - let _ = response_sender.send(pov.clone()); - } - } - - let encoded_pov = match CompressedPoV::compress(&*pov) { - Ok(pov) => pov, - Err(error) => { - tracing::debug!( - target: LOG_TARGET, - error = ?error, - "Failed to create `CompressedPov`." - ); - return - } - }; - - distribute_to_awaiting( - &mut state.peer_state, - ctx, - &state.metrics, - relay_parent, - descriptor.pov_hash, - &encoded_pov, - ).await; - - relay_parent_state.known.insert(descriptor.pov_hash, (pov, encoded_pov)); -} - -/// Report a reputation change for a peer. -#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))] -async fn report_peer( - ctx: &mut impl SubsystemContext, - peer: PeerId, - rep: Rep, -) { - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer(peer, rep))).await -} - -/// Handle a notification from a peer that they are awaiting some PoVs. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_awaiting( - state: &mut State, - ctx: &mut impl SubsystemContext, - peer: PeerId, - relay_parent: Hash, - pov_hashes: Vec, -) { - if !state.our_view.contains(&relay_parent) { - report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await; - return; - } - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - None => { - tracing::warn!("PoV Distribution relay parent state out-of-sync with our view"); - return; - } - Some(s) => s, - }; - - let peer_awaiting = match - state.peer_state.get_mut(&peer).and_then(|s| s.awaited.get_mut(&relay_parent)) - { - None => { - report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await; - return; - } - Some(a) => a, - }; - - let will_be_awaited = peer_awaiting.len() + pov_hashes.len(); - if will_be_awaited <= 2 * relay_parent_state.n_validators { - for pov_hash in pov_hashes { - // For all requested PoV hashes, if we have it, we complete the request immediately. - // Otherwise, we note that the peer is awaiting the PoV. - if let Some((_, ref pov)) = relay_parent_state.known.get(&pov_hash) { - let payload = send_pov_message(relay_parent, pov_hash, pov); - - ctx.send_message(AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(vec![peer.clone()], payload) - )).await; - } else { - peer_awaiting.insert(pov_hash); - } - } - } else { - report_peer(ctx, peer, COST_APPARENT_FLOOD).await; - } -} - -/// Handle an incoming PoV from our peer. Reports them if unexpected, rewards them if not. -/// -/// Completes any requests awaiting that PoV. -#[tracing::instrument(level = "trace", skip(ctx, state, encoded_pov), fields(subsystem = LOG_TARGET))] -async fn handle_incoming_pov( - state: &mut State, - ctx: &mut impl SubsystemContext, - peer: PeerId, - relay_parent: Hash, - pov_hash: Hash, - encoded_pov: CompressedPoV, -) { - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - None => { - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - }, - Some(r) => r, - }; - - let pov = match encoded_pov.decompress() { - Ok(pov) => pov, - Err(error) => { - tracing::debug!( - target: LOG_TARGET, - error = ?error, - "Could not extract PoV", - ); - return; - } - }; - - let pov = { - // Do validity checks and complete all senders awaiting this PoV. 
- let fetching = match relay_parent_state.fetching.get_mut(&pov_hash) { - None => { - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - } - Some(f) => f, - }; - - let hash = pov.hash(); - if hash != pov_hash { - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - } - - let pov = Arc::new(pov); - - if fetching.is_empty() { - // fetching is empty whenever we were awaiting something and - // it was completed afterwards. - report_peer(ctx, peer.clone(), BENEFIT_LATE_POV).await; - } else { - // fetching is non-empty when the peer just provided us with data we needed. - report_peer(ctx, peer.clone(), BENEFIT_FRESH_POV).await; - } - - for response_sender in fetching.drain(..) { - let _ = response_sender.send(pov.clone()); - } - - pov - }; - - // make sure we don't consider this peer as awaiting that PoV anymore. - if let Some(peer_state) = state.peer_state.get_mut(&peer) { - peer_state.awaited.remove(&pov_hash); - } - - // distribute the PoV to all other peers who are awaiting it. - distribute_to_awaiting( - &mut state.peer_state, - ctx, - &state.metrics, - relay_parent, - pov_hash, - &encoded_pov, - ).await; - - relay_parent_state.known.insert(pov_hash, (pov, encoded_pov)); -} - -/// Handles a newly or already connected validator in the context of some relay leaf. -fn handle_validator_connected(state: &mut State, peer_id: PeerId) { - state.peer_state.entry(peer_id).or_default(); -} - -/// Handles a network bridge update. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_network_update( - state: &mut State, - ctx: &mut impl SubsystemContext, - update: NetworkBridgeEvent, -) { - let _timer = state.metrics.time_handle_network_update(); - - match update { - NetworkBridgeEvent::PeerConnected(peer, _observed_role) => { - handle_validator_connected(state, peer); - } - NetworkBridgeEvent::PeerDisconnected(peer) => { - state.peer_state.remove(&peer); - } - NetworkBridgeEvent::PeerViewChange(peer_id, view) => { - if let Some(peer_state) = state.peer_state.get_mut(&peer_id) { - // prune anything not in the new view. - peer_state.awaited.retain(|relay_parent, _| view.contains(&relay_parent)); - - // introduce things from the new view. - for relay_parent in view.iter() { - if let Entry::Vacant(entry) = peer_state.awaited.entry(*relay_parent) { - entry.insert(HashSet::new()); - - // Notify the peer about everything we're awaiting at the new relay-parent. - notify_one_we_are_awaiting_many( - &peer_id, - ctx, - &state.relay_parent_state, - *relay_parent, - ).await; - } - } - } - - } - NetworkBridgeEvent::PeerMessage(peer, message) => { - match message { - protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, pov_hashes) - => handle_awaiting( - state, - ctx, - peer, - relay_parent, - pov_hashes, - ).await, - protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov) - => handle_incoming_pov( - state, - ctx, - peer, - relay_parent, - pov_hash, - pov, - ).await, - } - } - NetworkBridgeEvent::OurViewChange(view) => { - state.our_view = view; - } - } -} - -impl PoVDistribution { - /// Create a new instance of `PovDistribution`. 
- pub fn new(metrics: Metrics) -> Self { - Self { metrics } - } - - #[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))] - async fn run( - self, - ctx: impl SubsystemContext, - ) -> SubsystemResult<()> { - self.run_with_state(ctx, State::default()).await - } - - async fn run_with_state( - self, - mut ctx: impl SubsystemContext, - mut state: State, - ) -> SubsystemResult<()> { - state.metrics = self.metrics; - - loop { - // `select_biased` is used since receiving connection notifications and - // peer view update messages may be racy and we want connection notifications - // first. - futures::select_biased! { - v = state.connection_requests.next().fuse() => handle_validator_connected(&mut state, v.peer_id), - v = ctx.recv().fuse() => { - match v? { - FromOverseer::Signal(signal) => if handle_signal( - &mut state, - &mut ctx, - signal, - ).await? { - return Ok(()); - } - FromOverseer::Communication { msg } => match msg { - PoVDistributionMessage::FetchPoV(relay_parent, descriptor, response_sender) => - handle_fetch( - &mut state, - &mut ctx, - relay_parent, - descriptor, - response_sender, - ).await, - PoVDistributionMessage::DistributePoV(relay_parent, descriptor, pov) => - handle_distribute( - &mut state, - &mut ctx, - relay_parent, - descriptor, - pov, - ).await, - PoVDistributionMessage::NetworkBridgeUpdateV1(event) => - handle_network_update( - &mut state, - &mut ctx, - event, - ).await, - } - } - } - } - } - } -} - - - -#[derive(Clone)] -struct MetricsInner { - povs_distributed: prometheus::Counter, - handle_signal: prometheus::Histogram, - handle_fetch: prometheus::Histogram, - handle_distribute: prometheus::Histogram, - handle_network_update: prometheus::Histogram, -} - -/// Availability Distribution metrics. -#[derive(Default, Clone)] -pub struct Metrics(Option); - -impl Metrics { - fn on_pov_distributed(&self) { - if let Some(metrics) = &self.0 { - metrics.povs_distributed.inc(); - } - } - - /// Provide a timer for `handle_signal` which observes on drop. - fn time_handle_signal(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_signal.start_timer()) - } - - /// Provide a timer for `handle_fetch` which observes on drop. - fn time_handle_fetch(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_fetch.start_timer()) - } - - /// Provide a timer for `handle_distribute` which observes on drop. - fn time_handle_distribute(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_distribute.start_timer()) - } - - /// Provide a timer for `handle_network_update` which observes on drop. - fn time_handle_network_update(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_network_update.start_timer()) - } -} - -impl metrics::Metrics for Metrics { - fn try_register(registry: &prometheus::Registry) -> std::result::Result { - let metrics = MetricsInner { - povs_distributed: prometheus::register( - prometheus::Counter::new( - "parachain_povs_distributed_total", - "Number of PoVs distributed to other peers." 
- )?, - registry, - )?, - handle_signal: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_signal", - "Time spent within `pov_distribution::handle_signal`", - ) - )?, - registry, - )?, - handle_fetch: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_fetch", - "Time spent within `pov_distribution::handle_fetch`", - ) - )?, - registry, - )?, - handle_distribute: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_distribute", - "Time spent within `pov_distribution::handle_distribute`", - ) - )?, - registry, - )?, - handle_network_update: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_network_update", - "Time spent within `pov_distribution::handle_network_update`", - ) - )?, - registry, - )?, - }; - Ok(Metrics(Some(metrics))) - } -} diff --git a/node/network/pov-distribution/src/tests.rs b/node/network/pov-distribution/src/tests.rs deleted file mode 100644 index 6d07eac6b5d1..000000000000 --- a/node/network/pov-distribution/src/tests.rs +++ /dev/null @@ -1,1570 +0,0 @@ -// Copyright 2020-2021 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . 
- -use super::*; - -use std::{time::Duration, sync::Arc}; - -use assert_matches::assert_matches; -use futures::executor; -use tracing::trace; - -use sp_keyring::Sr25519Keyring; - -use polkadot_primitives::v1::{AuthorityDiscoveryId, BlockData, CoreState, GroupRotationInfo, Id as ParaId, ScheduledCore, SessionIndex, SessionInfo, ValidatorIndex}; -use polkadot_subsystem::{messages::{RuntimeApiMessage, RuntimeApiRequest}, jaeger}; -use polkadot_node_subsystem_test_helpers as test_helpers; -use polkadot_node_subsystem_util::TimeoutExt; -use polkadot_node_network_protocol::{view, our_view}; - -fn make_pov(data: Vec) -> PoV { - PoV { block_data: BlockData(data) } -} - -fn make_peer_state(awaited: Vec<(Hash, Vec)>) - -> PeerState -{ - PeerState { - awaited: awaited.into_iter().map(|(rp, h)| (rp, h.into_iter().collect())).collect() - } -} - -fn validator_pubkeys(val_ids: &[Sr25519Keyring]) -> Vec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -type VirtualOverseer = test_helpers::TestSubsystemContextHandle; - -struct TestHarness { - virtual_overseer: VirtualOverseer, -} - -fn test_harness>( - state: State, - test: impl FnOnce(TestHarness) -> T, -) { - let _ = env_logger::builder() - .is_test(true) - .filter( - Some("polkadot_pov_distribution"), - log::LevelFilter::Trace, - ) - .filter( - Some(LOG_TARGET), - log::LevelFilter::Trace, - ) - .try_init(); - - let pool = sp_core::testing::TaskExecutor::new(); - - let (context, virtual_overseer) = test_helpers::make_subsystem_context(pool.clone()); - - let subsystem = super::PoVDistribution::new(Metrics::default()); - - let subsystem = subsystem.run_with_state(context, state); - - let test_fut = test(TestHarness { virtual_overseer }); - - futures::pin_mut!(test_fut); - futures::pin_mut!(subsystem); - - executor::block_on(future::select(test_fut, subsystem)); -} - -const TIMEOUT: Duration = Duration::from_millis(100); - -async fn overseer_send( - overseer: &mut VirtualOverseer, - msg: PoVDistributionMessage, -) { - trace!("Sending message:\n{:?}", &msg); - overseer - .send(FromOverseer::Communication { msg }) - .timeout(TIMEOUT) - .await - .expect(&format!("{:?} is more than enough for sending messages.", TIMEOUT)); -} - -async fn overseer_recv( - overseer: &mut VirtualOverseer, -) -> AllMessages { - let msg = overseer_recv_with_timeout(overseer, TIMEOUT) - .await - .expect(&format!("{:?} is more than enough to receive messages", TIMEOUT)); - - trace!("Received message:\n{:?}", &msg); - - msg -} - -async fn overseer_recv_with_timeout( - overseer: &mut VirtualOverseer, - timeout: Duration, -) -> Option { - trace!("Waiting for message..."); - overseer - .recv() - .timeout(timeout) - .await -} - -async fn overseer_signal( - overseer: &mut VirtualOverseer, - signal: OverseerSignal, -) { - overseer - .send(FromOverseer::Signal(signal)) - .timeout(TIMEOUT) - .await - .expect(&format!("{:?} is more than enough for sending signals.", TIMEOUT)); -} - -#[derive(Clone)] -struct TestState { - chain_ids: Vec, - validators: Vec, - validator_public: Vec, - validator_authority_id: Vec, - validator_peer_id: Vec, - validator_groups: (Vec>, GroupRotationInfo), - relay_parent: Hash, - availability_cores: Vec, - session_index: SessionIndex, -} - -impl Default for TestState { - fn default() -> Self { - let chain_a = ParaId::from(1); - let chain_b = ParaId::from(2); - - let chain_ids = vec![chain_a, chain_b]; - - let validators = vec![ - 
Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - ]; - - let validator_public = validator_pubkeys(&validators); - let validator_authority_id = validator_authority_id(&validators); - - let validator_peer_id = std::iter::repeat_with(|| PeerId::random()) - .take(validator_public.len()) - .collect(); - - let validator_groups = vec![vec![2, 0, 4], vec![1], vec![3]] - .into_iter().map(|g| g.into_iter().map(ValidatorIndex).collect()).collect(); - let group_rotation_info = GroupRotationInfo { - session_start_block: 0, - group_rotation_frequency: 100, - now: 1, - }; - let validator_groups = (validator_groups, group_rotation_info); - - let availability_cores = vec![ - CoreState::Scheduled(ScheduledCore { - para_id: chain_ids[0], - collator: None, - }), - CoreState::Scheduled(ScheduledCore { - para_id: chain_ids[1], - collator: None, - }), - ]; - - let relay_parent = Hash::repeat_byte(0x05); - - Self { - chain_ids, - validators, - validator_public, - validator_authority_id, - validator_peer_id, - validator_groups, - relay_parent, - availability_cores, - session_index: 1, - } - } -} - -async fn test_validator_discovery( - virtual_overseer: &mut VirtualOverseer, - expected_relay_parent: Hash, - session_index: SessionIndex, - validator_ids: &[ValidatorId], - discovery_ids: &[AuthorityDiscoveryId], - validator_group: &[ValidatorIndex], -) { - assert_matches!( - overseer_recv(virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::SessionIndexForChild(tx), - )) => { - assert_eq!(relay_parent, expected_relay_parent); - tx.send(Ok(session_index)).unwrap(); - } - ); - - assert_matches!( - overseer_recv(virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::SessionInfo(index, tx), - )) => { - assert_eq!(relay_parent, expected_relay_parent); - assert_eq!(index, session_index); - - let validators = validator_group.iter() - .map(|idx| validator_ids[idx.0 as usize].clone()) - .collect(); - - let discovery_keys = validator_group.iter() - .map(|idx| discovery_ids[idx.0 as usize].clone()) - .collect(); - - tx.send(Ok(Some(SessionInfo { - validators, - discovery_keys, - ..Default::default() - }))).unwrap(); - } - ); -} - -#[test] -fn ask_validators_for_povs() { - let test_state = TestState::default(); - - test_harness(State::default(), |test_harness| async move { - let mut virtual_overseer = test_harness.virtual_overseer; - - let pov_block = PoV { - block_data: BlockData(vec![42, 43, 44]), - }; - - let pov_hash = pov_block.hash(); - - let mut candidate = CandidateDescriptor::default(); - - let current = test_state.relay_parent.clone(); - candidate.para_id = test_state.chain_ids[0]; - candidate.pov_hash = pov_hash; - candidate.relay_parent = test_state.relay_parent; - - overseer_signal( - &mut virtual_overseer, - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { - activated: [(test_state.relay_parent, Arc::new(jaeger::Span::Disabled))][..].into(), - deactivated: [][..].into(), - }), - ).await; - - // first subsystem will try to obtain validators. 
- assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - let (tx, pov_fetch_result) = oneshot::channel(); - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::FetchPoV(test_state.relay_parent.clone(), candidate, tx), - ).await; - - // obtain the availability cores. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.availability_cores.clone())).unwrap(); - } - ); - - // Obtain the validator groups - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::ValidatorGroups(tx) - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.validator_groups.clone())).unwrap(); - } - ); - - // obtain the validators per relay parent - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - test_validator_discovery( - &mut virtual_overseer, - current, - test_state.session_index, - &test_state.validator_public, - &test_state.validator_authority_id, - &test_state.validator_groups.0[0], - ).await; - - // We now should connect to our validator group. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ConnectToValidators { - validator_ids, - mut connected, - .. 
- } - ) => { - assert_eq!(validator_ids.len(), 3); - assert!(validator_ids.iter().all(|id| test_state.validator_authority_id.contains(id))); - - let result = vec![ - (test_state.validator_authority_id[2].clone(), test_state.validator_peer_id[2].clone()), - (test_state.validator_authority_id[0].clone(), test_state.validator_peer_id[0].clone()), - (test_state.validator_authority_id[4].clone(), test_state.validator_peer_id[4].clone()), - ]; - - result.into_iter().for_each(|r| connected.try_send(r).unwrap()); - } - ); - - for i in vec![2, 0, 4] { - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::NetworkBridgeUpdateV1( - NetworkBridgeEvent::PeerViewChange( - test_state.validator_peer_id[i].clone(), - view![current], - ) - ) - ).await; - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - to_peers, - payload, - )) => { - assert_eq!(to_peers, vec![test_state.validator_peer_id[i].clone()]); - assert_eq!(payload, awaiting_message(current.clone(), vec![pov_hash.clone()])); - } - ); - } - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::NetworkBridgeUpdateV1( - NetworkBridgeEvent::PeerMessage( - test_state.validator_peer_id[2].clone(), - protocol_v1::PoVDistributionMessage::SendPoV( - current, - pov_hash, - CompressedPoV::compress(&pov_block).unwrap(), - ), - ) - ) - ).await; - - assert_eq!(*pov_fetch_result.await.unwrap(), pov_block); - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer(id, benefit)) => { - assert_eq!(benefit, BENEFIT_FRESH_POV); - assert_eq!(id, test_state.validator_peer_id[2].clone()); - } - ); - - // Now let's test that if some peer is ahead of us we would still - // send `Await` on `FetchPoV` message to it. - let next_leaf = Hash::repeat_byte(10); - - // A validator's view changes and now is lets say ahead of us. - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::NetworkBridgeUpdateV1( - NetworkBridgeEvent::PeerViewChange( - test_state.validator_peer_id[2].clone(), - view![next_leaf], - ) - ) - ).await; - - let pov_block = PoV { - block_data: BlockData(vec![45, 46, 47]), - }; - - let pov_hash = pov_block.hash(); - - let candidate = CandidateDescriptor { - para_id: test_state.chain_ids[0], - pov_hash, - relay_parent: next_leaf.clone(), - ..Default::default() - }; - - let (tx, _pov_fetch_result) = oneshot::channel(); - - overseer_signal( - &mut virtual_overseer, - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { - activated: [(next_leaf, Arc::new(jaeger::Span::Disabled))][..].into(), - deactivated: [current.clone()][..].into(), - }) - ).await; - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::FetchPoV(next_leaf.clone(), candidate, tx), - ).await; - - // Obtain the availability cores. 
- assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.availability_cores.clone())).unwrap(); - } - ); - - // Obtain the validator groups - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::ValidatorGroups(tx) - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.validator_groups.clone())).unwrap(); - } - ); - - // obtain the validators per relay parent - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - // obtain the validator_id to authority_id mapping - test_validator_discovery( - &mut virtual_overseer, - next_leaf, - test_state.session_index, - &test_state.validator_public, - &test_state.validator_authority_id, - &test_state.validator_groups.0[0], - ).await; - - // We now should connect to our validator group. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ConnectToValidators { - validator_ids, - mut connected, - .. - } - ) => { - assert_eq!(validator_ids.len(), 3); - assert!(validator_ids.iter().all(|id| test_state.validator_authority_id.contains(id))); - - let result = vec![ - (test_state.validator_authority_id[2].clone(), test_state.validator_peer_id[2].clone()), - (test_state.validator_authority_id[0].clone(), test_state.validator_peer_id[0].clone()), - (test_state.validator_authority_id[4].clone(), test_state.validator_peer_id[4].clone()), - ]; - - result.into_iter().for_each(|r| connected.try_send(r).unwrap()); - } - ); - - // We already know that the leaf in question in the peer's view so we request - // a chunk from them right away. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - to_peers, - payload, - )) => { - assert_eq!(to_peers, vec![test_state.validator_peer_id[2].clone()]); - assert_eq!(payload, awaiting_message(next_leaf.clone(), vec![pov_hash.clone()])); - } - ); - }); -} - -#[test] -fn distributes_to_those_awaiting_and_completes_local() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - let peer_c = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - b.fetching.insert(pov_hash, vec![pov_send]); - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A has hash_a in its view and is awaiting the PoV. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![pov_hash])]), - ); - - // peer B has hash_a in its view but is not awaiting. - s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - // peer C doesn't have hash_a in its view but is awaiting the PoV under hash_b. 
- s.insert( - peer_c.clone(), - make_peer_state(vec![(hash_b, vec![pov_hash])]), - ); - - s - }, - our_view: our_view![hash_a, hash_b], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let mut descriptor = CandidateDescriptor::default(); - descriptor.pov_hash = pov_hash; - - test_harness(state, |test_harness| async move { - let mut virtual_overseer = test_harness.virtual_overseer; - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::DistributePoV( - hash_a, - descriptor, - Arc::new(pov.clone()) - ) - ).await; - - // Let's assume runtime call failed and we're already connected to the peers. - assert_matches!( - virtual_overseer.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Err("nope".to_string().into())).unwrap(); - } - ); - - // our local sender also completed - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - virtual_overseer.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(peers, message) - ) => { - assert_eq!(peers, vec![peer_a.clone()]); - assert_eq!( - message, - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ); - } - ) - }); -} - - -#[test] -fn we_inform_peers_with_same_view_we_are_awaiting() { - - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - - let (pov_send, _) = oneshot::channel(); - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A has hash_a in its view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - // peer B doesn't have hash_a in its view. 
- s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_b, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - let mut descriptor = CandidateDescriptor::default(); - descriptor.pov_hash = pov_hash; - - let para_id_1 = ParaId::from(1); - let para_id_2 = ParaId::from(2); - - descriptor.para_id = para_id_1; - - let availability_cores = vec![ - CoreState::Scheduled(ScheduledCore { - para_id: para_id_1, - collator: None, - }), - CoreState::Scheduled(ScheduledCore { - para_id: para_id_2, - collator: None, - }), - ]; - - let validators = vec![ - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - ]; - - let validator_authority_id = validator_authority_id(&validators); - let validators = validator_pubkeys(&validators); - - let validator_peer_id: Vec<_> = std::iter::repeat_with(|| PeerId::random()) - .take(validators.len()) - .collect(); - - let validator_groups = vec![vec![2, 0, 4], vec![1], vec![3]] - .into_iter().map(|g| g.into_iter().map(ValidatorIndex).collect()).collect(); - let group_rotation_info = GroupRotationInfo { - session_start_block: 0, - group_rotation_frequency: 100, - now: 1, - }; - - let validator_groups = (validator_groups, group_rotation_info); - - executor::block_on(async move { - let handle_future = handle_fetch( - &mut state, - &mut ctx, - hash_a, - descriptor, - pov_send, - ); - - let check_future = async move { - //assert_eq!(state.relay_parent_state[&hash_a].fetching[&pov_hash].len(), 1); - assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Ok(availability_cores)).unwrap(); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::ValidatorGroups(tx) - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Ok(validator_groups.clone())).unwrap(); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Ok(validators.clone())).unwrap(); - } - ); - - test_validator_discovery( - &mut handle, - hash_a, - 1, - &validators, - &validator_authority_id, - &validator_groups.0[0], - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ConnectToValidators { - validator_ids, - mut connected, - .. 
- } - ) => { - assert_eq!(validator_ids.len(), 3); - assert!(validator_ids.iter().all(|id| validator_authority_id.contains(id))); - - let result = vec![ - (validator_authority_id[2].clone(), validator_peer_id[2].clone()), - (validator_authority_id[0].clone(), validator_peer_id[0].clone()), - (validator_authority_id[4].clone(), validator_peer_id[4].clone()), - ]; - - result.into_iter().for_each(|r| connected.try_send(r).unwrap()); - } - ); - - }; - - futures::join!(handle_future, check_future); - }); -} - -#[test] -fn peer_view_change_leads_to_us_informing() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let (pov_a_send, _) = oneshot::channel(); - - let pov_a = make_pov(vec![1, 2, 3]); - let pov_a_hash = pov_a.hash(); - - let pov_b = make_pov(vec![4, 5, 6]); - let pov_b_hash = pov_b.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov_a is still being fetched, whereas the fetch of pov_b has already - // completed, as implied by the empty vector. - b.fetching.insert(pov_a_hash, vec![pov_a_send]); - b.fetching.insert(pov_b_hash, vec![]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A doesn't yet have hash_a in its view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_b, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerViewChange(peer_a.clone(), view![hash_a, hash_b]), - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(peers, message) - ) => { - assert_eq!(peers, vec![peer_a.clone()]); - assert_eq!( - message, - awaiting_message(hash_a, vec![pov_a_hash]), - ); - } - ) - }); -} - -#[test] -fn peer_complete_fetch_and_is_rewarded() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peers A and B are functionally the same. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request before peer B. 
- handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_b.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, BENEFIT_FRESH_POV); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, BENEFIT_LATE_POV); - } - ); - }); -} - -#[test] -fn peer_punished_for_sending_bad_pov() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - - let (pov_send, _) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let bad_pov = make_pov(vec![6, 6, 6]); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request: right relay parent, awaited hash, wrong PoV. - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&bad_pov).unwrap()), - ).focus().unwrap(), - ).await; - - // didn't complete our sender. - assert_eq!(state.relay_parent_state[&hash_a].fetching[&pov_hash].len(), 1); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_UNEXPECTED_POV); - } - ); - }); -} - -#[test] -fn peer_punished_for_sending_unexpected_pov() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request: right relay parent, awaited hash, wrong PoV. 
- handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_UNEXPECTED_POV); - } - ); - }); -} - -#[test] -fn peer_punished_for_sending_pov_out_of_our_view() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request: right relay parent, awaited hash, wrong PoV. - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_b, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_UNEXPECTED_POV); - } - ); - }); -} - -#[test] -fn peer_reported_for_awaiting_too_much() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - let n_validators = 10; - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - let max_plausibly_awaited = n_validators * 2; - - // The peer awaits a plausible (albeit unlikely) amount of PoVs. - for i in 0..max_plausibly_awaited { - let pov_hash = make_pov(vec![i as u8; 32]).hash(); - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_a, vec![pov_hash]), - ).focus().unwrap(), - ).await; - } - - assert_eq!(state.peer_state[&peer_a].awaited[&hash_a].len(), max_plausibly_awaited); - - // The last straw: - let last_pov_hash = make_pov(vec![max_plausibly_awaited as u8; 32]).hash(); - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_a, vec![last_pov_hash]), - ).focus().unwrap(), - ).await; - - // No more bookkeeping for you! 
- assert_eq!(state.peer_state[&peer_a].awaited[&hash_a].len(), max_plausibly_awaited); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_APPARENT_FLOOD); - } - ); - }); -} - -#[test] -fn peer_reported_for_awaiting_outside_their_view() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - s.insert(hash_a, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }); - - s.insert(hash_b, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }); - - s - }, - peer_state: { - let mut s = HashMap::new(); - - // Peer has only hash A in its view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a, hash_b], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - let pov_hash = make_pov(vec![1, 2, 3]).hash(); - - // Hash B is in our view but not the peer's - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_b, vec![pov_hash]), - ).focus().unwrap(), - ).await; - - assert!(state.peer_state[&peer_a].awaited.get(&hash_b).is_none()); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_AWAITED_NOT_IN_VIEW); - } - ); - }); -} - -#[test] -fn peer_reported_for_awaiting_outside_our_view() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - s.insert(hash_a, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }); - - s - }, - peer_state: { - let mut s = HashMap::new(); - - // Peer has hashes A and B in their view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![]), (hash_b, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - let pov_hash = make_pov(vec![1, 2, 3]).hash(); - - // Hash B is in peer's view but not ours. - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_b, vec![pov_hash]), - ).focus().unwrap(), - ).await; - - // Illegal `awaited` is ignored. 
- assert!(state.peer_state[&peer_a].awaited[&hash_b].is_empty()); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_AWAITED_NOT_IN_VIEW); - } - ); - }); -} - -#[test] -fn peer_complete_fetch_leads_to_us_completing_others() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - // peer B is awaiting peer A's request. - s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_a, vec![pov_hash])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, BENEFIT_FRESH_POV); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(peers, message) - ) => { - assert_eq!(peers, vec![peer_b.clone()]); - assert_eq!( - message, - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ); - } - ); - - assert!(!state.peer_state[&peer_b].awaited[&hash_a].contains(&pov_hash)); - }); -} - -#[test] -fn peer_completing_request_no_longer_awaiting() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A is registered as awaiting. 
- s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![pov_hash])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, BENEFIT_FRESH_POV); - } - ); - - // We received the PoV from peer A, so we do not consider it awaited by peer A anymore. - assert!(!state.peer_state[&peer_a].awaited[&hash_a].contains(&pov_hash)); - }); -} diff --git a/node/service/Cargo.toml b/node/service/Cargo.toml index 36b84d909c53..dfcda817e531 100644 --- a/node/service/Cargo.toml +++ b/node/service/Cargo.toml @@ -93,7 +93,6 @@ polkadot-node-core-candidate-validation = { path = "../core/candidate-validation polkadot-node-core-chain-api = { path = "../core/chain-api", optional = true } polkadot-node-core-provisioner = { path = "../core/provisioner", optional = true } polkadot-node-core-runtime-api = { path = "../core/runtime-api", optional = true } -polkadot-pov-distribution = { path = "../network/pov-distribution", optional = true } polkadot-statement-distribution = { path = "../network/statement-distribution", optional = true } polkadot-approval-distribution = { path = "../network/approval-distribution", optional = true } polkadot-node-core-approval-voting = { path = "../core/approval-voting", optional = true } @@ -139,7 +138,6 @@ real-overseer = [ "polkadot-node-core-chain-api", "polkadot-node-core-provisioner", "polkadot-node-core-runtime-api", - "polkadot-pov-distribution", "polkadot-statement-distribution", "polkadot-approval-distribution", ] From a7fc368d288cd060d6d4ef9a84db7afb306c552b Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 18 Mar 2021 10:46:51 +0100 Subject: [PATCH 04/30] WIP: Fetch PoV directly from backing. 
--- Cargo.lock | 2 + node/core/backing/Cargo.toml | 1 + node/core/backing/src/lib.rs | 107 ++++++++++++++++++++++++----------- 3 files changed, 76 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b7ccf112e49..fb8426da037c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5493,6 +5493,7 @@ dependencies = [ "bitvec", "futures 0.3.12", "polkadot-erasure-coding", + "polkadot-node-network-protocol", "polkadot-node-primitives", "polkadot-node-subsystem", "polkadot-node-subsystem-test-helpers", @@ -5655,6 +5656,7 @@ dependencies = [ "polkadot-primitives", "sc-network", "strum", + "thiserror", ] [[package]] diff --git a/node/core/backing/Cargo.toml b/node/core/backing/Cargo.toml index a6b04c03ae92..8538fef0a43f 100644 --- a/node/core/backing/Cargo.toml +++ b/node/core/backing/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" futures = "0.3.12" sp-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" } polkadot-primitives = { path = "../../../primitives" } +polkadot-node-network-protocol = { path = "../../network/protocol" } polkadot-node-primitives = { path = "../../primitives" } polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" } polkadot-node-subsystem-util = { path = "../../subsystem-util" } diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index 96fb71021dad..203f7ac753c9 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -34,15 +34,7 @@ use polkadot_primitives::v1::{ use polkadot_node_primitives::{ Statement, SignedFullStatement, ValidationResult, }; -use polkadot_subsystem::{ - PerLeafSpan, Stage, - jaeger, - messages::{ - AllMessages, AvailabilityStoreMessage, CandidateBackingMessage, CandidateSelectionMessage, - CandidateValidationMessage, PoVDistributionMessage, ProvisionableData, - ProvisionerMessage, StatementDistributionMessage, ValidationFailed, RuntimeApiRequest, - }, -}; +use polkadot_subsystem::{PerLeafSpan, Stage, errors::RuntimeApiError, jaeger, messages::{AllMessages, AvailabilityStoreMessage, CandidateBackingMessage, CandidateSelectionMessage, CandidateValidationMessage, IfDisconnected, NetworkBridgeMessage, PoVDistributionMessage, ProvisionableData, ProvisionerMessage, RuntimeApiRequest, StatementDistributionMessage, ValidationFailed}}; use polkadot_node_subsystem_util::{ self as util, request_session_index_for_child, @@ -54,6 +46,10 @@ use polkadot_node_subsystem_util::{ FromJobCommand, metrics::{self, prometheus}, }; +use polkadot_node_subsystem_util::Error as UtilError; +use polkadot_node_network_protocol::request_response::{Recipient, Requests, request::{OutgoingRequest, RequestError}, v1::{ + PoVFetchingRequest, PoVFetchingResponse + }}; use statement_table::{ generic::AttestedCandidate as TableAttestedCandidate, Context as TableContextTrait, @@ -65,6 +61,7 @@ use statement_table::{ }, }; use thiserror::Error; +use util::request_session_info; const LOG_TARGET: &str = "parachain::candidate-backing"; @@ -76,8 +73,18 @@ enum Error { InvalidSignature, #[error("Failed to send candidates {0:?}")] Send(Vec), + #[error("FetchPoV request error")] + FetchPoV(#[source] RequestError), #[error("FetchPoV channel closed before receipt")] - FetchPoV(#[source] oneshot::Canceled), + FetchPoVCanceled(#[source] oneshot::Canceled), + #[error("FetchPoV runtime request failed in utilities")] + FetchPoVUtil(#[source] UtilError), + #[error("FetchPoV runtime request failed")] + FetchPoVRuntime(#[source] RuntimeApiError), + #[error("FetchPoV could not find session")] + 
FetchPoVNoSuchSession, + #[error("FetchPoV could not find index of signing validator")] + FetchPoVInvalidValidatorIndex, #[error("ValidateFromChainState channel closed before receipt")] ValidateFromChainState(#[source] oneshot::Canceled), #[error("StoreAvailableData channel closed before receipt")] @@ -94,6 +101,14 @@ enum Error { UtilError(#[from] util::Error), } +/// PoV data to validate. +enum PoVData { + /// Allready available (from candidate selection). + Ready(Arc), + /// Needs to be fetched from validator (we are checking a signed statement). + FetchFromValidator(ValidatorIndex), +} + enum ValidatedCandidateCommand { // We were instructed to second the candidate. Second(BackgroundValidationResult), @@ -336,18 +351,46 @@ async fn make_pov_available( Ok(Ok(())) } -async fn request_pov_from_distribution( +async fn request_pov( tx_from: &mut mpsc::Sender, parent: Hash, descriptor: CandidateDescriptor, + from_validator: ValidatorIndex, ) -> Result, Error> { - let (tx, rx) = oneshot::channel(); - - tx_from.send(AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(parent, descriptor, tx) - ).into()).await?; - - rx.await.map_err(Error::FetchPoV) + let session_index = request_session_index_for_child(parent, &mut tx_from) + .await.map_err(Error::FetchPoVUtil)? + .await.map_err(Error::FetchPoVCanceled)? + .map_err(Error::FetchPoVRuntime)?; + let session_info = request_session_info(parent, session_index, &mut tx_from) + .await.map_err(Error::FetchPoVUtil)? + .await.map_err(Error::FetchPoVCanceled)? + .map_err(Error::FetchPoVRuntime)? + .ok_or(Error::FetchPoVNoSuchSession)?; + let authority_id = session_info.discovery_keys.get(from_validator.0 as _) + .ok_or(Error::FetchPoVInvalidValidatorIndex); + + let (req, pending_response) = OutgoingRequest::new( + Recipient::Authority(authority_id), + PoVFetchingRequest { + relay_parent: parent, + descriptor, + }, + ); + let full_req = Requests::PoVFetching(req); + + tx_from.send(FromJobCommand::SendMessage(AllMessages::NetworkBridge( + NetworkBridgeMessage::SendRequests( + vec![full_req], + IfDisconnected::ImmediateError + ) + ))).await?; + + let response = pending_response.await.map_err(Error::FetchPoV)?; + match response { + PoVFetchingResponse::PoV(compressed) => { + } + PoVFetchingResponse::NoSuchPoV => { + } } async fn request_candidate_validation( @@ -380,7 +423,7 @@ struct BackgroundValidationParams { tx_command: mpsc::Sender, candidate: CandidateReceipt, relay_parent: Hash, - pov: Option>, + pov: PoVData, validator_index: Option, n_validators: usize, span: Option, @@ -403,13 +446,14 @@ async fn validate_and_make_available( } = params; let pov = match pov { - Some(pov) => pov, - None => { + PoVData::Ready(pov) => pov, + PoVData::FetchFromValidator(validator_index) => { let _span = span.as_ref().map(|s| s.child("request-pov")); - request_pov_from_distribution( + request_pov( &mut tx_from, relay_parent, candidate.descriptor.clone(), + validator_index, ).await? 
} }; @@ -822,6 +866,7 @@ impl CandidateBackingJob { async fn kick_off_validation_work( &mut self, summary: TableSummary, + statement: SignedFullStatement, span: Option, ) -> Result<(), Error> { let candidate_hash = summary.candidate; @@ -860,7 +905,7 @@ impl CandidateBackingJob { tx_command: self.background_validation_tx.clone(), candidate, relay_parent: self.parent, - pov: None, + pov: PoVData::FetchFromValidator(statement.validator_index()), validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, @@ -885,7 +930,11 @@ impl CandidateBackingJob { summary.group_id, ); - self.kick_off_validation_work(summary, span).await?; + self.kick_off_validation_work( + summary, + statement, + span, + ).await?; } } } @@ -986,16 +1035,6 @@ impl CandidateBackingJob { Ok(()) } - async fn distribute_pov( - &mut self, - descriptor: CandidateDescriptor, - pov: Arc, - ) -> Result<(), Error> { - self.tx_from.send(AllMessages::from( - PoVDistributionMessage::DistributePoV(self.parent, descriptor, pov), - ).into()).await.map_err(Into::into) - } - async fn distribute_signed_statement(&mut self, s: SignedFullStatement) -> Result<(), Error> { let smsg = StatementDistributionMessage::Share(self.parent, s); From e03ff75ffc926bd00020b80dff30689ea8f34592 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Fri, 19 Mar 2021 13:40:42 +0100 Subject: [PATCH 05/30] Backing compiles. --- node/core/backing/src/lib.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index 882050d45464..ab8ed125b250 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -29,7 +29,7 @@ use sp_keystore::SyncCryptoStorePtr; use polkadot_primitives::v1::{ AvailableData, BackedCandidate, CandidateCommitments, CandidateDescriptor, CandidateHash, CandidateReceipt, CollatorId, CommittedCandidateReceipt, CoreIndex, CoreState, Hash, Id as ParaId, - PoV, SigningContext, ValidatorId, ValidatorIndex, ValidatorSignature, ValidityAttestation, + PoV, SigningContext, ValidatorId, ValidatorIndex, ValidatorSignature, ValidityAttestation, AuthorityDiscoveryId, }; use polkadot_node_primitives::{ Statement, SignedFullStatement, ValidationResult, @@ -85,6 +85,10 @@ enum Error { FetchPoVNoSuchSession, #[error("FetchPoV could not find index of signing validator")] FetchPoVInvalidValidatorIndex, + #[error("Invalid response")] + FetchPoVInvalidResponse, + #[error("Backer did not have PoV")] + FetchPoVNoSuchPoV, #[error("ValidateFromChainState channel closed before receipt")] ValidateFromChainState(#[source] oneshot::Canceled), #[error("StoreAvailableData channel closed before receipt")] @@ -357,20 +361,21 @@ async fn request_pov( descriptor: CandidateDescriptor, from_validator: ValidatorIndex, ) -> Result, Error> { - let session_index = request_session_index_for_child(parent, &mut tx_from) + let session_index = request_session_index_for_child(parent, tx_from) .await.map_err(Error::FetchPoVUtil)? .await.map_err(Error::FetchPoVCanceled)? .map_err(Error::FetchPoVRuntime)?; - let session_info = request_session_info(parent, session_index, &mut tx_from) + let session_info = request_session_info(parent, session_index, tx_from) .await.map_err(Error::FetchPoVUtil)? .await.map_err(Error::FetchPoVCanceled)? .map_err(Error::FetchPoVRuntime)? 
.ok_or(Error::FetchPoVNoSuchSession)?; - let authority_id = session_info.discovery_keys.get(from_validator.0 as _) - .ok_or(Error::FetchPoVInvalidValidatorIndex); + let authority_id = session_info.discovery_keys + .get(from_validator.0 as usize) + .ok_or(Error::FetchPoVInvalidValidatorIndex)?; let (req, pending_response) = OutgoingRequest::new( - Recipient::Authority(authority_id), + Recipient::Authority(authority_id.clone()), PoVFetchingRequest { relay_parent: parent, descriptor, @@ -386,11 +391,15 @@ async fn request_pov( ))).await?; let response = pending_response.await.map_err(Error::FetchPoV)?; - match response { + let pov = match response { PoVFetchingResponse::PoV(compressed) => { + compressed.decompress().map_err(|_| Error::FetchPoVInvalidResponse)? } PoVFetchingResponse::NoSuchPoV => { + return Err(Error::FetchPoVNoSuchPoV) } + }; + Ok(Arc::new(pov)) } async fn request_candidate_validation( @@ -588,7 +597,6 @@ impl CandidateBackingJob { ).await? { self.issue_candidate_seconded_message(stmt).await?; } - self.distribute_pov(candidate.descriptor, pov).await?; } } Err(candidate) => { @@ -689,7 +697,7 @@ impl CandidateBackingJob { tx_command: self.background_validation_tx.clone(), candidate: candidate.clone(), relay_parent: self.parent, - pov: Some(pov), + pov: PoVData::Ready(pov), validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, From a49b4d431b2a58b525df2b67199f734f23017d42 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Tue, 23 Mar 2021 18:48:31 +0100 Subject: [PATCH 06/30] Runtime access and connection management for PoV distribution. --- .../availability-distribution/src/error.rs | 35 ++- .../availability-distribution/src/lib.rs | 35 ++- .../src/pov_requester/mod.rs | 128 +++++++++++ .../src/requester/mod.rs | 35 ++- .../src/responder.rs | 17 +- .../availability-distribution/src/runtime.rs | 199 ++++++++++++++++++ .../src/session_cache.rs | 9 +- .../protocol/src/request_response/v1.rs | 8 +- 8 files changed, 404 insertions(+), 62 deletions(-) create mode 100644 node/network/availability-distribution/src/pov_requester/mod.rs create mode 100644 node/network/availability-distribution/src/runtime.rs diff --git a/node/network/availability-distribution/src/error.rs b/node/network/availability-distribution/src/error.rs index 47c556bd8532..0de80765696b 100644 --- a/node/network/availability-distribution/src/error.rs +++ b/node/network/availability-distribution/src/error.rs @@ -22,9 +22,11 @@ use thiserror::Error; use futures::channel::oneshot; use polkadot_node_subsystem_util::Error as UtilError; -use polkadot_primitives::v1::SessionIndex; +use polkadot_primitives::v1::{CompressedPoVError, SessionIndex}; use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError}; +use crate::LOG_TARGET; + /// Errors of this subsystem. #[derive(Debug, Error)] pub enum Error { @@ -56,24 +58,28 @@ pub enum Error { /// Sending response failed. #[error("Sending a request's response failed.")] SendResponse, -} -/// Error that we should handle gracefully by logging it. -#[derive(Debug)] -pub enum NonFatalError { /// Some request to utility functions failed. /// This can be either `RuntimeRequestCanceled` or `RuntimeApiError`. + #[error("Utility request failed")] UtilRequest(UtilError), /// Runtime API subsystem is down, which means we're shutting down. + #[error("Runtime request canceled")] RuntimeRequestCanceled(oneshot::Canceled), /// Some request to the runtime failed. 
/// For example if we prune a block we're requesting info about. + #[error("Runtime API error")] RuntimeRequest(RuntimeApiError), /// We tried fetching a session info which was not available. + #[error("There was no session with the given index")] NoSuchSession(SessionIndex), + + /// Decompressing PoV failed. + #[error("PoV could not be decompressed")] + PoVDecompression(CompressedPoVError), } pub type Result = std::result::Result; @@ -90,9 +96,20 @@ pub(crate) async fn recv_runtime( oneshot::Receiver>, UtilError, >, -) -> std::result::Result { - r.map_err(NonFatalError::UtilRequest)? +) -> std::result::Result { + r.map_err(Error::UtilRequest)? .await - .map_err(NonFatalError::RuntimeRequestCanceled)? - .map_err(NonFatalError::RuntimeRequest) + .map_err(Error::RuntimeRequestCanceled)? + .map_err(Error::RuntimeRequest) +} + + +/// Utility for eating top level errors and log them. +/// +/// We basically always want to try and continue on error. This utility function is meant to +/// consume top-level errors by simply logging them +pub fn log_error(result: Result<()>, ctx: &'static str) { + if let Err(error) = result { + tracing::warn!(target: LOG_TARGET, error = ?error, ctx); + } } diff --git a/node/network/availability-distribution/src/lib.rs b/node/network/availability-distribution/src/lib.rs index db34ba84527c..3d42c557f549 100644 --- a/node/network/availability-distribution/src/lib.rs +++ b/node/network/availability-distribution/src/lib.rs @@ -26,12 +26,20 @@ use polkadot_subsystem::{ /// Error and [`Result`] type for this subsystem. mod error; pub use error::Error; -use error::Result; +use error::{Result, log_error}; + +/// Runtime requests. +mod runtime; +use runtime::Runtime; /// `Requester` taking care of requesting chunks for candidates pending availability. mod requester; use requester::Requester; +/// Handing requests for PoVs during backing. +mod pov_requester; +use pov_requester::PoVRequester; + /// Responding to erasure chunk requests: mod responder; use responder::{answer_chunk_request_log, answer_pov_request_log}; @@ -52,6 +60,8 @@ const LOG_TARGET: &'static str = "parachain::availability-distribution"; pub struct AvailabilityDistributionSubsystem { /// Pointer to a keystore, which is required for determining this nodes validator index. keystore: SyncCryptoStorePtr, + /// Easy and efficient runtime access for this subsystem. + runtime: Runtime, /// Prometheus metrics. metrics: Metrics, } @@ -74,17 +84,20 @@ where } impl AvailabilityDistributionSubsystem { + /// Create a new instance of the availability distribution. pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self { - Self { keystore, metrics } + let runtime = Runtime::new(keystore.clone()); + Self { keystore, runtime, metrics } } /// Start processing work as passed on from the Overseer. - async fn run(self, mut ctx: Context) -> Result<()> + async fn run(mut self, mut ctx: Context) -> Result<()> where Context: SubsystemContext + Sync + Send, { let mut requester = Requester::new(self.keystore.clone(), self.metrics.clone()).fuse(); + let mut pov_requester = PoVRequester::new(); loop { let action = { let mut subsystem_next = ctx.recv().fuse(); @@ -107,14 +120,14 @@ impl AvailabilityDistributionSubsystem { }; match message { FromOverseer::Signal(OverseerSignal::ActiveLeaves(update)) => { - // Update the relay chain heads we are fetching our pieces for: - if let Some(e) = requester - .get_mut() - .update_fetching_heads(&mut ctx, update) - .await? 
- { - tracing::debug!(target: LOG_TARGET, "Error processing ActiveLeavesUpdate: {:?}", e); - } + log_error( + pov_requester.update_connected_validators(&mut ctx, &mut self.runtime, &update).await, + "PoVRequester::update_connected_validators" + ); + log_error( + requester.get_mut().update_fetching_heads(&mut ctx, update).await, + "Error in Requester::update_fetching_heads" + ); } FromOverseer::Signal(OverseerSignal::BlockFinalized(..)) => {} FromOverseer::Signal(OverseerSignal::Conclude) => { diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs new file mode 100644 index 000000000000..912ffa3f8c41 --- /dev/null +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -0,0 +1,128 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! PoV requester takes care of requesting PoVs from validators of a backing group. + +use futures::channel::mpsc; +use lru::LruCache; + +use polkadot_node_network_protocol::{PeerId, peer_set::PeerSet}; +use polkadot_primitives::v1::{AuthorityDiscoveryId, Hash, SessionIndex}; +use polkadot_subsystem::{ActiveLeavesUpdate, SubsystemContext, messages::{AllMessages, NetworkBridgeMessage}}; + +use crate::runtime::Runtime; + +/// Number of sessions we want to keep in the LRU. +const NUM_SESSIONS: usize = 2; + +pub struct PoVRequester { + + /// We only ever care about being connected to validators of at most two sessions. + /// + /// So we keep an LRU for managing connection requests of size 2. + connected_validators: LruCache>, +} + +impl PoVRequester { + /// Create a new requester for PoVs. + pub fn new() -> Self { + Self { + connected_validators: LruCache::new(NUM_SESSIONS), + } + } + + /// Make sure we are connected to the right set of validators. + /// + /// On every `ActiveLeavesUpdate`, we check whether we are connected properly to our current + /// validator group. + pub async fn update_connected_validators( + &mut self, + ctx: &mut Context, + runtime: &mut Runtime, + update: &ActiveLeavesUpdate, + ) -> super::Result<()> + where + Context: SubsystemContext, + { + let activated = update.activated.iter().map(|(h, _)| h); + let activated_sessions = + get_activated_sessions(ctx, runtime, activated).await?; + + for (parent, session_index) in activated_sessions { + if self.connected_validators.contains(&session_index) { + continue + } + self.connected_validators.put( + session_index, + connect_to_relevant_validators(ctx, runtime, parent, session_index).await? 
+ ); + } + Ok(()) + } + +} + +async fn get_activated_sessions(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator) + -> super::Result> +where + Context: SubsystemContext, +{ + let mut sessions = Vec::new(); + for parent in new_heads { + sessions.push((*parent, runtime.get_session_index(ctx, *parent).await?)); + } + Ok(sessions.into_iter()) +} + +async fn connect_to_relevant_validators( + ctx: &mut Context, + runtime: &mut Runtime, + parent: Hash, + session: SessionIndex +) + -> super::Result> +where + Context: SubsystemContext, +{ + let validator_ids = determine_relevant_validators(ctx, runtime, parent, session).await?; + // We don't actually care about `PeerId`s, just keeping receiver so we stay connected: + let (tx, rx) = mpsc::channel(0); + ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators { + validator_ids, peer_set: PeerSet::Validation, connected: tx + })).await; + Ok(rx) +} + +async fn determine_relevant_validators( + ctx: &mut Context, + runtime: &mut Runtime, + parent: Hash, + session: SessionIndex, +) + -> super::Result> +where + Context: SubsystemContext, +{ + let info = runtime.get_session_info_by_index(ctx, parent, session).await?; + if let Some(validator_info) = &info.validator_info { + let indeces = info.session_info.validator_groups.get(validator_info.our_group.0 as usize) + .expect("Our group got retrieved from that session info, it must exist. qed.") + .clone(); + Ok(indeces.into_iter().map(|i| info.session_info.discovery_keys[i.0 as usize].clone()).collect()) + } else { + Ok(Vec::new()) + } +} diff --git a/node/network/availability-distribution/src/requester/mod.rs b/node/network/availability-distribution/src/requester/mod.rs index adbca626d1e4..c167a0f467b0 100644 --- a/node/network/availability-distribution/src/requester/mod.rs +++ b/node/network/availability-distribution/src/requester/mod.rs @@ -40,7 +40,7 @@ use polkadot_subsystem::{ }; use super::{error::recv_runtime, session_cache::SessionCache, LOG_TARGET, Metrics}; -use crate::error::NonFatalError; +use crate::error::Error; /// A task fetching a particular chunk. mod fetch_task; @@ -97,7 +97,7 @@ impl Requester { &mut self, ctx: &mut Context, update: ActiveLeavesUpdate, - ) -> super::Result> + ) -> super::Result<()> where Context: SubsystemContext, { @@ -107,9 +107,9 @@ impl Requester { } = update; // Order important! We need to handle activated, prior to deactivated, otherwise we might // cancel still needed jobs. - let err = self.start_requesting_chunks(ctx, activated.into_iter()).await?; + self.start_requesting_chunks(ctx, activated.into_iter()).await?; self.stop_requesting_chunks(deactivated.into_iter()); - Ok(err) + Ok(()) } /// Start requesting chunks for newly imported heads. @@ -117,20 +117,15 @@ impl Requester { &mut self, ctx: &mut Context, new_heads: impl Iterator)>, - ) -> super::Result> + ) -> super::Result<()> where Context: SubsystemContext, { for (leaf, _) in new_heads { - let cores = match query_occupied_cores(ctx, leaf).await { - Err(err) => return Ok(Some(err)), - Ok(cores) => cores, - }; - if let Some(err) = self.add_cores(ctx, leaf, cores).await? { - return Ok(Some(err)); - } + let cores = query_occupied_cores(ctx, leaf).await?; + self.add_cores(ctx, leaf, cores).await?; } - Ok(None) + Ok(()) } /// Stop requesting chunks for obsolete heads. 
@@ -155,7 +150,7 @@ impl Requester { ctx: &mut Context, leaf: Hash, cores: impl IntoIterator, - ) -> super::Result> + ) -> super::Result<()> where Context: SubsystemContext, { @@ -170,7 +165,7 @@ impl Requester { let tx = self.tx.clone(); let metrics = self.metrics.clone(); - let task_cfg = match self + let task_cfg = self .session_cache .with_session_info( ctx, @@ -180,11 +175,7 @@ impl Requester { leaf, |info| FetchTaskConfig::new(leaf, &core, tx, metrics, info), ) - .await - { - Err(err) => return Ok(Some(err)), - Ok(task_cfg) => task_cfg, - }; + .await?; if let Some(task_cfg) = task_cfg { e.insert(FetchTask::start(task_cfg, ctx).await?); @@ -193,7 +184,7 @@ impl Requester { } } } - Ok(None) + Ok(()) } } @@ -228,7 +219,7 @@ impl Stream for Requester { async fn query_occupied_cores( ctx: &mut Context, relay_parent: Hash, -) -> Result, NonFatalError> +) -> Result, Error> where Context: SubsystemContext, { diff --git a/node/network/availability-distribution/src/responder.rs b/node/network/availability-distribution/src/responder.rs index 07ed4ee9d971..a34199902001 100644 --- a/node/network/availability-distribution/src/responder.rs +++ b/node/network/availability-distribution/src/responder.rs @@ -19,7 +19,7 @@ use futures::channel::oneshot; use polkadot_node_network_protocol::request_response::{request::IncomingRequest, v1}; -use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex}; +use polkadot_primitives::v1::{AvailableData, CandidateHash, CompressedPoV, ErasureChunk, ValidatorIndex}; use polkadot_subsystem::{ messages::{AllMessages, AvailabilityStoreMessage}, SubsystemContext, jaeger, @@ -49,7 +49,7 @@ where err= ?err, "Serving PoV failed with error" ); - metrics.on_served(FAILED); + metrics.on_served_pov(FAILED); } } } @@ -75,7 +75,7 @@ where err= ?err, "Serving chunk failed with error" ); - metrics.on_served(FAILED); + metrics.on_served_chunk(FAILED); } } } @@ -90,12 +90,9 @@ pub async fn answer_pov_request( where Context: SubsystemContext, { - let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request"); - let _child_span = span.child_builder("answer-chunk-request") - .with_chunk_index(req.payload.index.0) - .build(); + let _span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-pov-request"); - let av_data = query_available_data(ctx, req.payload.candidate_hash, req.payload.index).await?; + let av_data = query_available_data(ctx, req.payload.candidate_hash).await?; let result = av_data.is_some(); @@ -111,7 +108,7 @@ where "Failed to create `CompressedPov`", ); // this should really not happen, let this request time out: - return false + return Err(Error::PoVDecompression(error)) } }; v1::PoVFetchingResponse::PoV(pov) @@ -132,7 +129,7 @@ pub async fn answer_chunk_request( where Context: SubsystemContext, { - let mut span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request"); + let span = jaeger::candidate_hash_span(&req.payload.candidate_hash, "answer-request"); let _child_span = span.child_builder("answer-chunk-request") .with_chunk_index(req.payload.index.0) .build(); diff --git a/node/network/availability-distribution/src/runtime.rs b/node/network/availability-distribution/src/runtime.rs new file mode 100644 index 000000000000..c584f0bd7d77 --- /dev/null +++ b/node/network/availability-distribution/src/runtime.rs @@ -0,0 +1,199 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. 
+ +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Convenient interface to the runtime. + +use lru::LruCache; + +use sp_application_crypto::AppKey; +use sp_core::crypto::Public; +use sp_keystore::{CryptoStore, SyncCryptoStorePtr}; + +use polkadot_node_subsystem_util::{ + request_session_index_for_child_ctx, request_session_info_ctx, +}; +use polkadot_primitives::v1::{GroupIndex, Hash, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex}; +use polkadot_subsystem::SubsystemContext; + +use super::{ + error::recv_runtime, + Error, +}; + +/// Caching of session info as needed by availability distribution. +/// +/// It should be ensured that a cached session stays live in the cache as long as we might need it. +pub struct Runtime { + /// Get the session index for a given relay parent. + /// + /// We query this up to a 100 times per block, so caching it here without roundtrips over the + /// overseer seems sensible. + session_index_cache: LruCache, + + /// Look up cached sessions by SessionIndex. + session_info_cache: LruCache, + + /// Key store for determining whether we are a validator and what `ValidatorIndex` we have. + keystore: SyncCryptoStorePtr, +} + +/// SessionInfo with additional useful data for validator nodes. +pub struct ExtendedSessionInfo { + /// Actual session info as fetched from the runtime. + pub session_info: SessionInfo, + /// Contains useful information about ourselves, in case this node is a validator. + pub validator_info: Option, +} + +/// Information about ourself, in case we are an `Authority`. +/// +/// This data is derived from the `SessionInfo` and our key as found in the keystore. +pub struct ValidatorInfo { + /// The index this very validator has in `SessionInfo` vectors. + pub our_index: ValidatorIndex, + /// The group we belong to. + pub our_group: GroupIndex, +} + +impl Runtime { + /// Create a new `Runtime` for convenient runtime fetches. + pub fn new(keystore: SyncCryptoStorePtr) -> Self { + Self { + // 5 relatively conservative, 1 to 2 should suffice: + session_index_cache: LruCache::new(5), + // We need to cache the current and the last session the most: + session_info_cache: LruCache::new(2), + keystore, + } + } + + /// Retrieve the current session index. + pub async fn get_session_index( + &mut self, + ctx: &mut Context, + parent: Hash, + ) -> Result + where + Context: SubsystemContext, + { + match self.session_index_cache.get(&parent) { + Some(index) => Ok(*index), + None => { + let index = + recv_runtime(request_session_index_for_child_ctx(parent, ctx).await) + .await?; + self.session_index_cache.put(parent, index); + Ok(index) + } + } + } + + /// Get `ExtendedSessionInfo` by relay parent hash. 
+ pub async fn get_session_info<'a, Context>( + &'a mut self, + ctx: &mut Context, + parent: Hash, + ) -> Result<&'a ExtendedSessionInfo, Error> + where + Context: SubsystemContext, + { + let session_index = self.get_session_index(ctx, parent).await?; + + self.get_session_info_by_index(ctx, parent, session_index).await + } + + /// Get `ExtendedSessionInfo` by session index. + /// + /// `request_session_info_ctx` still requires the parent to be passed in, so we take the parent + /// in addition to the `SessionIndex`. + pub async fn get_session_info_by_index<'a, Context>( + &'a mut self, + ctx: &mut Context, + parent: Hash, + session_index: SessionIndex, + ) -> Result<&'a ExtendedSessionInfo, Error> + where + Context: SubsystemContext, + { + if !self.session_info_cache.contains(&session_index) { + let session_info = + recv_runtime(request_session_info_ctx(parent, session_index, ctx).await) + .await? + .ok_or(Error::NoSuchSession(session_index))?; + let validator_info = self.get_validator_info(&session_info).await?; + + let full_info = ExtendedSessionInfo { + session_info, + validator_info, + }; + + self.session_info_cache.put(session_index, full_info); + } + Ok( + self.session_info_cache.get(&session_index) + .expect("We just put the value there. qed.") + ) + } + + /// Build `ValidatorInfo` for the current session. + /// + /// + /// Returns: `None` if not a validator. + async fn get_validator_info( + &self, + session_info: &SessionInfo, + ) -> Result, Error> + { + if let Some(our_index) = self.get_our_index(&session_info.validators).await { + // Get our group index: + let our_group = session_info.validator_groups + .iter() + .enumerate() + .find_map(|(i, g)| { + g.iter().find_map(|v| { + if *v == our_index { + Some(GroupIndex(i as u32)) + } else { + None + } + }) + }) + .expect("Every validator should be in a validator group. qed."); + + let info = ValidatorInfo { + our_index, + our_group, + }; + Ok(Some(info)) + } else { + Ok(None) + } + } + + /// Get our `ValidatorIndex`. + /// + /// Returns: None if we are not a validator. + async fn get_our_index(&self, validators: &Vec) -> Option { + for (i, v) in validators.iter().enumerate() { + if CryptoStore::has_keys(&*self.keystore, &[(v.to_raw_vec(), ValidatorId::ID)]) + .await + { + return Some(ValidatorIndex(i as u32)); + } + } + None + } +} diff --git a/node/network/availability-distribution/src/session_cache.rs b/node/network/availability-distribution/src/session_cache.rs index a18bf8bcee29..418f79f49ebe 100644 --- a/node/network/availability-distribution/src/session_cache.rs +++ b/node/network/availability-distribution/src/session_cache.rs @@ -33,8 +33,7 @@ use polkadot_primitives::v1::{ use polkadot_subsystem::SubsystemContext; use super::{ - error::{recv_runtime, NonFatalError}, - Error, + error::{recv_runtime, Error}, LOG_TARGET, }; @@ -122,7 +121,7 @@ impl SessionCache { ctx: &mut Context, parent: Hash, with_info: F, - ) -> Result, NonFatalError> + ) -> Result, Error> where Context: SubsystemContext, F: FnOnce(&SessionInfo) -> R, @@ -219,7 +218,7 @@ impl SessionCache { ctx: &mut Context, parent: Hash, session_index: SessionIndex, - ) -> Result, NonFatalError> + ) -> Result, Error> where Context: SubsystemContext, { @@ -230,7 +229,7 @@ impl SessionCache { .. } = recv_runtime(request_session_info_ctx(parent, session_index, ctx).await) .await? 
- .ok_or(NonFatalError::NoSuchSession(session_index))?; + .ok_or(Error::NoSuchSession(session_index))?; if let Some(our_index) = self.get_our_index(validators).await { // Get our group index: diff --git a/node/network/protocol/src/request_response/v1.rs b/node/network/protocol/src/request_response/v1.rs index 3e4e7adcbb73..e07c678cdea6 100644 --- a/node/network/protocol/src/request_response/v1.rs +++ b/node/network/protocol/src/request_response/v1.rs @@ -18,7 +18,7 @@ use parity_scale_codec::{Decode, Encode}; -use polkadot_primitives::v1::{CandidateHash, CandidateReceipt, ErasureChunk, ValidatorIndex, CompressedPoV, Hash, CandidateDescriptor}; +use polkadot_primitives::v1::{CandidateHash, CandidateReceipt, ErasureChunk, ValidatorIndex, CompressedPoV, Hash}; use polkadot_primitives::v1::Id as ParaId; use super::request::IsRequest; @@ -105,10 +105,8 @@ impl IsRequest for CollationFetchingRequest { /// Request the advertised collation at that relay-parent. #[derive(Debug, Clone, Encode, Decode)] pub struct PoVFetchingRequest { - /// Relay parent we want a collation for. - pub relay_parent: Hash, - /// Candidate descriptor. - pub descriptor: CandidateDescriptor, + /// Candidate we want a PoV for. + pub candidate_hash: CandidateHash, } /// Responses to `PoVFetchingRequest`. From 545e9503e3bf6c2f9c9797871940ef4faf8122e1 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Tue, 23 Mar 2021 18:52:34 +0100 Subject: [PATCH 07/30] Get rid of seemingly dead code. --- node/subsystem/src/messages.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index c5c2b091a4cd..1af21113140e 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -283,15 +283,6 @@ pub enum AvailabilityRecoveryMessage { NetworkBridgeUpdateV1(NetworkBridgeEvent), } -// impl AvailabilityDistributionMessage { -// /// If the current variant contains the relay parent hash, return it. -// pub fn relay_parent(&self) -> Option { -// match self { -// Self::AvailabilityFetchingRequest(_) => None, -// } -// } -// } - /// Bitfield distribution message. #[derive(Debug, derive_more::From)] pub enum BitfieldDistributionMessage { From 47d9f5ff2450e29498dbd0bfc5ff67b61952b716 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Tue, 23 Mar 2021 23:56:27 +0100 Subject: [PATCH 08/30] Implement PoV fetching. Backing does not yet use it. --- .../availability-distribution/src/error.rs | 12 +++ .../availability-distribution/src/lib.rs | 20 ++++ .../src/pov_requester/mod.rs | 101 ++++++++++++++++-- node/subsystem/src/messages.rs | 15 +++ 4 files changed, 141 insertions(+), 7 deletions(-) diff --git a/node/network/availability-distribution/src/error.rs b/node/network/availability-distribution/src/error.rs index 0de80765696b..8f04706230dc 100644 --- a/node/network/availability-distribution/src/error.rs +++ b/node/network/availability-distribution/src/error.rs @@ -17,6 +17,7 @@ //! Error handling related code and Error/Result definitions. +use polkadot_node_network_protocol::request_response::request::RequestError; use thiserror::Error; use futures::channel::oneshot; @@ -80,6 +81,17 @@ pub enum Error { /// Decompressing PoV failed. #[error("PoV could not be decompressed")] PoVDecompression(CompressedPoVError), + + /// Fetching PoV failed with `RequestError`. 
+ #[error("FetchPoV request error")] + FetchPoV(#[source] RequestError), + + #[error("Remote responded with `NoSuchPoV`")] + NoSuchPoV, + + /// No validator with the index could be found in current session. + #[error("Given validator index could not be found")] + InvalidValidatorIndex, } pub type Result = std::result::Result; diff --git a/node/network/availability-distribution/src/lib.rs b/node/network/availability-distribution/src/lib.rs index 3d42c557f549..3a59ccabfaff 100644 --- a/node/network/availability-distribution/src/lib.rs +++ b/node/network/availability-distribution/src/lib.rs @@ -143,6 +143,26 @@ impl AvailabilityDistributionSubsystem { } => { answer_pov_request_log(&mut ctx, req, &self.metrics).await } + FromOverseer::Communication { + msg: AvailabilityDistributionMessage::FetchPoV{ + relay_parent, + from_validator, + candidate_hash, + tx, + }, + } => { + log_error( + pov_requester.fetch_pov( + &mut ctx, + &mut self.runtime, + relay_parent, + from_validator, + candidate_hash, + tx, + ).await, + "PoVRequester::fetch_pov" + ); + } } } } diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 912ffa3f8c41..ccf65c41305b 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -16,14 +16,25 @@ //! PoV requester takes care of requesting PoVs from validators of a backing group. -use futures::channel::mpsc; +use futures::{FutureExt, channel::{mpsc, oneshot}, future::BoxFuture}; use lru::LruCache; -use polkadot_node_network_protocol::{PeerId, peer_set::PeerSet}; -use polkadot_primitives::v1::{AuthorityDiscoveryId, Hash, SessionIndex}; -use polkadot_subsystem::{ActiveLeavesUpdate, SubsystemContext, messages::{AllMessages, NetworkBridgeMessage}}; - -use crate::runtime::Runtime; +use polkadot_node_network_protocol::{ + PeerId, peer_set::PeerSet, + request_response::{OutgoingRequest, Recipient, request::{RequestError, Requests}, + v1::{PoVFetchingRequest, PoVFetchingResponse}} +}; +use polkadot_primitives::v1::{ + AuthorityDiscoveryId, CandidateHash, Hash, PoV, SessionIndex, ValidatorIndex +}; +use polkadot_subsystem::{ + ActiveLeavesUpdate, SubsystemContext, messages::{AllMessages, NetworkBridgeMessage, IfDisconnected} +}; + +use crate::{ + runtime::Runtime, + error::{Error, log_error}, +}; /// Number of sessions we want to keep in the LRU. const NUM_SESSIONS: usize = 2; @@ -73,8 +84,77 @@ impl PoVRequester { Ok(()) } + /// Start background worker for taking care of fetching the requested `PoV` from the network. + pub async fn fetch_pov( + &self, + ctx: &mut Context, + runtime: &mut Runtime, + parent: Hash, + from_validator: ValidatorIndex, + candidate_hash: CandidateHash, + tx: oneshot::Sender + ) -> super::Result<()> + where + Context: SubsystemContext, + { + let info = &runtime.get_session_info(ctx, parent).await?.session_info; + let authority_id = info.discovery_keys.get(from_validator.0 as usize) + .ok_or(Error::InvalidValidatorIndex)? 
+ .clone(); + let (req, pending_response) = OutgoingRequest::new( + Recipient::Authority(authority_id), + PoVFetchingRequest { + candidate_hash, + }, + ); + let full_req = Requests::PoVFetching(req); + + ctx.send_message( + AllMessages::NetworkBridge( + NetworkBridgeMessage::SendRequests( + vec![full_req], + // We are supposed to be connected to validators of our group: + IfDisconnected::ImmediateError + ) + )).await; + + ctx.spawn("pov-fetcher", fetch_pov_job(pending_response.boxed(), tx).boxed()) + .await + .map_err(|e| Error::SpawnTask(e)) + } +} + +/// Future to be spawned for taking care of handling reception and sending of PoV. +async fn fetch_pov_job( + pending_response: BoxFuture<'static, Result>, + tx: oneshot::Sender, +) { + log_error( + do_fetch_pov(pending_response, tx).await, + "fetch_pov_job", + ) +} + +/// Do the actual work of waiting for the response. +async fn do_fetch_pov( + pending_response: BoxFuture<'static, Result>, + tx: oneshot::Sender, +) + -> super::Result<()> +{ + let response = pending_response.await.map_err(Error::FetchPoV)?; + let pov = match response { + PoVFetchingResponse::PoV(compressed) => { + compressed.decompress().map_err(Error::PoVDecompression)? + } + PoVFetchingResponse::NoSuchPoV => { + return Err(Error::NoSuchPoV) + } + }; + tx.send(pov).map_err(|_| Error::SendResponse) } +/// Get the session indeces for the given relay chain parents. async fn get_activated_sessions(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator) -> super::Result> where @@ -87,6 +167,7 @@ where Ok(sessions.into_iter()) } +/// Connect to validators of our validator group. async fn connect_to_relevant_validators( ctx: &mut Context, runtime: &mut Runtime, @@ -106,6 +187,7 @@ where Ok(rx) } +/// Get the validators in our validator group. async fn determine_relevant_validators( ctx: &mut Context, runtime: &mut Runtime, @@ -121,7 +203,12 @@ where let indeces = info.session_info.validator_groups.get(validator_info.our_group.0 as usize) .expect("Our group got retrieved from that session info, it must exist. qed.") .clone(); - Ok(indeces.into_iter().map(|i| info.session_info.discovery_keys[i.0 as usize].clone()).collect()) + Ok( + indeces.into_iter() + .filter(|i| *i != validator_info.our_index) + .map(|i| info.session_info.discovery_keys[i.0 as usize].clone()) + .collect() + ) } else { Ok(Vec::new()) } diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index 1af21113140e..3a6a7ac3789b 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -266,6 +266,21 @@ pub enum AvailabilityDistributionMessage { AvailabilityFetchingRequest(IncomingRequest), /// Incoming network request for a seconded PoV. PoVFetchingRequest(IncomingRequest), + /// Instruct availability distribution to fetch a remote PoV. + /// + /// NOTE: The result of this fetch is not yet locally validated and could be bogus. + FetchPoV{ + /// The relay parent giving the necessary context. + relay_parent: Hash, + /// Validator to fetch the PoV from. + from_validator: ValidatorIndex, + /// Candidate hash to fetch the PoV for. + candidate_hash: CandidateHash, + /// Sender for getting back the result of this fetch. + /// + /// The sender will be canceled if the fetching failed for some reason. + tx: oneshot::Sender, + }, } /// Availability Recovery Message. From 0a283aba476b684378b9886c8663232848842590 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Wed, 24 Mar 2021 01:36:31 +0100 Subject: [PATCH 09/30] Don't send `ConnectToValidators` for empty list. 
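When this node is not a validator in the affected session there is nobody we need to stay
connected to, so `connect_to_relevant_validators` should not send a `ConnectToValidators`
request with an empty list to the network bridge. `determine_relevant_validators` therefore
now returns an `Option`, with `None` meaning "not a validator in this session", and the
caller only issues the request in the `Some` case. A condensed sketch of the resulting
caller logic (names taken from this patch, error handling elided):

    // Sketch only - mirrors the hunk below.
    if let Some(validator_ids) = determine_relevant_validators(ctx, runtime, parent, session).await? {
        // We don't actually care about `PeerId`s, we just keep `rx` to stay connected:
        let (tx, rx) = mpsc::channel(0);
        ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators {
            validator_ids, peer_set: PeerSet::Validation, connected: tx,
        })).await;
        Ok(Some(rx))
    } else {
        Ok(None)
    }
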
--- .../src/pov_requester/mod.rs | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index ccf65c41305b..3fe28d3c87bd 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -76,10 +76,9 @@ impl PoVRequester { if self.connected_validators.contains(&session_index) { continue } - self.connected_validators.put( - session_index, - connect_to_relevant_validators(ctx, runtime, parent, session_index).await? - ); + if let Some(rx) = connect_to_relevant_validators(ctx, runtime, parent, session_index).await? { + self.connected_validators.put(session_index, rx); + } } Ok(()) } @@ -174,27 +173,32 @@ async fn connect_to_relevant_validators( parent: Hash, session: SessionIndex ) - -> super::Result> + -> super::Result>> where Context: SubsystemContext, { - let validator_ids = determine_relevant_validators(ctx, runtime, parent, session).await?; - // We don't actually care about `PeerId`s, just keeping receiver so we stay connected: - let (tx, rx) = mpsc::channel(0); - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators { - validator_ids, peer_set: PeerSet::Validation, connected: tx - })).await; - Ok(rx) + if let Some(validator_ids) = determine_relevant_validators(ctx, runtime, parent, session).await? { + // We don't actually care about `PeerId`s, just keeping receiver so we stay connected: + let (tx, rx) = mpsc::channel(0); + ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators { + validator_ids, peer_set: PeerSet::Validation, connected: tx + })).await; + Ok(Some(rx)) + } else { + Ok(None) + } } /// Get the validators in our validator group. +/// +/// Return: `None` if not a validator. async fn determine_relevant_validators( ctx: &mut Context, runtime: &mut Runtime, parent: Hash, session: SessionIndex, ) - -> super::Result> + -> super::Result>> where Context: SubsystemContext, { @@ -203,13 +207,13 @@ where let indeces = info.session_info.validator_groups.get(validator_info.our_group.0 as usize) .expect("Our group got retrieved from that session info, it must exist. qed.") .clone(); - Ok( + Ok(Some( indeces.into_iter() .filter(|i| *i != validator_info.our_index) .map(|i| info.session_info.discovery_keys[i.0 as usize].clone()) .collect() - ) + )) } else { - Ok(Vec::new()) + Ok(None) } } From afd795f3746f873fcfde92d274be59b4ca38ef16 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Wed, 24 Mar 2021 02:11:08 +0100 Subject: [PATCH 10/30] Even better - no need to check over and over again. --- .../availability-distribution/src/pov_requester/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 3fe28d3c87bd..270f4c335a06 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -44,7 +44,8 @@ pub struct PoVRequester { /// We only ever care about being connected to validators of at most two sessions. /// /// So we keep an LRU for managing connection requests of size 2. - connected_validators: LruCache>, + /// Cache will contain `None` if we are not a validator in that session. 
+ connected_validators: LruCache>>, } impl PoVRequester { @@ -76,9 +77,8 @@ impl PoVRequester { if self.connected_validators.contains(&session_index) { continue } - if let Some(rx) = connect_to_relevant_validators(ctx, runtime, parent, session_index).await? { - self.connected_validators.put(session_index, rx); - } + let rx = connect_to_relevant_validators(ctx, runtime, parent, session_index).await?; + self.connected_validators.put(session_index, rx); } Ok(()) } From 1c3eec889095061292bbd10f1f10f090aad782b7 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Wed, 24 Mar 2021 17:05:27 +0100 Subject: [PATCH 11/30] PoV fetching implemented. + Typechecks + Should work Missing: - Guide - Tests - Do fallback fetching in case fetching from seconding validator fails. --- Cargo.lock | 1 - node/core/backing/Cargo.toml | 1 - node/core/backing/src/lib.rs | 97 +++++--------- .../src/pov_requester/mod.rs | 7 +- node/overseer/src/lib.rs | 124 ++++-------------- node/service/src/lib.rs | 4 - node/subsystem/src/messages.rs | 29 ---- 7 files changed, 58 insertions(+), 205 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ebf9412b6181..802919079de1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5517,7 +5517,6 @@ dependencies = [ "bitvec", "futures 0.3.12", "polkadot-erasure-coding", - "polkadot-node-network-protocol", "polkadot-node-primitives", "polkadot-node-subsystem", "polkadot-node-subsystem-test-helpers", diff --git a/node/core/backing/Cargo.toml b/node/core/backing/Cargo.toml index 8538fef0a43f..a6b04c03ae92 100644 --- a/node/core/backing/Cargo.toml +++ b/node/core/backing/Cargo.toml @@ -8,7 +8,6 @@ edition = "2018" futures = "0.3.12" sp-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" } polkadot-primitives = { path = "../../../primitives" } -polkadot-node-network-protocol = { path = "../../network/protocol" } polkadot-node-primitives = { path = "../../primitives" } polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" } polkadot-node-subsystem-util = { path = "../../subsystem-util" } diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index ab8ed125b250..7880a17cfeec 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -29,12 +29,20 @@ use sp_keystore::SyncCryptoStorePtr; use polkadot_primitives::v1::{ AvailableData, BackedCandidate, CandidateCommitments, CandidateDescriptor, CandidateHash, CandidateReceipt, CollatorId, CommittedCandidateReceipt, CoreIndex, CoreState, Hash, Id as ParaId, - PoV, SigningContext, ValidatorId, ValidatorIndex, ValidatorSignature, ValidityAttestation, AuthorityDiscoveryId, + PoV, SigningContext, ValidatorId, ValidatorIndex, ValidatorSignature, ValidityAttestation, }; use polkadot_node_primitives::{ Statement, SignedFullStatement, ValidationResult, }; -use polkadot_subsystem::{PerLeafSpan, Stage, errors::RuntimeApiError, jaeger, messages::{AllMessages, AvailabilityStoreMessage, CandidateBackingMessage, CandidateSelectionMessage, CandidateValidationMessage, IfDisconnected, NetworkBridgeMessage, PoVDistributionMessage, ProvisionableData, ProvisionerMessage, RuntimeApiRequest, StatementDistributionMessage, ValidationFailed}}; +use polkadot_subsystem::{ + PerLeafSpan, Stage, jaeger, + messages::{ + AllMessages, AvailabilityDistributionMessage, AvailabilityStoreMessage, + CandidateBackingMessage, CandidateSelectionMessage, CandidateValidationMessage, + ProvisionableData, ProvisionerMessage, RuntimeApiRequest, + StatementDistributionMessage, ValidationFailed + } +}; 
use polkadot_node_subsystem_util::{ self as util, request_session_index_for_child, @@ -46,10 +54,6 @@ use polkadot_node_subsystem_util::{ FromJobCommand, metrics::{self, prometheus}, }; -use polkadot_node_subsystem_util::Error as UtilError; -use polkadot_node_network_protocol::request_response::{Recipient, Requests, request::{OutgoingRequest, RequestError}, v1::{ - PoVFetchingRequest, PoVFetchingResponse - }}; use statement_table::{ generic::AttestedCandidate as TableAttestedCandidate, Context as TableContextTrait, @@ -61,7 +65,6 @@ use statement_table::{ }, }; use thiserror::Error; -use util::request_session_info; const LOG_TARGET: &str = "parachain::candidate-backing"; @@ -73,22 +76,8 @@ enum Error { InvalidSignature, #[error("Failed to send candidates {0:?}")] Send(Vec), - #[error("FetchPoV request error")] - FetchPoV(#[source] RequestError), - #[error("FetchPoV channel closed before receipt")] - FetchPoVCanceled(#[source] oneshot::Canceled), - #[error("FetchPoV runtime request failed in utilities")] - FetchPoVUtil(#[source] UtilError), - #[error("FetchPoV runtime request failed")] - FetchPoVRuntime(#[source] RuntimeApiError), - #[error("FetchPoV could not find session")] - FetchPoVNoSuchSession, - #[error("FetchPoV could not find index of signing validator")] - FetchPoVInvalidValidatorIndex, - #[error("Invalid response")] - FetchPoVInvalidResponse, - #[error("Backer did not have PoV")] - FetchPoVNoSuchPoV, + #[error("FetchPoV failed")] + FetchPoV, #[error("ValidateFromChainState channel closed before receipt")] ValidateFromChainState(#[source] oneshot::Canceled), #[error("StoreAvailableData channel closed before receipt")] @@ -110,11 +99,11 @@ enum PoVData { /// Allready available (from candidate selection). Ready(Arc), /// Needs to be fetched from validator (we are checking a signed statement). - FetchFromValidator(ValidatorIndex), + FetchFromValidator(ValidatorIndex, CandidateHash), } enum ValidatedCandidateCommand { - // We were instructed to second the candidate. + // We were instructed to second the candidate that has been already validated. Second(BackgroundValidationResult), // We were instructed to validate the candidate. Attest(BackgroundValidationResult), @@ -357,48 +346,22 @@ async fn make_pov_available( async fn request_pov( tx_from: &mut mpsc::Sender, - parent: Hash, - descriptor: CandidateDescriptor, + relay_parent: Hash, from_validator: ValidatorIndex, + candidate_hash: CandidateHash, ) -> Result, Error> { - let session_index = request_session_index_for_child(parent, tx_from) - .await.map_err(Error::FetchPoVUtil)? - .await.map_err(Error::FetchPoVCanceled)? - .map_err(Error::FetchPoVRuntime)?; - let session_info = request_session_info(parent, session_index, tx_from) - .await.map_err(Error::FetchPoVUtil)? - .await.map_err(Error::FetchPoVCanceled)? - .map_err(Error::FetchPoVRuntime)? 
- .ok_or(Error::FetchPoVNoSuchSession)?; - let authority_id = session_info.discovery_keys - .get(from_validator.0 as usize) - .ok_or(Error::FetchPoVInvalidValidatorIndex)?; - - let (req, pending_response) = OutgoingRequest::new( - Recipient::Authority(authority_id.clone()), - PoVFetchingRequest { - relay_parent: parent, - descriptor, - }, - ); - let full_req = Requests::PoVFetching(req); - - tx_from.send(FromJobCommand::SendMessage(AllMessages::NetworkBridge( - NetworkBridgeMessage::SendRequests( - vec![full_req], - IfDisconnected::ImmediateError - ) - ))).await?; - let response = pending_response.await.map_err(Error::FetchPoV)?; - let pov = match response { - PoVFetchingResponse::PoV(compressed) => { - compressed.decompress().map_err(|_| Error::FetchPoVInvalidResponse)? - } - PoVFetchingResponse::NoSuchPoV => { - return Err(Error::FetchPoVNoSuchPoV) + let (tx, rx) = oneshot::channel(); + tx_from.send(FromJobCommand::SendMessage(AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + candidate_hash, + from_validator, + tx, } - }; + ))).await?; + + let pov = rx.await.map_err(|_| Error::FetchPoV)?; Ok(Arc::new(pov)) } @@ -456,13 +419,13 @@ async fn validate_and_make_available( let pov = match pov { PoVData::Ready(pov) => pov, - PoVData::FetchFromValidator(validator_index) => { + PoVData::FetchFromValidator(validator_index, candidate_hash) => { let _span = span.as_ref().map(|s| s.child("request-pov")); request_pov( &mut tx_from, relay_parent, - candidate.descriptor.clone(), validator_index, + candidate_hash, ).await? } }; @@ -580,7 +543,7 @@ impl CandidateBackingJob { match command { ValidatedCandidateCommand::Second(res) => { match res { - Ok((candidate, commitments, pov)) => { + Ok((candidate, commitments, _)) => { // sanity check. if self.seconded.is_none() && !self.issued_statements.contains(&candidate_hash) { self.seconded = Some(candidate_hash); @@ -915,7 +878,7 @@ impl CandidateBackingJob { tx_command: self.background_validation_tx.clone(), candidate, relay_parent: self.parent, - pov: PoVData::FetchFromValidator(statement.validator_index()), + pov: PoVData::FetchFromValidator(statement.validator_index(), candidate_hash), validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 270f4c335a06..f07fb8be6112 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -40,7 +40,6 @@ use crate::{ const NUM_SESSIONS: usize = 2; pub struct PoVRequester { - /// We only ever care about being connected to validators of at most two sessions. /// /// So we keep an LRU for managing connection requests of size 2. @@ -112,8 +111,10 @@ impl PoVRequester { AllMessages::NetworkBridge( NetworkBridgeMessage::SendRequests( vec![full_req], - // We are supposed to be connected to validators of our group: - IfDisconnected::ImmediateError + // We are supposed to be connected to validators of our group via `PeerSet`, + // but at session boundaries that is kind of racy, in case a connection takes + // longer to get established, so we try to connect in any case. 
+ IfDisconnected::TryConnect ) )).await; diff --git a/node/overseer/src/lib.rs b/node/overseer/src/lib.rs index 906e8ff9090e..74890639a54d 100644 --- a/node/overseer/src/lib.rs +++ b/node/overseer/src/lib.rs @@ -83,7 +83,7 @@ use polkadot_subsystem::messages::{ CandidateValidationMessage, CandidateBackingMessage, CandidateSelectionMessage, ChainApiMessage, StatementDistributionMessage, AvailabilityDistributionMessage, BitfieldSigningMessage, BitfieldDistributionMessage, - ProvisionerMessage, PoVDistributionMessage, RuntimeApiMessage, + ProvisionerMessage, RuntimeApiMessage, AvailabilityStoreMessage, NetworkBridgeMessage, AllMessages, CollationGenerationMessage, CollatorProtocolMessage, AvailabilityRecoveryMessage, ApprovalDistributionMessage, ApprovalVotingMessage, GossipSupportMessage, @@ -538,9 +538,6 @@ pub struct Overseer { /// A provisioner subsystem. provisioner_subsystem: OverseenSubsystem, - /// A PoV distribution subsystem. - pov_distribution_subsystem: OverseenSubsystem, - /// A runtime API subsystem. runtime_api_subsystem: OverseenSubsystem, @@ -608,7 +605,7 @@ pub struct Overseer { /// subsystems are implemented and the rest can be mocked with the [`DummySubsystem`]. pub struct AllSubsystems< CV = (), CB = (), CS = (), SD = (), AD = (), AR = (), BS = (), BD = (), P = (), - PoVD = (), RA = (), AS = (), NB = (), CA = (), CG = (), CP = (), ApD = (), ApV = (), + RA = (), AS = (), NB = (), CA = (), CG = (), CP = (), ApD = (), ApV = (), GS = (), > { /// A candidate validation subsystem. @@ -629,8 +626,6 @@ pub struct AllSubsystems< pub bitfield_distribution: BD, /// A provisioner subsystem. pub provisioner: P, - /// A PoV distribution subsystem. - pub pov_distribution: PoVD, /// A runtime API subsystem. pub runtime_api: RA, /// An availability store subsystem. @@ -651,8 +646,8 @@ pub struct AllSubsystems< pub gossip_support: GS, } -impl - AllSubsystems +impl + AllSubsystems { /// Create a new instance of [`AllSubsystems`]. 
/// @@ -685,7 +680,6 @@ impl { AllSubsystems { candidate_validation: DummySubsystem, @@ -697,7 +691,6 @@ impl( self, candidate_validation: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation, candidate_backing: self.candidate_backing, @@ -725,7 +718,6 @@ impl( self, candidate_backing: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing, @@ -753,7 +745,6 @@ impl( self, candidate_selection: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -781,7 +772,6 @@ impl( self, statement_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -809,7 +799,6 @@ impl( self, availability_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -837,7 +826,6 @@ impl( self, availability_recovery: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -865,7 +853,6 @@ impl( self, bitfield_signing: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -893,7 +880,6 @@ impl( self, bitfield_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -921,7 +907,6 @@ impl( self, provisioner: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -949,35 +934,6 @@ impl( - self, - pov_distribution: NEW, - ) -> AllSubsystems { - AllSubsystems { - candidate_validation: self.candidate_validation, - candidate_backing: self.candidate_backing, - candidate_selection: self.candidate_selection, - statement_distribution: self.statement_distribution, - availability_distribution: self.availability_distribution, - availability_recovery: self.availability_recovery, - bitfield_signing: self.bitfield_signing, - bitfield_distribution: self.bitfield_distribution, - provisioner: self.provisioner, - pov_distribution, runtime_api: self.runtime_api, availability_store: self.availability_store, network_bridge: self.network_bridge, @@ -994,7 +950,7 @@ impl( self, runtime_api: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1005,7 +961,6 @@ impl( self, availability_store: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1033,7 +988,6 @@ impl( self, network_bridge: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1061,7 +1015,6 @@ impl( self, chain_api: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1089,7 +1042,6 @@ impl( self, collation_generation: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { 
AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1117,7 +1069,6 @@ impl( self, collator_protocol: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1145,7 +1096,6 @@ impl( self, approval_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1173,7 +1123,6 @@ impl( self, approval_voting: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1201,7 +1150,6 @@ impl( self, gossip_support: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1229,7 +1177,6 @@ impl( + pub fn new( leaves: impl IntoIterator, - all_subsystems: AllSubsystems, + all_subsystems: AllSubsystems, prometheus_registry: Option<&prometheus::Registry>, mut s: S, ) -> SubsystemResult<(Self, OverseerHandler)> @@ -1465,7 +1412,6 @@ where BS: Subsystem> + Send, BD: Subsystem> + Send, P: Subsystem> + Send, - PoVD: Subsystem> + Send, RA: Subsystem> + Send, AS: Subsystem> + Send, NB: Subsystem> + Send, @@ -1595,16 +1541,6 @@ where TaskKind::Regular, )?; - let pov_distribution_subsystem = spawn( - &mut s, - &mut running_subsystems, - metered::UnboundedMeteredSender::<_>::clone(&to_overseer_tx), - all_subsystems.pov_distribution, - &metrics, - &mut seed, - TaskKind::Regular, - )?; - let runtime_api_subsystem = spawn( &mut s, &mut running_subsystems, @@ -1714,7 +1650,6 @@ where bitfield_signing_subsystem, bitfield_distribution_subsystem, provisioner_subsystem, - pov_distribution_subsystem, runtime_api_subsystem, availability_store_subsystem, network_bridge_subsystem, @@ -1749,7 +1684,6 @@ where let _ = self.bitfield_signing_subsystem.send_signal(OverseerSignal::Conclude).await; let _ = self.bitfield_distribution_subsystem.send_signal(OverseerSignal::Conclude).await; let _ = self.provisioner_subsystem.send_signal(OverseerSignal::Conclude).await; - let _ = self.pov_distribution_subsystem.send_signal(OverseerSignal::Conclude).await; let _ = self.runtime_api_subsystem.send_signal(OverseerSignal::Conclude).await; let _ = self.availability_store_subsystem.send_signal(OverseerSignal::Conclude).await; let _ = self.network_bridge_subsystem.send_signal(OverseerSignal::Conclude).await; @@ -1920,7 +1854,6 @@ where self.bitfield_signing_subsystem.send_signal(signal.clone()).await?; self.bitfield_distribution_subsystem.send_signal(signal.clone()).await?; self.provisioner_subsystem.send_signal(signal.clone()).await?; - self.pov_distribution_subsystem.send_signal(signal.clone()).await?; self.runtime_api_subsystem.send_signal(signal.clone()).await?; self.availability_store_subsystem.send_signal(signal.clone()).await?; self.network_bridge_subsystem.send_signal(signal.clone()).await?; @@ -1966,9 +1899,6 @@ where AllMessages::Provisioner(msg) => { self.provisioner_subsystem.send_message(msg).await?; }, - AllMessages::PoVDistribution(msg) => { - self.pov_distribution_subsystem.send_message(msg).await?; - }, AllMessages::RuntimeApi(msg) => { self.runtime_api_subsystem.send_message(msg).await?; }, @@ -2833,10 +2763,6 @@ mod tests { ProvisionerMessage::RequestInherentData(Default::default(), sender) } - fn test_pov_distribution_msg() -> 
PoVDistributionMessage { - PoVDistributionMessage::NetworkBridgeUpdateV1(test_network_bridge_event()) - } - fn test_runtime_api_msg() -> RuntimeApiMessage { let (sender, _) = oneshot::channel(); RuntimeApiMessage::Request(Default::default(), RuntimeApiRequest::Validators(sender)) @@ -2888,7 +2814,6 @@ mod tests { bitfield_signing: subsystem.clone(), bitfield_distribution: subsystem.clone(), provisioner: subsystem.clone(), - pov_distribution: subsystem.clone(), runtime_api: subsystem.clone(), availability_store: subsystem.clone(), network_bridge: subsystem.clone(), @@ -2927,7 +2852,6 @@ mod tests { // handler.send_msg(AllMessages::GossipSupport(test_bitfield_signing_msg())).await; handler.send_msg(AllMessages::BitfieldDistribution(test_bitfield_distribution_msg())).await; handler.send_msg(AllMessages::Provisioner(test_provisioner_msg())).await; - handler.send_msg(AllMessages::PoVDistribution(test_pov_distribution_msg())).await; handler.send_msg(AllMessages::RuntimeApi(test_runtime_api_msg())).await; handler.send_msg(AllMessages::AvailabilityStore(test_availability_store_msg())).await; handler.send_msg(AllMessages::NetworkBridge(test_network_bridge_msg())).await; diff --git a/node/service/src/lib.rs b/node/service/src/lib.rs index 17b51419c618..69f7dd91ba25 100644 --- a/node/service/src/lib.rs +++ b/node/service/src/lib.rs @@ -450,7 +450,6 @@ where use polkadot_node_collation_generation::CollationGenerationSubsystem; use polkadot_collator_protocol::{CollatorProtocolSubsystem, ProtocolSide}; use polkadot_network_bridge::NetworkBridge as NetworkBridgeSubsystem; - use polkadot_pov_distribution::PoVDistribution as PoVDistributionSubsystem; use polkadot_node_core_provisioner::ProvisioningSubsystem as ProvisionerSubsystem; use polkadot_node_core_runtime_api::RuntimeApiSubsystem; use polkadot_statement_distribution::StatementDistribution as StatementDistributionSubsystem; @@ -514,9 +513,6 @@ where authority_discovery, request_multiplexer, ), - pov_distribution: PoVDistributionSubsystem::new( - Metrics::register(registry)?, - ), provisioner: ProvisionerSubsystem::new( spawner.clone(), (), diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index 3a6a7ac3789b..5d296186c0ec 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -572,33 +572,6 @@ impl BoundToRelayParent for ProvisionerMessage { } } -/// Message to the PoV Distribution subsystem. -#[derive(Debug, derive_more::From)] -pub enum PoVDistributionMessage { - /// Fetch a PoV from the network. - /// - /// This `CandidateDescriptor` should correspond to a candidate seconded under the provided - /// relay-parent hash. - FetchPoV(Hash, CandidateDescriptor, oneshot::Sender>), - /// Distribute a PoV for the given relay-parent and CandidateDescriptor. - /// The PoV should correctly hash to the PoV hash mentioned in the CandidateDescriptor - DistributePoV(Hash, CandidateDescriptor, Arc), - /// An update from the network bridge. - #[from] - NetworkBridgeUpdateV1(NetworkBridgeEvent), -} - -impl PoVDistributionMessage { - /// If the current variant contains the relay parent hash, return it. - pub fn relay_parent(&self) -> Option { - match self { - Self::FetchPoV(hash, _, _) => Some(*hash), - Self::DistributePoV(hash, _, _) => Some(*hash), - Self::NetworkBridgeUpdateV1(_) => None, - } - } -} - /// Message to the Collation Generation subsystem. #[derive(Debug)] pub enum CollationGenerationMessage { @@ -720,8 +693,6 @@ pub enum AllMessages { /// Message for the Provisioner subsystem. 
#[skip] Provisioner(ProvisionerMessage), - /// Message for the PoV Distribution subsystem. - PoVDistribution(PoVDistributionMessage), /// Message for the Runtime API subsystem. #[skip] RuntimeApi(RuntimeApiMessage), From cceddce3607653e53515d508bd4389e0fe986ece Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Wed, 24 Mar 2021 21:23:42 +0100 Subject: [PATCH 12/30] Check PoV hash upon reception. --- node/core/backing/src/lib.rs | 36 ++++++++--- .../availability-distribution/src/error.rs | 4 ++ .../availability-distribution/src/lib.rs | 60 ++++++++++--------- .../src/pov_requester/mod.rs | 23 ++++--- node/subsystem/src/messages.rs | 2 + 5 files changed, 79 insertions(+), 46 deletions(-) diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index 5ab284ca826a..c30313c4cd23 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -99,7 +99,11 @@ enum PoVData { /// Allready available (from candidate selection). Ready(Arc), /// Needs to be fetched from validator (we are checking a signed statement). - FetchFromValidator(ValidatorIndex, CandidateHash), + FetchFromValidator { + from_validator: ValidatorIndex, + candidate_hash: CandidateHash, + pov_hash:Hash, + }, } enum ValidatedCandidateCommand { @@ -349,14 +353,16 @@ async fn request_pov( relay_parent: Hash, from_validator: ValidatorIndex, candidate_hash: CandidateHash, + pov_hash: Hash, ) -> Result, Error> { let (tx, rx) = oneshot::channel(); tx_from.send(FromJobCommand::SendMessage(AllMessages::AvailabilityDistribution( AvailabilityDistributionMessage::FetchPoV { relay_parent, - candidate_hash, from_validator, + candidate_hash, + pov_hash, tx, } ))).await?; @@ -419,13 +425,18 @@ async fn validate_and_make_available( let pov = match pov { PoVData::Ready(pov) => pov, - PoVData::FetchFromValidator(validator_index, candidate_hash) => { + PoVData::FetchFromValidator { + from_validator, + candidate_hash, + pov_hash, + } => { let _span = span.as_ref().map(|s| s.child("request-pov")); request_pov( &mut tx_from, relay_parent, - validator_index, + from_validator, candidate_hash, + pov_hash, ).await? } }; @@ -832,11 +843,11 @@ impl CandidateBackingJob { } /// Kick off validation work and distribute the result as a signed statement. - #[tracing::instrument(level = "trace", skip(self, span), fields(subsystem = LOG_TARGET))] + #[tracing::instrument(level = "trace", skip(self, pov, span), fields(subsystem = LOG_TARGET))] async fn kick_off_validation_work( &mut self, summary: TableSummary, - statement: SignedFullStatement, + pov: PoVData, span: Option, ) -> Result<(), Error> { let candidate_hash = summary.candidate; @@ -875,7 +886,7 @@ impl CandidateBackingJob { tx_command: self.background_validation_tx.clone(), candidate, relay_parent: self.parent, - pov: PoVData::FetchFromValidator(statement.validator_index(), candidate_hash), + pov, validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, @@ -892,17 +903,24 @@ impl CandidateBackingJob { statement: SignedFullStatement, ) -> Result<(), Error> { if let Some(summary) = self.import_statement(&statement, parent_span).await? 
{ - if let Statement::Seconded(_) = statement.payload() { + if let Statement::Seconded(receipt) = statement.payload() { if Some(summary.group_id) == self.assignment { let span = self.get_unbacked_validation_child( root_span, summary.candidate, summary.group_id, ); + let pov_hash = receipt.descriptor.pov_hash; + let candidate_hash = summary.candidate; + let pov_data = PoVData::FetchFromValidator { + from_validator: statement.validator_index(), + candidate_hash, + pov_hash + }; self.kick_off_validation_work( summary, - statement, + pov_data, span, ).await?; } diff --git a/node/network/availability-distribution/src/error.rs b/node/network/availability-distribution/src/error.rs index 8f04706230dc..3a6dc7f444e4 100644 --- a/node/network/availability-distribution/src/error.rs +++ b/node/network/availability-distribution/src/error.rs @@ -86,6 +86,10 @@ pub enum Error { #[error("FetchPoV request error")] FetchPoV(#[source] RequestError), + /// Fetching PoV failed as the received PoV did not match the expected hash. + #[error("Fetched PoV does not match expected hash")] + UnexpectedPoV, + #[error("Remote responded with `NoSuchPoV`")] NoSuchPoV, diff --git a/node/network/availability-distribution/src/lib.rs b/node/network/availability-distribution/src/lib.rs index 3a59ccabfaff..18897822a218 100644 --- a/node/network/availability-distribution/src/lib.rs +++ b/node/network/availability-distribution/src/lib.rs @@ -60,8 +60,8 @@ const LOG_TARGET: &'static str = "parachain::availability-distribution"; pub struct AvailabilityDistributionSubsystem { /// Pointer to a keystore, which is required for determining this nodes validator index. keystore: SyncCryptoStorePtr, - /// Easy and efficient runtime access for this subsystem. - runtime: Runtime, + /// Easy and efficient runtime access for this subsystem. + runtime: Runtime, /// Prometheus metrics. metrics: Metrics, } @@ -87,7 +87,7 @@ impl AvailabilityDistributionSubsystem { /// Create a new instance of the availability distribution. 
pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self { - let runtime = Runtime::new(keystore.clone()); + let runtime = Runtime::new(keystore.clone()); Self { keystore, runtime, metrics } } @@ -97,7 +97,7 @@ impl AvailabilityDistributionSubsystem { Context: SubsystemContext + Sync + Send, { let mut requester = Requester::new(self.keystore.clone(), self.metrics.clone()).fuse(); - let mut pov_requester = PoVRequester::new(); + let mut pov_requester = PoVRequester::new(); loop { let action = { let mut subsystem_next = ctx.recv().fuse(); @@ -120,14 +120,14 @@ impl AvailabilityDistributionSubsystem { }; match message { FromOverseer::Signal(OverseerSignal::ActiveLeaves(update)) => { - log_error( - pov_requester.update_connected_validators(&mut ctx, &mut self.runtime, &update).await, - "PoVRequester::update_connected_validators" - ); - log_error( - requester.get_mut().update_fetching_heads(&mut ctx, update).await, - "Error in Requester::update_fetching_heads" - ); + log_error( + pov_requester.update_connected_validators(&mut ctx, &mut self.runtime, &update).await, + "PoVRequester::update_connected_validators" + ); + log_error( + requester.get_mut().update_fetching_heads(&mut ctx, update).await, + "Error in Requester::update_fetching_heads" + ); } FromOverseer::Signal(OverseerSignal::BlockFinalized(..)) => {} FromOverseer::Signal(OverseerSignal::Conclude) => { @@ -145,24 +145,26 @@ impl AvailabilityDistributionSubsystem { } FromOverseer::Communication { msg: AvailabilityDistributionMessage::FetchPoV{ - relay_parent, - from_validator, - candidate_hash, - tx, - }, + relay_parent, + from_validator, + candidate_hash, + pov_hash, + tx, + }, } => { - log_error( - pov_requester.fetch_pov( - &mut ctx, - &mut self.runtime, - relay_parent, - from_validator, - candidate_hash, - tx, - ).await, - "PoVRequester::fetch_pov" - ); - } + log_error( + pov_requester.fetch_pov( + &mut ctx, + &mut self.runtime, + relay_parent, + from_validator, + candidate_hash, + pov_hash, + tx, + ).await, + "PoVRequester::fetch_pov" + ); + } } } } diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index f07fb8be6112..16d6a90a3e24 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -90,8 +90,9 @@ impl PoVRequester { parent: Hash, from_validator: ValidatorIndex, candidate_hash: CandidateHash, + pov_hash: Hash, tx: oneshot::Sender - ) -> super::Result<()> + ) -> super::Result<()> where Context: SubsystemContext, { @@ -118,7 +119,7 @@ impl PoVRequester { ) )).await; - ctx.spawn("pov-fetcher", fetch_pov_job(pending_response.boxed(), tx).boxed()) + ctx.spawn("pov-fetcher", fetch_pov_job(pov_hash, pending_response.boxed(), tx).boxed()) .await .map_err(|e| Error::SpawnTask(e)) } @@ -126,20 +127,22 @@ impl PoVRequester { /// Future to be spawned for taking care of handling reception and sending of PoV. async fn fetch_pov_job( + pov_hash: Hash, pending_response: BoxFuture<'static, Result>, tx: oneshot::Sender, ) { log_error( - do_fetch_pov(pending_response, tx).await, + do_fetch_pov(pov_hash, pending_response, tx).await, "fetch_pov_job", ) } /// Do the actual work of waiting for the response. 
async fn do_fetch_pov( + pov_hash: Hash, pending_response: BoxFuture<'static, Result>, tx: oneshot::Sender, -) +) -> super::Result<()> { let response = pending_response.await.map_err(Error::FetchPoV)?; @@ -151,11 +154,15 @@ async fn do_fetch_pov( return Err(Error::NoSuchPoV) } }; - tx.send(pov).map_err(|_| Error::SendResponse) + if pov.hash() == pov_hash { + tx.send(pov).map_err(|_| Error::SendResponse) + } else { + Err(Error::UnexpectedPoV) + } } /// Get the session indeces for the given relay chain parents. -async fn get_activated_sessions(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator) +async fn get_activated_sessions(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator) -> super::Result> where Context: SubsystemContext, @@ -173,7 +180,7 @@ async fn connect_to_relevant_validators( runtime: &mut Runtime, parent: Hash, session: SessionIndex -) +) -> super::Result>> where Context: SubsystemContext, @@ -198,7 +205,7 @@ async fn determine_relevant_validators( runtime: &mut Runtime, parent: Hash, session: SessionIndex, -) +) -> super::Result>> where Context: SubsystemContext, diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index 30475007ba07..eb221a86ec95 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -280,6 +280,8 @@ pub enum AvailabilityDistributionMessage { from_validator: ValidatorIndex, /// Candidate hash to fetch the PoV for. candidate_hash: CandidateHash, + /// Expected hash of the PoV, a PoV not matching this hash will be rejected. + pov_hash: Hash, /// Sender for getting back the result of this fetch. /// /// The sender will be canceled if the fetching failed for some reason. From 89f0bf97a35ba4c97a0ab02679cb53d0706c86cf Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 25 Mar 2021 15:43:57 +0100 Subject: [PATCH 13/30] Implement retry of PoV fetching in backing. --- node/core/backing/src/lib.rs | 147 ++++++++++++++++++++++++++--------- 1 file changed, 112 insertions(+), 35 deletions(-) diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index c30313c4cd23..fe16914ebf0a 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -102,7 +102,7 @@ enum PoVData { FetchFromValidator { from_validator: ValidatorIndex, candidate_hash: CandidateHash, - pov_hash:Hash, + pov_hash: Hash, }, } @@ -111,6 +111,8 @@ enum ValidatedCandidateCommand { Second(BackgroundValidationResult), // We were instructed to validate the candidate. Attest(BackgroundValidationResult), + // We were not able to `Attest` because backing validator did not send us the PoV. + AttestNoPoV(CandidateHash), } impl std::fmt::Debug for ValidatedCandidateCommand { @@ -121,6 +123,8 @@ impl std::fmt::Debug for ValidatedCandidateCommand { write!(f, "Second({})", candidate_hash), ValidatedCandidateCommand::Attest(_) => write!(f, "Attest({})", candidate_hash), + ValidatedCandidateCommand::AttestNoPoV(_) => + write!(f, "Attest({})", candidate_hash), } } } @@ -132,6 +136,7 @@ impl ValidatedCandidateCommand { ValidatedCandidateCommand::Second(Err(ref candidate)) => candidate.hash(), ValidatedCandidateCommand::Attest(Ok((ref candidate, _, _))) => candidate.hash(), ValidatedCandidateCommand::Attest(Err(ref candidate)) => candidate.hash(), + ValidatedCandidateCommand::AttestNoPoV(candidate_hash) => candidate_hash, } } } @@ -152,6 +157,8 @@ struct CandidateBackingJob { issued_statements: HashSet, /// These candidates are undergoing validation in the background. 
awaiting_validation: HashSet, + /// Data needed for retrying in case of `ValidatedCandidateCommand::AttestNoPoV`. + fallbacks: HashMap)>, /// `Some(h)` if this job has already issued `Seconded` statement for some candidate with `h` hash. seconded: Option, /// The candidates that are includable, by hash. Each entry here indicates @@ -165,6 +172,23 @@ struct CandidateBackingJob { metrics: Metrics, } +/// In case a backing validator does not provide a PoV, we need to retry with other backing +/// validators. +/// +/// This is the data needed to accomplish this. Basically all the data needed for spawning a +/// validation job and a list of backing validators, we can try. +#[derive(Clone)] +struct AttestingData { + /// The candidate to attest. + candidate: CandidateReceipt, + /// Hash of the PoV we need to fetch. + pov_hash: Hash, + /// Validator we are currently trying to get the PoV from. + from_validator: ValidatorIndex, + /// Other backing validators we can try in case `from_validator` failed. + backing: Vec, +} + const fn group_quorum(n_validators: usize) -> usize { (n_validators / 2) + 1 } @@ -363,7 +387,7 @@ async fn request_pov( from_validator, candidate_hash, pov_hash, - tx, + tx, } ))).await?; @@ -431,13 +455,20 @@ async fn validate_and_make_available( pov_hash, } => { let _span = span.as_ref().map(|s| s.child("request-pov")); - request_pov( - &mut tx_from, - relay_parent, - from_validator, - candidate_hash, - pov_hash, - ).await? + match request_pov( + &mut tx_from, + relay_parent, + from_validator, + candidate_hash, + pov_hash, + ).await { + Err(Error::FetchPoV) => { + tx_command.send(ValidatedCandidateCommand::AttestNoPoV(candidate.hash())).await.map_err(Error::Mpsc)?; + return Ok(()) + } + Err(err) => return Err(err), + Ok(pov) => pov, + } } }; @@ -587,6 +618,24 @@ impl CandidateBackingJob { self.issued_statements.insert(candidate_hash); } } + ValidatedCandidateCommand::AttestNoPoV(candidate_hash) => { + if let Some((attesting, span)) = self.fallbacks.get_mut(&candidate_hash) { + if let Some(index) = attesting.backing.pop() { + attesting.from_validator = index; + // Ok, another try: + let c_span = span.as_ref().map(|s| s.child("try")); + let attesting = attesting.clone(); + self.kick_off_validation_work(attesting, c_span).await? + } + + } else { + tracing::warn!( + target: LOG_TARGET, + "AttestNoPoV was triggered without fallback being available." + ); + debug_assert!(false); + } + } } Ok(()) @@ -843,29 +892,23 @@ impl CandidateBackingJob { } /// Kick off validation work and distribute the result as a signed statement. - #[tracing::instrument(level = "trace", skip(self, pov, span), fields(subsystem = LOG_TARGET))] + #[tracing::instrument(level = "trace", skip(self, attesting, span), fields(subsystem = LOG_TARGET))] async fn kick_off_validation_work( &mut self, - summary: TableSummary, - pov: PoVData, + attesting: AttestingData, span: Option, ) -> Result<(), Error> { - let candidate_hash = summary.candidate; - + let candidate_hash = attesting.candidate.hash(); if self.issued_statements.contains(&candidate_hash) { return Ok(()) } - // We clone the commitments here because there are borrowck - // errors relating to this being a struct and methods borrowing the entirety of self - // and not just those things that the function uses. 
- let candidate = self.table.get_candidate(&candidate_hash).ok_or(Error::CandidateNotFound)?.to_plain(); - let descriptor = candidate.descriptor().clone(); + let descriptor = attesting.candidate.descriptor().clone(); tracing::debug!( target: LOG_TARGET, candidate_hash = ?candidate_hash, - candidate_receipt = ?candidate, + candidate_receipt = ?attesting.candidate, "Kicking off validation", ); @@ -881,10 +924,16 @@ impl CandidateBackingJob { return Ok(()); } + let pov = PoVData::FetchFromValidator { + from_validator: attesting.from_validator, + candidate_hash, + pov_hash: attesting.pov_hash, + }; + self.background_validate_and_make_available(BackgroundValidationParams { tx_from: self.tx_from.clone(), tx_command: self.background_validation_tx.clone(), - candidate, + candidate: attesting.candidate, relay_parent: self.parent, pov, validator_index: self.table_context.validator.as_ref().map(|v| v.index()), @@ -903,30 +952,57 @@ impl CandidateBackingJob { statement: SignedFullStatement, ) -> Result<(), Error> { if let Some(summary) = self.import_statement(&statement, parent_span).await? { - if let Statement::Seconded(receipt) = statement.payload() { - if Some(summary.group_id) == self.assignment { + if Some(summary.group_id) != self.assignment { + return Ok(()) + } + let (attesting, span) = match statement.payload() { + Statement::Seconded(receipt) => { + let candidate_hash = summary.candidate; + let span = self.get_unbacked_validation_child( root_span, summary.candidate, summary.group_id, ); - let pov_hash = receipt.descriptor.pov_hash; - let candidate_hash = summary.candidate; - let pov_data = PoVData::FetchFromValidator { + + let attesting = AttestingData { + candidate: self.table.get_candidate(&candidate_hash).ok_or(Error::CandidateNotFound)?.to_plain(), + pov_hash: receipt.descriptor.pov_hash, from_validator: statement.validator_index(), - candidate_hash, - pov_hash + backing: Vec::new(), }; + let child = span.as_ref().map(|s| s.child("try")); + self.fallbacks.insert(summary.candidate, (attesting.clone(), span)); + (attesting, child) + } + Statement::Valid(candidate_hash) => { + if let Some((attesting, span)) = self.fallbacks.get_mut(candidate_hash) { + + let our_index = self.table_context.validator.as_ref().map(|v| v.index()); + if our_index == Some(statement.validator_index()) { + return Ok(()) + } - self.kick_off_validation_work( - summary, - pov_data, - span, - ).await?; + if self.awaiting_validation.contains(candidate_hash) { + // Job already running: + attesting.backing.push(statement.validator_index()); + return Ok(()) + } else { + // No job, so start another try with current validator: + attesting.from_validator = statement.validator_index(); + (attesting.clone(), span.as_ref().map(|s| s.child("try"))) + } + } else { + return Ok(()) + } } - } - } + }; + self.kick_off_validation_work( + attesting, + span, + ).await?; + } Ok(()) } @@ -1159,6 +1235,7 @@ impl util::JobTrait for CandidateBackingJob { required_collator, issued_statements: HashSet::new(), awaiting_validation: HashSet::new(), + fallbacks: HashMap::new(), seconded: None, unbacked_candidates: HashMap::new(), backed: HashSet::new(), From ab75fea9285a3bb1922bfe5b11e9d3b2b52f0207 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 25 Mar 2021 15:51:37 +0100 Subject: [PATCH 14/30] Avoid pointless validation spawning. 
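
Once the `Attest` result for a candidate comes back, its entry in `fallbacks`
has served its purpose. Keeping it around would let a late `AttestNoPoV` from
an earlier PoV request spawn yet another validation job for a candidate we are
already done with, so drop the entry as soon as the result arrives.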
--- node/core/backing/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index fe16914ebf0a..bfe89590fbd2 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -609,6 +609,8 @@ impl CandidateBackingJob { } } ValidatedCandidateCommand::Attest(res) => { + // We are done - avoid new validation spawns: + self.fallbacks.remove(&candidate_hash); // sanity check. if !self.issued_statements.contains(&candidate_hash) { if res.is_ok() { From 3915a57c6c2842eb31c6fdbf66f0037e6b14e18b Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 25 Mar 2021 16:22:26 +0100 Subject: [PATCH 15/30] Add jaeger span to pov requesting. --- .../availability-distribution/src/pov_requester/mod.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 16d6a90a3e24..56ed78c30d68 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -19,6 +19,7 @@ use futures::{FutureExt, channel::{mpsc, oneshot}, future::BoxFuture}; use lru::LruCache; +use polkadot_subsystem::jaeger; use polkadot_node_network_protocol::{ PeerId, peer_set::PeerSet, request_response::{OutgoingRequest, Recipient, request::{RequestError, Requests}, @@ -119,7 +120,9 @@ impl PoVRequester { ) )).await; - ctx.spawn("pov-fetcher", fetch_pov_job(pov_hash, pending_response.boxed(), tx).boxed()) + let span = jaeger::Span::new(candidate_hash, "fetch-pov") + .with_validator_index(from_validator); + ctx.spawn("pov-fetcher", fetch_pov_job(pov_hash, pending_response.boxed(), span, tx).boxed()) .await .map_err(|e| Error::SpawnTask(e)) } @@ -129,10 +132,11 @@ impl PoVRequester { async fn fetch_pov_job( pov_hash: Hash, pending_response: BoxFuture<'static, Result>, + span: jaeger::Span, tx: oneshot::Sender, ) { log_error( - do_fetch_pov(pov_hash, pending_response, tx).await, + do_fetch_pov(pov_hash, pending_response, span, tx).await, "fetch_pov_job", ) } @@ -141,6 +145,7 @@ async fn fetch_pov_job( async fn do_fetch_pov( pov_hash: Hash, pending_response: BoxFuture<'static, Result>, + _span: jaeger::Span, tx: oneshot::Sender, ) -> super::Result<()> From fa6409e75dca0476869324a24d1fbf4efa3b2c0a Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 25 Mar 2021 17:01:24 +0100 Subject: [PATCH 16/30] Add back tracing. --- node/network/availability-distribution/src/requester/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/node/network/availability-distribution/src/requester/mod.rs b/node/network/availability-distribution/src/requester/mod.rs index 924305ccb33e..ce4dc05cb0ea 100644 --- a/node/network/availability-distribution/src/requester/mod.rs +++ b/node/network/availability-distribution/src/requester/mod.rs @@ -128,6 +128,11 @@ impl Requester { { for (leaf, _) in new_heads { let cores = query_occupied_cores(ctx, leaf).await?; + tracing::trace!( + target: LOG_TARGET, + occupied_cores = ?cores, + "Query occupied core" + ); self.add_cores(ctx, leaf, cores).await?; } Ok(()) From 8b9c2d4702481d71d4dfd4f295a515dd29820411 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Thu, 25 Mar 2021 17:14:24 +0100 Subject: [PATCH 17/30] Review remarks. 
--- .../network/availability-distribution/src/lib.rs | 2 +- .../availability-distribution/src/responder.rs | 2 +- .../availability-distribution/src/runtime.rs | 2 +- node/network/bridge/src/network.rs | 16 ++++++++-------- node/subsystem/src/messages.rs | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/node/network/availability-distribution/src/lib.rs b/node/network/availability-distribution/src/lib.rs index 18897822a218..566b591d7179 100644 --- a/node/network/availability-distribution/src/lib.rs +++ b/node/network/availability-distribution/src/lib.rs @@ -144,7 +144,7 @@ impl AvailabilityDistributionSubsystem { answer_pov_request_log(&mut ctx, req, &self.metrics).await } FromOverseer::Communication { - msg: AvailabilityDistributionMessage::FetchPoV{ + msg: AvailabilityDistributionMessage::FetchPoV { relay_parent, from_validator, candidate_hash, diff --git a/node/network/availability-distribution/src/responder.rs b/node/network/availability-distribution/src/responder.rs index 97e083532cdb..ceb9580faca2 100644 --- a/node/network/availability-distribution/src/responder.rs +++ b/node/network/availability-distribution/src/responder.rs @@ -35,7 +35,7 @@ pub async fn answer_pov_request_log( ctx: &mut Context, req: IncomingRequest, metrics: &Metrics, -) -> () +) where Context: SubsystemContext, { diff --git a/node/network/availability-distribution/src/runtime.rs b/node/network/availability-distribution/src/runtime.rs index c584f0bd7d77..c4bb77d4ee38 100644 --- a/node/network/availability-distribution/src/runtime.rs +++ b/node/network/availability-distribution/src/runtime.rs @@ -186,7 +186,7 @@ impl Runtime { /// Get our `ValidatorIndex`. /// /// Returns: None if we are not a validator. - async fn get_our_index(&self, validators: &Vec) -> Option { + async fn get_our_index(&self, validators: &[ValidatorId]) -> Option { for (i, v) in validators.iter().enumerate() { if CryptoStore::has_keys(&*self.keystore, &[(v.to_raw_vec(), ValidatorId::ID)]) .await diff --git a/node/network/bridge/src/network.rs b/node/network/bridge/src/network.rs index 22cc48b96960..04edb7c2c0e3 100644 --- a/node/network/bridge/src/network.rs +++ b/node/network/bridge/src/network.rs @@ -237,14 +237,14 @@ impl Network for Arc> { Recipient::Peer(peer_id) => Some(peer_id), Recipient::Authority(authority) => authority_discovery - .get_addresses_by_authority_id(authority) - .await - .and_then(|addrs| { - addrs - .into_iter() - .find_map(|addr| peer_id_from_multiaddr(&addr)) - }), - }; + .get_addresses_by_authority_id(authority) + .await + .and_then(|addrs| { + addrs + .into_iter() + .find_map(|addr| peer_id_from_multiaddr(&addr)) + }), + }; let peer_id = match peer_id { None => { diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index eb221a86ec95..fda382ce3a46 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -273,7 +273,7 @@ pub enum AvailabilityDistributionMessage { /// Instruct availability distribution to fetch a remote PoV. /// /// NOTE: The result of this fetch is not yet locally validated and could be bogus. - FetchPoV{ + FetchPoV { /// The relay parent giving the necessary context. relay_parent: Hash, /// Validator to fetch the PoV from. From 4af7d2e1ef7f353cf78607b40d3f4192c37db423 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Fri, 26 Mar 2021 00:13:54 +0100 Subject: [PATCH 18/30] Whitespace. 
--- node/core/backing/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index bfe89590fbd2..7ea77e065f8f 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -937,7 +937,7 @@ impl CandidateBackingJob { tx_command: self.background_validation_tx.clone(), candidate: attesting.candidate, relay_parent: self.parent, - pov, + pov, validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, From 5c098298b31b745effb104cb85663fb77b032521 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Fri, 26 Mar 2021 13:44:27 +0100 Subject: [PATCH 19/30] Whitespace again. --- node/network/availability-distribution/src/error.rs | 2 +- node/network/availability-distribution/src/runtime.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/node/network/availability-distribution/src/error.rs b/node/network/availability-distribution/src/error.rs index 3a6dc7f444e4..c38294542007 100644 --- a/node/network/availability-distribution/src/error.rs +++ b/node/network/availability-distribution/src/error.rs @@ -33,7 +33,7 @@ use crate::LOG_TARGET; pub enum Error { #[error("Response channel to obtain chunk failed")] QueryChunkResponseChannel(#[source] oneshot::Canceled), - + #[error("Response channel to obtain available data failed")] QueryAvailableDataResponseChannel(#[source] oneshot::Canceled), diff --git a/node/network/availability-distribution/src/runtime.rs b/node/network/availability-distribution/src/runtime.rs index c4bb77d4ee38..0cd02906bb72 100644 --- a/node/network/availability-distribution/src/runtime.rs +++ b/node/network/availability-distribution/src/runtime.rs @@ -50,7 +50,7 @@ pub struct Runtime { keystore: SyncCryptoStorePtr, } -/// SessionInfo with additional useful data for validator nodes. +/// SessionInfo with additional useful data for validator nodes. pub struct ExtendedSessionInfo { /// Actual session info as fetched from the runtime. pub session_info: SessionInfo, From ea9bde4a7291e419fe750d2780ad19974d547026 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 12:48:14 +0100 Subject: [PATCH 20/30] Cleanup + fix tests. --- node/core/backing/src/lib.rs | 68 +- .../src/tests/state.rs | 4 +- node/network/bridge/src/lib.rs | 37 +- node/network/pov-distribution/src/lib.rs | 995 ------------------ node/network/protocol/src/lib.rs | 19 +- 5 files changed, 55 insertions(+), 1068 deletions(-) delete mode 100644 node/network/pov-distribution/src/lib.rs diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index 7ea77e065f8f..b509881d4737 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -1671,15 +1671,6 @@ mod tests { } ); - assert_matches!( - virtual_overseer.recv().await, - AllMessages::PoVDistribution(PoVDistributionMessage::DistributePoV(hash, descriptor, pov_received)) => { - assert_eq!(test_state.relay_parent, hash); - assert_eq!(candidate.descriptor, descriptor); - assert_eq!(pov, *pov_received); - } - ); - virtual_overseer.send(FromOverseer::Signal( OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::stop_work(test_state.relay_parent))) ).await; @@ -1750,14 +1741,17 @@ mod tests { virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; // Sending a `Statement::Seconded` for our assignment will start - // validation process. The first thing requested is PoV from the - // `PoVDistribution`. + // validation process. 
The first thing requested is the PoV. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } ) if relay_parent == test_state.relay_parent => { - tx.send(Arc::new(pov.clone())).unwrap(); + tx.send(pov.clone()).unwrap(); } ); @@ -1903,10 +1897,14 @@ mod tests { // `PoVDistribution`. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } ) if relay_parent == test_state.relay_parent => { - tx.send(Arc::new(pov.clone())).unwrap(); + tx.send(pov.clone()).unwrap(); } ); @@ -2042,10 +2040,14 @@ mod tests { assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } ) if relay_parent == test_state.relay_parent => { - tx.send(Arc::new(pov.clone())).unwrap(); + tx.send(pov.clone()).unwrap(); } ); @@ -2317,11 +2319,14 @@ mod tests { // Subsystem requests PoV and requests validation. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) - ) => { - assert_eq!(relay_parent, test_state.relay_parent); - tx.send(Arc::new(pov.clone())).unwrap(); + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + tx.send(pov.clone()).unwrap(); } ); @@ -2437,11 +2442,14 @@ mod tests { // Subsystem requests PoV and requests validation. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) - ) => { - assert_eq!(relay_parent, test_state.relay_parent); - tx.send(Arc::new(pov.clone())).unwrap(); + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + tx.send(pov.clone()).unwrap(); } ); diff --git a/node/network/availability-distribution/src/tests/state.rs b/node/network/availability-distribution/src/tests/state.rs index a227cca03674..940412d4235f 100644 --- a/node/network/availability-distribution/src/tests/state.rs +++ b/node/network/availability-distribution/src/tests/state.rs @@ -161,6 +161,9 @@ impl TestState { /// This will simply advance through the simulated chain and examines whether the subsystem /// behaves as expected: It will succeed if all valid chunks of other backing groups get stored /// and no other. + /// + /// We try to be as agnostic about details as possible, how the subsystem achieves those goals + /// should not be a matter to this test suite. 
async fn run_inner(self, executor: TaskExecutor, virtual_overseer: TestSubsystemContextHandle) { // We skip genesis here (in reality ActiveLeavesUpdate can also skip a block: let updates = { @@ -252,7 +255,6 @@ impl TestState { } } _ => { - panic!("Unexpected message received: {:?}", msg); } } } diff --git a/node/network/bridge/src/lib.rs b/node/network/bridge/src/lib.rs index 1c5b21216521..7e33301d3301 100644 --- a/node/network/bridge/src/lib.rs +++ b/node/network/bridge/src/lib.rs @@ -688,7 +688,6 @@ mod tests { use polkadot_subsystem::messages::{ ApprovalDistributionMessage, BitfieldDistributionMessage, - PoVDistributionMessage, StatementDistributionMessage }; use polkadot_node_subsystem_test_helpers::{ @@ -889,13 +888,6 @@ mod tests { ) if e == event.focus().expect("could not focus message") ); - assert_matches!( - virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::NetworkBridgeUpdateV1(e) - ) if e == event.focus().expect("could not focus message") - ); - assert_matches!( virtual_overseer.recv().await, AllMessages::ApprovalDistribution( @@ -1146,13 +1138,12 @@ mod tests { ).await; } - let pov_distribution_message = protocol_v1::PoVDistributionMessage::Awaiting( - [0; 32].into(), - vec![[1; 32].into()], + let approval_distribution_message = protocol_v1::ApprovalDistributionMessage::Approvals( + Vec::new() ); - let message = protocol_v1::ValidationProtocol::PoVDistribution( - pov_distribution_message.clone(), + let message = protocol_v1::ValidationProtocol::ApprovalDistribution( + approval_distribution_message.clone(), ); network_handle.peer_message( @@ -1163,18 +1154,18 @@ mod tests { network_handle.disconnect_peer(peer.clone(), PeerSet::Validation).await; - // PoV distribution message comes first, and the message is only sent to that subsystem. + // Approval distribution message comes first, and the message is only sent to that subsystem. // then a disconnection event arises that is sent to all validation networking subsystems. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::NetworkBridgeUpdateV1( + AllMessages::ApprovalDistribution( + ApprovalDistributionMessage::NetworkBridgeUpdateV1( NetworkBridgeEvent::PeerMessage(p, m) ) ) => { assert_eq!(p, peer); - assert_eq!(m, pov_distribution_message); + assert_eq!(m, approval_distribution_message); } ); @@ -1535,13 +1526,12 @@ mod tests { // send a validation protocol message. { - let pov_distribution_message = protocol_v1::PoVDistributionMessage::Awaiting( - [0; 32].into(), - vec![[1; 32].into()], + let approval_distribution_message = protocol_v1::ApprovalDistributionMessage::Approvals( + Vec::new() ); - let message = protocol_v1::ValidationProtocol::PoVDistribution( - pov_distribution_message.clone(), + let message = protocol_v1::ValidationProtocol::ApprovalDistribution( + approval_distribution_message.clone(), ); virtual_overseer.send(FromOverseer::Communication { @@ -1596,7 +1586,7 @@ mod tests { fn spread_event_to_subsystems_is_up_to_date() { // Number of subsystems expected to be interested in a network event, // and hence the network event broadcasted to. 
- const EXPECTED_COUNT: usize = 4; + const EXPECTED_COUNT: usize = 3; let mut cnt = 0_usize; for msg in AllMessages::dispatch_iter(NetworkBridgeEvent::PeerDisconnected(PeerId::random())) { @@ -1612,7 +1602,6 @@ mod tests { AllMessages::BitfieldDistribution(_) => { cnt += 1; } AllMessages::BitfieldSigning(_) => unreachable!("Not interested in network events"), AllMessages::Provisioner(_) => unreachable!("Not interested in network events"), - AllMessages::PoVDistribution(_) => { cnt += 1; } AllMessages::RuntimeApi(_) => unreachable!("Not interested in network events"), AllMessages::AvailabilityStore(_) => unreachable!("Not interested in network events"), AllMessages::NetworkBridge(_) => unreachable!("Not interested in network events"), diff --git a/node/network/pov-distribution/src/lib.rs b/node/network/pov-distribution/src/lib.rs deleted file mode 100644 index bc8812b57278..000000000000 --- a/node/network/pov-distribution/src/lib.rs +++ /dev/null @@ -1,995 +0,0 @@ -// Copyright 2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! PoV Distribution Subsystem of Polkadot. -//! -//! This is a gossip implementation of code that is responsible for distributing PoVs -//! among validators. - -#![deny(unused_crate_dependencies)] -#![warn(missing_docs)] - -use polkadot_primitives::v1::{CandidateDescriptor, CompressedPoV, CoreIndex, CoreState, Hash, Id as ParaId, PoV, ValidatorId}; -use polkadot_subsystem::{ - ActiveLeavesUpdate, OverseerSignal, SubsystemContext, SubsystemResult, SubsystemError, Subsystem, - FromOverseer, SpawnedSubsystem, - jaeger, - messages::{ - PoVDistributionMessage, AllMessages, NetworkBridgeMessage, NetworkBridgeEvent, - }, -}; -use polkadot_node_subsystem_util::{ - validator_discovery, - request_validators_ctx, - request_validator_groups_ctx, - request_availability_cores_ctx, - metrics::{self, prometheus}, -}; -use polkadot_node_network_protocol::{ - peer_set::PeerSet, v1 as protocol_v1, PeerId, OurView, UnifiedReputationChange as Rep, -}; - -use futures::prelude::*; -use futures::channel::oneshot; - -use std::collections::{hash_map::{Entry, HashMap}, HashSet}; -use std::sync::Arc; - -mod error; - -#[cfg(test)] -mod tests; - -const COST_APPARENT_FLOOD: Rep = Rep::CostMajor("Peer appears to be flooding us with PoV requests"); -const COST_UNEXPECTED_POV: Rep = Rep::CostMajor("Peer sent us an unexpected PoV"); -const COST_AWAITED_NOT_IN_VIEW: Rep - = Rep::CostMinor("Peer claims to be awaiting something outside of its view"); - -const BENEFIT_FRESH_POV: Rep = Rep::BenefitMinorFirst("Peer supplied us with an awaited PoV"); -const BENEFIT_LATE_POV: Rep = Rep::BenefitMinor("Peer supplied us with an awaited PoV, \ - but was not the first to do so"); - -const LOG_TARGET: &str = "parachain::pov-distribution"; - -/// The PoV Distribution Subsystem. 
-pub struct PoVDistribution { - // Prometheus metrics - metrics: Metrics, -} - -impl Subsystem for PoVDistribution - where C: SubsystemContext -{ - fn start(self, ctx: C) -> SpawnedSubsystem { - // Swallow error because failure is fatal to the node and we log with more precision - // within `run`. - let future = self.run(ctx) - .map_err(|e| SubsystemError::with_origin("pov-distribution", e)) - .boxed(); - SpawnedSubsystem { - name: "pov-distribution-subsystem", - future, - } - } -} - -#[derive(Default)] -struct State { - /// A state of things going on on a per-relay-parent basis. - relay_parent_state: HashMap, - - /// Info on peers. - peer_state: HashMap, - - /// Our own view. - our_view: OurView, - - /// Connect to relevant groups of validators at different relay parents. - connection_requests: validator_discovery::ConnectionRequests, - - /// Metrics. - metrics: Metrics, -} - -struct BlockBasedState { - known: HashMap, CompressedPoV)>, - - /// All the PoVs we are or were fetching, coupled with channels expecting the data. - /// - /// This may be an empty list, which indicates that we were once awaiting this PoV but have - /// received it already. - fetching: HashMap>>>, - - n_validators: usize, -} - -#[derive(Default)] -struct PeerState { - /// A set of awaited PoV-hashes for each relay-parent in the peer's view. - awaited: HashMap>, -} - -fn awaiting_message(relay_parent: Hash, awaiting: Vec) - -> protocol_v1::ValidationProtocol -{ - protocol_v1::ValidationProtocol::PoVDistribution( - protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, awaiting) - ) -} - -fn send_pov_message( - relay_parent: Hash, - pov_hash: Hash, - pov: &CompressedPoV, -) -> protocol_v1::ValidationProtocol { - protocol_v1::ValidationProtocol::PoVDistribution( - protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov.clone()) - ) -} - -/// Handles the signal. If successful, returns `true` if the subsystem should conclude, -/// `false` otherwise. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_signal( - state: &mut State, - ctx: &mut impl SubsystemContext, - signal: OverseerSignal, -) -> SubsystemResult { - match signal { - OverseerSignal::Conclude => Ok(true), - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated }) => { - let _timer = state.metrics.time_handle_signal(); - - for (relay_parent, span) in activated { - let _span = span.child("pov-dist") - .with_stage(jaeger::Stage::PoVDistribution); - - match request_validators_ctx(relay_parent, ctx).await { - Ok(vals_rx) => { - let n_validators = match vals_rx.await? { - Ok(v) => v.len(), - Err(e) => { - tracing::warn!( - target: LOG_TARGET, - err = ?e, - "Error fetching validators from runtime API for active leaf", - ); - - // Not adding bookkeeping here might make us behave funny, but we - // shouldn't take down the node on spurious runtime API errors. - // - // and this is "behave funny" as in be bad at our job, but not in any - // slashable or security-related way. - continue; - } - }; - - state.relay_parent_state.insert(relay_parent, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators, - }); - } - Err(e) => { - // continue here also as above. 
- tracing::warn!( - target: LOG_TARGET, - err = ?e, - "Error fetching validators from runtime API for active leaf", - ); - } - } - } - - for relay_parent in deactivated { - state.connection_requests.remove_all(&relay_parent); - state.relay_parent_state.remove(&relay_parent); - } - - Ok(false) - } - OverseerSignal::BlockFinalized(..) => Ok(false), - } -} - -/// Notify peers that we are awaiting a given PoV hash. -/// -/// This only notifies peers who have the relay parent in their view. -#[tracing::instrument(level = "trace", skip(peers, ctx), fields(subsystem = LOG_TARGET))] -async fn notify_all_we_are_awaiting( - peers: &mut HashMap, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - pov_hash: Hash, -) { - // We use `awaited` as a proxy for which heads are in the peer's view. - let peers_to_send: Vec<_> = peers.iter() - .filter_map(|(peer, state)| if state.awaited.contains_key(&relay_parent) { - Some(peer.clone()) - } else { - None - }) - .collect(); - - if peers_to_send.is_empty() { - return; - } - - let payload = awaiting_message(relay_parent, vec![pov_hash]); - - tracing::trace!( - target: LOG_TARGET, - peers = ?peers_to_send, - ?relay_parent, - ?pov_hash, - "Sending awaiting message", - ); - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - peers_to_send, - payload, - ))).await; -} - -/// Notify one peer about everything we're awaiting at a given relay-parent. -#[tracing::instrument(level = "trace", skip(ctx, relay_parent_state), fields(subsystem = LOG_TARGET))] -async fn notify_one_we_are_awaiting_many( - peer: &PeerId, - ctx: &mut impl SubsystemContext, - relay_parent_state: &HashMap, - relay_parent: Hash, -) { - let awaiting_hashes = relay_parent_state.get(&relay_parent).into_iter().flat_map(|s| { - // Send the peer everything we are fetching at this relay-parent - s.fetching.iter() - .filter(|(_, senders)| !senders.is_empty()) // that has not been completed already. - .map(|(pov_hash, _)| *pov_hash) - }).collect::>(); - - if awaiting_hashes.is_empty() { - return; - } - - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?awaiting_hashes, - "Sending awaiting message", - ); - let payload = awaiting_message(relay_parent, awaiting_hashes); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - vec![peer.clone()], - payload, - ))).await; -} - -/// Distribute a PoV to peers who are awaiting it. -#[tracing::instrument(level = "trace", skip(peers, ctx, metrics, pov), fields(subsystem = LOG_TARGET))] -async fn distribute_to_awaiting( - peers: &mut HashMap, - ctx: &mut impl SubsystemContext, - metrics: &Metrics, - relay_parent: Hash, - pov_hash: Hash, - pov: &CompressedPoV, -) { - // Send to all peers who are awaiting the PoV and have that relay-parent in their view. - // - // Also removes it from their awaiting set. 
- let peers_to_send: Vec<_> = peers.iter_mut() - .filter_map(|(peer, state)| state.awaited.get_mut(&relay_parent).and_then(|awaited| { - if awaited.remove(&pov_hash) { - Some(peer.clone()) - } else { - None - } - })) - .collect(); - - tracing::trace!( - target: LOG_TARGET, - peers = ?peers_to_send, - ?relay_parent, - ?pov_hash, - "Sending PoV message", - ); - if peers_to_send.is_empty() { return; } - - let payload = send_pov_message(relay_parent, pov_hash, pov); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - peers_to_send, - payload, - ))).await; - - metrics.on_pov_distributed(); -} - -/// Connect to relevant validators in case we are not already. -async fn connect_to_relevant_validators( - connection_requests: &mut validator_discovery::ConnectionRequests, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: &CandidateDescriptor, -) { - let para_id = descriptor.para_id; - if let Ok(Some(relevant_validators)) = - determine_relevant_validators(ctx, relay_parent, para_id).await - { - // We only need one connection request per (relay_parent, para_id) - // so here we take this shortcut to avoid calling `connect_to_validators` - // more than once. - if !connection_requests.contains_request(&relay_parent, para_id) { - tracing::debug!( - target: LOG_TARGET, - validators=?relevant_validators, - ?relay_parent, - "connecting to validators" - ); - match validator_discovery::connect_to_validators( - ctx, - relay_parent, - relevant_validators, - PeerSet::Validation, - ).await { - Ok(new_connection_request) => { - connection_requests.put(relay_parent, para_id, new_connection_request); - } - Err(e) => { - tracing::debug!( - target: LOG_TARGET, - "Failed to create a validator connection request {:?}", - e, - ); - } - } - } - } -} - -/// Get the Id of the Core that is assigned to the para being collated on if any -/// and the total number of cores. -async fn determine_core( - ctx: &mut impl SubsystemContext, - para_id: ParaId, - relay_parent: Hash, -) -> error::Result> { - let cores = request_availability_cores_ctx(relay_parent, ctx).await?.await??; - - for (idx, core) in cores.iter().enumerate() { - if let CoreState::Scheduled(occupied) = core { - if occupied.para_id == para_id { - return Ok(Some(((idx as u32).into(), cores.len()))); - } - } - } - - Ok(None) -} - -/// Figure out a group of validators assigned to a given `ParaId`. -async fn determine_validators_for_core( - ctx: &mut impl SubsystemContext, - core_index: CoreIndex, - num_cores: usize, - relay_parent: Hash, -) -> error::Result>> { - let groups = request_validator_groups_ctx(relay_parent, ctx).await?.await??; - - let group_index = groups.1.group_for_core(core_index, num_cores); - - let connect_to_validators = match groups.0.get(group_index.0 as usize) { - Some(group) => group.clone(), - None => return Ok(None), - }; - - let validators = request_validators_ctx(relay_parent, ctx).await?.await??; - - let validators = connect_to_validators - .into_iter() - .map(|idx| validators[idx.0 as usize].clone()) - .collect(); - - Ok(Some(validators)) -} - -async fn determine_relevant_validators( - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - para_id: ParaId, -) -> error::Result>> { - // Determine which core the para_id is assigned to. - let (core, num_cores) = match determine_core(ctx, para_id, relay_parent).await? 
{ - Some(core) => core, - None => { - tracing::warn!( - target: LOG_TARGET, - "Looks like no core is assigned to {:?} at {:?}", - para_id, - relay_parent, - ); - - return Ok(None); - } - }; - - determine_validators_for_core(ctx, core, num_cores, relay_parent).await -} - -/// Handles a `FetchPoV` message. -#[tracing::instrument(level = "trace", skip(ctx, state, response_sender), fields(subsystem = LOG_TARGET))] -async fn handle_fetch( - state: &mut State, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: CandidateDescriptor, - response_sender: oneshot::Sender>, -) { - let _timer = state.metrics.time_handle_fetch(); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - Some(s) => s, - None => return, - }; - - if let Some((pov, _)) = relay_parent_state.known.get(&descriptor.pov_hash) { - let _ = response_sender.send(pov.clone()); - return; - } - - { - match relay_parent_state.fetching.entry(descriptor.pov_hash) { - Entry::Occupied(mut e) => { - // we are already awaiting this PoV if there is an entry. - e.get_mut().push(response_sender); - return; - } - Entry::Vacant(e) => { - connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await; - e.insert(vec![response_sender]); - } - } - } - - if relay_parent_state.fetching.len() > 2 * relay_parent_state.n_validators { - tracing::warn!( - target = LOG_TARGET, - relay_parent_state.fetching.len = relay_parent_state.fetching.len(), - "other subsystems have requested PoV distribution to fetch more PoVs than reasonably expected", - ); - return; - } - - // Issue an `Awaiting` message to all peers with this in their view. - notify_all_we_are_awaiting( - &mut state.peer_state, - ctx, - relay_parent, - descriptor.pov_hash - ).await -} - -/// Handles a `DistributePoV` message. -#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))] -async fn handle_distribute( - state: &mut State, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: CandidateDescriptor, - pov: Arc, -) { - let _timer = state.metrics.time_handle_distribute(); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - Some(s) => s, - None => return, - }; - - connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await; - - if let Some(our_awaited) = relay_parent_state.fetching.get_mut(&descriptor.pov_hash) { - // Drain all the senders, but keep the entry in the map around intentionally. - // - // It signals that we were at one point awaiting this, so we will be able to tell - // why peers are sending it to us. - for response_sender in our_awaited.drain(..) { - let _ = response_sender.send(pov.clone()); - } - } - - let encoded_pov = match CompressedPoV::compress(&*pov) { - Ok(pov) => pov, - Err(error) => { - tracing::debug!( - target: LOG_TARGET, - error = ?error, - "Failed to create `CompressedPov`." - ); - return - } - }; - - distribute_to_awaiting( - &mut state.peer_state, - ctx, - &state.metrics, - relay_parent, - descriptor.pov_hash, - &encoded_pov, - ).await; - - relay_parent_state.known.insert(descriptor.pov_hash, (pov, encoded_pov)); -} - -/// Report a reputation change for a peer. 
-#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))] -async fn report_peer( - ctx: &mut impl SubsystemContext, - peer: PeerId, - rep: Rep, -) { - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer(peer, rep))).await -} - -/// Handle a notification from a peer that they are awaiting some PoVs. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_awaiting( - state: &mut State, - ctx: &mut impl SubsystemContext, - peer: PeerId, - relay_parent: Hash, - pov_hashes: Vec, -) { - if !state.our_view.contains(&relay_parent) { - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hashes, - "Received awaiting message for unknown block", - ); - report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await; - return; - } - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hashes, - "Received awaiting message", - ); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - None => { - tracing::warn!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - "PoV Distribution relay parent state out-of-sync with our view" - ); - return; - } - Some(s) => s, - }; - - let peer_awaiting = match - state.peer_state.get_mut(&peer).and_then(|s| s.awaited.get_mut(&relay_parent)) - { - None => { - report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await; - return; - } - Some(a) => a, - }; - - let will_be_awaited = peer_awaiting.len() + pov_hashes.len(); - if will_be_awaited <= 2 * relay_parent_state.n_validators { - for pov_hash in pov_hashes { - // For all requested PoV hashes, if we have it, we complete the request immediately. - // Otherwise, we note that the peer is awaiting the PoV. - if let Some((_, ref pov)) = relay_parent_state.known.get(&pov_hash) { - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Sending awaited PoV message", - ); - let payload = send_pov_message(relay_parent, pov_hash, pov); - - ctx.send_message(AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(vec![peer.clone()], payload) - )).await; - } else { - peer_awaiting.insert(pov_hash); - } - } - } else { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - "Too many PoV requests", - ); - report_peer(ctx, peer, COST_APPARENT_FLOOD).await; - } -} - -/// Handle an incoming PoV from our peer. Reports them if unexpected, rewards them if not. -/// -/// Completes any requests awaiting that PoV. -#[tracing::instrument(level = "trace", skip(ctx, state, encoded_pov), fields(subsystem = LOG_TARGET))] -async fn handle_incoming_pov( - state: &mut State, - ctx: &mut impl SubsystemContext, - peer: PeerId, - relay_parent: Hash, - pov_hash: Hash, - encoded_pov: CompressedPoV, -) { - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - None => { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Unexpected PoV", - ); - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - }, - Some(r) => r, - }; - - let pov = match encoded_pov.decompress() { - Ok(pov) => pov, - Err(error) => { - tracing::debug!( - target: LOG_TARGET, - error = ?error, - "Could not extract PoV", - ); - return; - } - }; - - let pov = { - // Do validity checks and complete all senders awaiting this PoV. 
- let fetching = match relay_parent_state.fetching.get_mut(&pov_hash) { - None => { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Unexpected PoV", - ); - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - } - Some(f) => f, - }; - - let hash = pov.hash(); - if hash != pov_hash { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - ?hash, - "Mismatched PoV", - ); - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - } - - let pov = Arc::new(pov); - - if fetching.is_empty() { - // fetching is empty whenever we were awaiting something and - // it was completed afterwards. - report_peer(ctx, peer.clone(), BENEFIT_LATE_POV).await; - } else { - // fetching is non-empty when the peer just provided us with data we needed. - report_peer(ctx, peer.clone(), BENEFIT_FRESH_POV).await; - } - - for response_sender in fetching.drain(..) { - let _ = response_sender.send(pov.clone()); - } - - pov - }; - - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Received PoV", - ); - // make sure we don't consider this peer as awaiting that PoV anymore. - if let Some(peer_state) = state.peer_state.get_mut(&peer) { - peer_state.awaited.remove(&pov_hash); - } - - // distribute the PoV to all other peers who are awaiting it. - distribute_to_awaiting( - &mut state.peer_state, - ctx, - &state.metrics, - relay_parent, - pov_hash, - &encoded_pov, - ).await; - - relay_parent_state.known.insert(pov_hash, (pov, encoded_pov)); -} - -/// Handles a newly or already connected validator in the context of some relay leaf. -fn handle_validator_connected(state: &mut State, peer_id: PeerId) { - state.peer_state.entry(peer_id).or_default(); -} - -/// Handles a network bridge update. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_network_update( - state: &mut State, - ctx: &mut impl SubsystemContext, - update: NetworkBridgeEvent, -) { - let _timer = state.metrics.time_handle_network_update(); - - match update { - NetworkBridgeEvent::PeerConnected(peer, role) => { - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?role, - "Peer connected", - ); - handle_validator_connected(state, peer); - } - NetworkBridgeEvent::PeerDisconnected(peer) => { - tracing::trace!( - target: LOG_TARGET, - ?peer, - "Peer disconnected", - ); - state.peer_state.remove(&peer); - } - NetworkBridgeEvent::PeerViewChange(peer_id, view) => { - tracing::trace!( - target: LOG_TARGET, - ?peer_id, - ?view, - "Peer view change", - ); - if let Some(peer_state) = state.peer_state.get_mut(&peer_id) { - // prune anything not in the new view. - peer_state.awaited.retain(|relay_parent, _| view.contains(&relay_parent)); - - // introduce things from the new view. - for relay_parent in view.iter() { - if let Entry::Vacant(entry) = peer_state.awaited.entry(*relay_parent) { - entry.insert(HashSet::new()); - - // Notify the peer about everything we're awaiting at the new relay-parent. 
- notify_one_we_are_awaiting_many( - &peer_id, - ctx, - &state.relay_parent_state, - *relay_parent, - ).await; - } - } - } - - } - NetworkBridgeEvent::PeerMessage(peer, message) => { - match message { - protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, pov_hashes) - => handle_awaiting( - state, - ctx, - peer, - relay_parent, - pov_hashes, - ).await, - protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov) - => handle_incoming_pov( - state, - ctx, - peer, - relay_parent, - pov_hash, - pov, - ).await, - } - } - NetworkBridgeEvent::OurViewChange(view) => { - tracing::trace!( - target: LOG_TARGET, - "Own view change", - ); - state.our_view = view; - } - } -} - -impl PoVDistribution { - /// Create a new instance of `PovDistribution`. - pub fn new(metrics: Metrics) -> Self { - Self { metrics } - } - - #[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))] - async fn run( - self, - ctx: impl SubsystemContext, - ) -> SubsystemResult<()> { - self.run_with_state(ctx, State::default()).await - } - - async fn run_with_state( - self, - mut ctx: impl SubsystemContext, - mut state: State, - ) -> SubsystemResult<()> { - state.metrics = self.metrics; - - loop { - // `select_biased` is used since receiving connection notifications and - // peer view update messages may be racy and we want connection notifications - // first. - futures::select_biased! { - v = state.connection_requests.next().fuse() => handle_validator_connected(&mut state, v.peer_id), - v = ctx.recv().fuse() => { - match v? { - FromOverseer::Signal(signal) => if handle_signal( - &mut state, - &mut ctx, - signal, - ).await? { - return Ok(()); - } - FromOverseer::Communication { msg } => match msg { - PoVDistributionMessage::FetchPoV(relay_parent, descriptor, response_sender) => - handle_fetch( - &mut state, - &mut ctx, - relay_parent, - descriptor, - response_sender, - ).await, - PoVDistributionMessage::DistributePoV(relay_parent, descriptor, pov) => - handle_distribute( - &mut state, - &mut ctx, - relay_parent, - descriptor, - pov, - ).await, - PoVDistributionMessage::NetworkBridgeUpdateV1(event) => - handle_network_update( - &mut state, - &mut ctx, - event, - ).await, - } - } - } - } - } - } -} - - - -#[derive(Clone)] -struct MetricsInner { - povs_distributed: prometheus::Counter, - handle_signal: prometheus::Histogram, - handle_fetch: prometheus::Histogram, - handle_distribute: prometheus::Histogram, - handle_network_update: prometheus::Histogram, -} - -/// Availability Distribution metrics. -#[derive(Default, Clone)] -pub struct Metrics(Option); - -impl Metrics { - fn on_pov_distributed(&self) { - if let Some(metrics) = &self.0 { - metrics.povs_distributed.inc(); - } - } - - /// Provide a timer for `handle_signal` which observes on drop. - fn time_handle_signal(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_signal.start_timer()) - } - - /// Provide a timer for `handle_fetch` which observes on drop. - fn time_handle_fetch(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_fetch.start_timer()) - } - - /// Provide a timer for `handle_distribute` which observes on drop. - fn time_handle_distribute(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_distribute.start_timer()) - } - - /// Provide a timer for `handle_network_update` which observes on drop. 
- fn time_handle_network_update(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_network_update.start_timer()) - } -} - -impl metrics::Metrics for Metrics { - fn try_register(registry: &prometheus::Registry) -> std::result::Result { - let metrics = MetricsInner { - povs_distributed: prometheus::register( - prometheus::Counter::new( - "parachain_povs_distributed_total", - "Number of PoVs distributed to other peers." - )?, - registry, - )?, - handle_signal: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_signal", - "Time spent within `pov_distribution::handle_signal`", - ) - )?, - registry, - )?, - handle_fetch: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_fetch", - "Time spent within `pov_distribution::handle_fetch`", - ) - )?, - registry, - )?, - handle_distribute: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_distribute", - "Time spent within `pov_distribution::handle_distribute`", - ) - )?, - registry, - )?, - handle_network_update: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_network_update", - "Time spent within `pov_distribution::handle_network_update`", - ) - )?, - registry, - )?, - }; - Ok(Metrics(Some(metrics))) - } -} diff --git a/node/network/protocol/src/lib.rs b/node/network/protocol/src/lib.rs index ee5d3398e6d8..211f2dcf37f3 100644 --- a/node/network/protocol/src/lib.rs +++ b/node/network/protocol/src/lib.rs @@ -289,7 +289,7 @@ pub mod v1 { use std::convert::TryFrom; use polkadot_primitives::v1::{ - CandidateIndex, CollatorId, CompressedPoV, Hash, Id as ParaId, SignedAvailabilityBitfield, + CandidateIndex, CollatorId, Hash, Id as ParaId, SignedAvailabilityBitfield, CollatorSignature, }; use polkadot_node_primitives::{ @@ -305,19 +305,6 @@ pub mod v1 { Bitfield(Hash, SignedAvailabilityBitfield), } - /// Network messages used by the PoV distribution subsystem. - #[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)] - pub enum PoVDistributionMessage { - /// Notification that we are awaiting the given PoVs (by hash) against a - /// specific relay-parent hash. - #[codec(index = 0)] - Awaiting(Hash, Vec), - /// Notification of an awaited PoV, in a given relay-parent context. - /// (relay_parent, pov_hash, compressed_pov) - #[codec(index = 1)] - SendPoV(Hash, Hash, CompressedPoV), - } - /// Network messages used by the statement distribution subsystem. 
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)] pub enum StatementDistributionMessage { @@ -361,9 +348,6 @@ pub mod v1 { /// Bitfield distribution messages #[codec(index = 1)] BitfieldDistribution(BitfieldDistributionMessage), - /// PoV Distribution messages - #[codec(index = 2)] - PoVDistribution(PoVDistributionMessage), /// Statement distribution messages #[codec(index = 3)] StatementDistribution(StatementDistributionMessage), @@ -373,7 +357,6 @@ pub mod v1 { } impl_try_from!(ValidationProtocol, BitfieldDistribution, BitfieldDistributionMessage); - impl_try_from!(ValidationProtocol, PoVDistribution, PoVDistributionMessage); impl_try_from!(ValidationProtocol, StatementDistribution, StatementDistributionMessage); impl_try_from!(ValidationProtocol, ApprovalDistribution, ApprovalDistributionMessage); From 4207eafcde3672c3b385a9f8c01f0793ea597b6e Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 12:48:29 +0100 Subject: [PATCH 21/30] Log to log target in overseer. --- node/overseer/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node/overseer/src/lib.rs b/node/overseer/src/lib.rs index fdef15d5cf0c..ee521beb9ed4 100644 --- a/node/overseer/src/lib.rs +++ b/node/overseer/src/lib.rs @@ -2022,9 +2022,9 @@ fn spawn( let fut = Box::pin(async move { if let Err(e) = future.await { - tracing::error!(subsystem=name, err = ?e, "subsystem exited with error"); + tracing::error!(target: LOG_TARGET, subsystem=name, err = ?e, "subsystem exited with error"); } else { - tracing::debug!(subsystem=name, "subsystem exited without an error"); + tracing::debug!(target: LOG_TARGET, subsystem=name, "subsystem exited without an error"); } let _ = tx.send(()); }); From 36910619c8218389409a9479e6d592abf691b3ab Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 12:51:09 +0100 Subject: [PATCH 22/30] Fix more tests. --- node/overseer/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/overseer/src/lib.rs b/node/overseer/src/lib.rs index ee521beb9ed4..921b3ef2d202 100644 --- a/node/overseer/src/lib.rs +++ b/node/overseer/src/lib.rs @@ -2864,7 +2864,7 @@ mod tests { select! { res = overseer_fut => { - const NUM_SUBSYSTEMS: usize = 19; + const NUM_SUBSYSTEMS: usize = 18; assert_eq!(stop_signals_received.load(atomic::Ordering::SeqCst), NUM_SUBSYSTEMS); assert_eq!(signals_received.load(atomic::Ordering::SeqCst), NUM_SUBSYSTEMS); From b1a201a84d7f3921c406f644692d2af6bd8d6e57 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 14:26:03 +0100 Subject: [PATCH 23/30] Don't fail if group cannot be found. --- .../availability-distribution/src/runtime.rs | 24 ++++++++++------- .../src/session_cache.rs | 26 ++++++++++++------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/node/network/availability-distribution/src/runtime.rs b/node/network/availability-distribution/src/runtime.rs index 0cd02906bb72..ebb1402deabd 100644 --- a/node/network/availability-distribution/src/runtime.rs +++ b/node/network/availability-distribution/src/runtime.rs @@ -170,17 +170,21 @@ impl Runtime { None } }) - }) - .expect("Every validator should be in a validator group. 
qed."); - - let info = ValidatorInfo { - our_index, - our_group, - }; - Ok(Some(info)) - } else { - Ok(None) + } + ); + debug_assert!( + our_group.is_some() || session_info.validator_groups.is_empty(), + "Groups are initialized but validator could not be found in any" + ); + if let Some(our_group) = our_group { + let info = ValidatorInfo { + our_index, + our_group, + }; + return Ok(Some(info)) + } } + return Ok(None) } /// Get our `ValidatorIndex`. diff --git a/node/network/availability-distribution/src/session_cache.rs b/node/network/availability-distribution/src/session_cache.rs index 418f79f49ebe..9df1b90ae4d5 100644 --- a/node/network/availability-distribution/src/session_cache.rs +++ b/node/network/availability-distribution/src/session_cache.rs @@ -244,8 +244,12 @@ impl SessionCache { None } }) - }) - .expect("Every validator should be in a validator group. qed."); + } + ); + debug_assert!( + our_group.is_some() || validator_groups.is_empty(), + "Groups are initialized but validator could not be found in any" + ); // Shuffle validators in groups: let mut rng = thread_rng(); @@ -267,15 +271,17 @@ impl SessionCache { }) .collect(); - let info = SessionInfo { - validator_groups, - our_index, - session_index, - our_group, - }; - return Ok(Some(info)); + if let Some(our_group) = our_group { + let info = SessionInfo { + validator_groups, + our_index, + session_index, + our_group, + }; + return Ok(Some(info)) + } } - return Ok(None); + return Ok(None) } /// Get our `ValidatorIndex`. From 298fe9d695f4a0fc251615a394da8a010cf6b9ff Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 21:07:44 +0100 Subject: [PATCH 24/30] Simple test for PoV fetcher. --- .../src/pov_requester/mod.rs | 99 +++++++++++++++++++ .../src/tests/mock.rs | 23 ++++- .../src/tests/state.rs | 20 +--- 3 files changed, 122 insertions(+), 20 deletions(-) diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 56ed78c30d68..7919731d03e8 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -230,3 +230,102 @@ where Ok(None) } } + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use futures::{executor, future}; + + use parity_scale_codec::Encode; + use sp_core::testing::TaskExecutor; + + use polkadot_primitives::v1::{BlockData, CandidateHash, CompressedPoV, Hash, ValidatorIndex}; + use polkadot_subsystem_testhelpers as test_helpers; + use polkadot_subsystem::messages::{AvailabilityDistributionMessage, RuntimeApiMessage, RuntimeApiRequest}; + + use super::*; + use crate::LOG_TARGET; + use crate::tests::mock::{make_session_info, make_ferdie_keystore}; + + #[test] + fn rejects_invalid_pov() { + sp_tracing::try_init_simple(); + let pov = PoV { + block_data: BlockData(vec![1,2,3,4,5,6]), + }; + test_run(Hash::default(), pov); + } + + #[test] + fn accepts_valid_pov() { + sp_tracing::try_init_simple(); + let pov = PoV { + block_data: BlockData(vec![1,2,3,4,5,6]), + }; + test_run(pov.hash(), pov); + } + + fn test_run(pov_hash: Hash, pov: PoV) { + let requester = PoVRequester::new(); + let pool = TaskExecutor::new(); + let (mut context, mut virtual_overseer) = + test_helpers::make_subsystem_context::(pool.clone()); + let keystore = make_ferdie_keystore(); + let mut runtime = crate::runtime::Runtime::new(keystore); + + let (tx, rx) = oneshot::channel(); + let testee = async { + requester.fetch_pov( + &mut context, + &mut 
runtime, + Hash::default(), + ValidatorIndex(0), + CandidateHash::default(), + pov_hash, + tx, + ).await.expect("Should succeed"); + }; + + let tester = async move { + loop { + match virtual_overseer.recv().await { + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionIndexForChild(tx) + ) + ) => { + tx.send(Ok(0)).unwrap(); + } + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionInfo(_, tx) + ) + ) => { + tx.send(Ok(Some(make_session_info()))).unwrap(); + } + AllMessages::NetworkBridge(NetworkBridgeMessage::SendRequests(mut reqs, _)) => { + let req = assert_matches!( + reqs.pop(), + Some(Requests::PoVFetching(outgoing)) => {outgoing} + ); + req.pending_response.send(Ok(PoVFetchingResponse::PoV( + CompressedPoV::compress(&pov).unwrap()).encode() + )).unwrap(); + break + }, + msg => tracing::debug!(target: LOG_TARGET, msg = ?msg, "Received msg"), + } + } + if pov.hash() == pov_hash { + assert_eq!(rx.await, Ok(pov)); + } else { + assert_eq!(rx.await, Err(oneshot::Canceled)); + } + }; + futures::pin_mut!(testee); + futures::pin_mut!(tester); + executor::block_on(future::join(testee, tester)); + } +} diff --git a/node/network/availability-distribution/src/tests/mock.rs b/node/network/availability-distribution/src/tests/mock.rs index 9d9896d8fd59..91eff4b4075d 100644 --- a/node/network/availability-distribution/src/tests/mock.rs +++ b/node/network/availability-distribution/src/tests/mock.rs @@ -19,14 +19,29 @@ use std::sync::Arc; +use sc_keystore::LocalKeystore; use sp_keyring::Sr25519Keyring; +use sp_application_crypto::AppKey; use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; -use polkadot_primitives::v1::{AvailableData, BlockData, CandidateCommitments, CandidateDescriptor, - CandidateHash, CommittedCandidateReceipt, ErasureChunk, GroupIndex, Hash, HeadData, Id - as ParaId, OccupiedCore, PersistedValidationData, PoV, SessionInfo, - ValidatorIndex +use polkadot_primitives::v1::{ + AvailableData, BlockData, CandidateCommitments, CandidateDescriptor, CandidateHash, + CommittedCandidateReceipt, ErasureChunk, GroupIndex, Hash, HeadData, Id as ParaId, + OccupiedCore, PersistedValidationData, PoV, SessionInfo, ValidatorId, ValidatorIndex }; +use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr}; + +/// Get mock keystore with `Ferdie` key. +pub fn make_ferdie_keystore() -> SyncCryptoStorePtr { + let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory()); + SyncCryptoStore::sr25519_generate_new( + &*keystore, + ValidatorId::ID, + Some(&Sr25519Keyring::Ferdie.to_seed()), + ) + .expect("Insert key into keystore"); + keystore +} /// Create dummy session info with two validator groups. 
pub fn make_session_info() -> SessionInfo { diff --git a/node/network/availability-distribution/src/tests/state.rs b/node/network/availability-distribution/src/tests/state.rs index 940412d4235f..0b1f1da2de7b 100644 --- a/node/network/availability-distribution/src/tests/state.rs +++ b/node/network/availability-distribution/src/tests/state.rs @@ -23,10 +23,7 @@ use smallvec::smallvec; use futures::{FutureExt, channel::oneshot, SinkExt, channel::mpsc, StreamExt}; use futures_timer::Delay; -use sc_keystore::LocalKeystore; -use sp_application_crypto::AppKey; -use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr}; -use sp_keyring::Sr25519Keyring; +use sp_keystore::SyncCryptoStorePtr; use sp_core::{traits::SpawnNamed, testing::TaskExecutor}; use sc_network as network; use sc_network::IfDisconnected; @@ -37,7 +34,7 @@ use polkadot_subsystem::{ActiveLeavesUpdate, FromOverseer, OverseerSignal, messa RuntimeApiRequest} }; use polkadot_primitives::v1::{CandidateHash, CoreState, ErasureChunk, GroupIndex, Hash, Id - as ParaId, ScheduledCore, SessionInfo, ValidatorId, + as ParaId, ScheduledCore, SessionInfo, ValidatorIndex }; use polkadot_node_network_protocol::{jaeger, @@ -46,7 +43,7 @@ use polkadot_node_network_protocol::{jaeger, use polkadot_subsystem_testhelpers as test_helpers; use test_helpers::SingleItemSink; -use super::mock::{make_session_info, OccupiedCoreBuilder, }; +use super::mock::{make_session_info, OccupiedCoreBuilder, make_ferdie_keystore}; use crate::LOG_TARGET; pub struct TestHarness { @@ -81,17 +78,10 @@ impl Default for TestState { let chain_ids = vec![chain_a, chain_b]; - let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory()); + let keystore = make_ferdie_keystore(); let session_info = make_session_info(); - SyncCryptoStore::sr25519_generate_new( - &*keystore, - ValidatorId::ID, - Some(&Sr25519Keyring::Ferdie.to_seed()), - ) - .expect("Insert key into keystore"); - let (cores, chunks) = { let mut cores = HashMap::new(); let mut chunks = HashMap::new(); @@ -261,8 +251,6 @@ impl TestState { } } - - async fn overseer_signal( mut tx: SingleItemSink>, msg: impl Into, From af9f12cbc1e0368e28c57e548892cc4f4eac5c9c Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 21:27:47 +0100 Subject: [PATCH 25/30] Handle missing group membership better. --- .../src/pov_requester/mod.rs | 15 +++++----- .../src/requester/fetch_task/mod.rs | 2 +- .../availability-distribution/src/runtime.rs | 30 ++++++++----------- .../src/session_cache.rs | 24 +++++++-------- 4 files changed, 31 insertions(+), 40 deletions(-) diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 7919731d03e8..48ccd99f8aa5 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -32,10 +32,7 @@ use polkadot_subsystem::{ ActiveLeavesUpdate, SubsystemContext, messages::{AllMessages, NetworkBridgeMessage, IfDisconnected} }; -use crate::{ - runtime::Runtime, - error::{Error, log_error}, -}; +use crate::{error::{Error, log_error}, runtime::{Runtime, ValidatorInfo}}; /// Number of sessions we want to keep in the LRU. 
const NUM_SESSIONS: usize = 2; @@ -216,13 +213,17 @@ where Context: SubsystemContext, { let info = runtime.get_session_info_by_index(ctx, parent, session).await?; - if let Some(validator_info) = &info.validator_info { - let indeces = info.session_info.validator_groups.get(validator_info.our_group.0 as usize) + if let ValidatorInfo { + our_index: Some(our_index), + our_group: Some(our_group) + } = &info.validator_info { + + let indeces = info.session_info.validator_groups.get(our_group.0 as usize) .expect("Our group got retrieved from that session info, it must exist. qed.") .clone(); Ok(Some( indeces.into_iter() - .filter(|i| *i != validator_info.our_index) + .filter(|i| *i != *our_index) .map(|i| info.session_info.discovery_keys[i.0 as usize].clone()) .collect() )) diff --git a/node/network/availability-distribution/src/requester/fetch_task/mod.rs b/node/network/availability-distribution/src/requester/fetch_task/mod.rs index c4e539ab23da..7b9c39e52b28 100644 --- a/node/network/availability-distribution/src/requester/fetch_task/mod.rs +++ b/node/network/availability-distribution/src/requester/fetch_task/mod.rs @@ -138,7 +138,7 @@ impl FetchTaskConfig { let live_in = vec![leaf].into_iter().collect(); // Don't run tasks for our backing group: - if session_info.our_group == core.group_responsible { + if session_info.our_group == Some(core.group_responsible) { return FetchTaskConfig { live_in, prepared_running: None, diff --git a/node/network/availability-distribution/src/runtime.rs b/node/network/availability-distribution/src/runtime.rs index ebb1402deabd..0bc81d3e70f2 100644 --- a/node/network/availability-distribution/src/runtime.rs +++ b/node/network/availability-distribution/src/runtime.rs @@ -55,17 +55,17 @@ pub struct ExtendedSessionInfo { /// Actual session info as fetched from the runtime. pub session_info: SessionInfo, /// Contains useful information about ourselves, in case this node is a validator. - pub validator_info: Option, + pub validator_info: ValidatorInfo, } /// Information about ourself, in case we are an `Authority`. /// /// This data is derived from the `SessionInfo` and our key as found in the keystore. pub struct ValidatorInfo { - /// The index this very validator has in `SessionInfo` vectors. - pub our_index: ValidatorIndex, - /// The group we belong to. - pub our_group: GroupIndex, + /// The index this very validator has in `SessionInfo` vectors, if any. + pub our_index: Option, + /// The group we belong to, if any. + pub our_group: Option, } impl Runtime { @@ -155,7 +155,7 @@ impl Runtime { async fn get_validator_info( &self, session_info: &SessionInfo, - ) -> Result, Error> + ) -> Result { if let Some(our_index) = self.get_our_index(&session_info.validators).await { // Get our group index: @@ -172,19 +172,13 @@ impl Runtime { }) } ); - debug_assert!( - our_group.is_some() || session_info.validator_groups.is_empty(), - "Groups are initialized but validator could not be found in any" - ); - if let Some(our_group) = our_group { - let info = ValidatorInfo { - our_index, - our_group, - }; - return Ok(Some(info)) - } + let info = ValidatorInfo { + our_index: Some(our_index), + our_group, + }; + return Ok(info) } - return Ok(None) + return Ok(ValidatorInfo { our_index: None, our_group: None }) } /// Get our `ValidatorIndex`. 
diff --git a/node/network/availability-distribution/src/session_cache.rs b/node/network/availability-distribution/src/session_cache.rs index 9df1b90ae4d5..e4e2bce41fc5 100644 --- a/node/network/availability-distribution/src/session_cache.rs +++ b/node/network/availability-distribution/src/session_cache.rs @@ -81,7 +81,9 @@ pub struct SessionInfo { /// Remember to which group we belong, so we won't start fetching chunks for candidates with /// our group being responsible. (We should have that chunk already.) - pub our_group: GroupIndex, + /// + /// `None`, if we are not in fact part of any group. + pub our_group: Option, } /// Report of bad validators. @@ -246,10 +248,6 @@ impl SessionCache { }) } ); - debug_assert!( - our_group.is_some() || validator_groups.is_empty(), - "Groups are initialized but validator could not be found in any" - ); // Shuffle validators in groups: let mut rng = thread_rng(); @@ -271,15 +269,13 @@ impl SessionCache { }) .collect(); - if let Some(our_group) = our_group { - let info = SessionInfo { - validator_groups, - our_index, - session_index, - our_group, - }; - return Ok(Some(info)) - } + let info = SessionInfo { + validator_groups, + our_index, + session_index, + our_group, + }; + return Ok(Some(info)) } return Ok(None) } From 0c30792b161d7e1f8136d5441206da36973c90fb Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 23:18:45 +0100 Subject: [PATCH 26/30] Add test for retry functionality. --- Cargo.lock | 1 + node/core/backing/Cargo.toml | 1 + node/core/backing/src/lib.rs | 205 ++++++++++++++++++++++++----------- 3 files changed, 141 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4afca4128932..0264cd59e939 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5628,6 +5628,7 @@ dependencies = [ "sp-core", "sp-keyring", "sp-keystore", + "sp-tracing", "thiserror", "tracing", ] diff --git a/node/core/backing/Cargo.toml b/node/core/backing/Cargo.toml index a6b04c03ae92..d912143b9bcd 100644 --- a/node/core/backing/Cargo.toml +++ b/node/core/backing/Cargo.toml @@ -22,6 +22,7 @@ sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } sp-application-crypto = { git = "https://github.com/paritytech/substrate", branch = "master" } sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" } sc-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-tracing = { git = "https://github.com/paritytech/substrate", branch = "master" } futures = { version = "0.3.12", features = ["thread-pool"] } assert_matches = "1.4.0" polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index b509881d4737..dbd2b72ab8bf 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -651,12 +651,11 @@ impl CandidateBackingJob { >, ) -> Result<(), Error> { let candidate_hash = params.candidate.hash(); - if self.awaiting_validation.insert(candidate_hash) { // spawn background task. let bg = async move { if let Err(e) = validate_and_make_available(params).await { - tracing::error!("Failed to validate and make available: {:?}", e); + tracing::error!(target: LOG_TARGET, "Failed to validate and make available: {:?}", e); } }; self.tx_from.send(FromJobCommand::Spawn("Backing Validation", bg.boxed())).await?; @@ -2590,80 +2589,154 @@ mod tests { }); } + // Test whether we retry on failed PoV fetching. 
#[test] - fn candidate_backing_reorders_votes() { - use sp_core::Encode; - use std::convert::TryFrom; - - let relay_parent = [1; 32].into(); - let para_id = ParaId::from(10); - let session_index = 5; - let signing_context = SigningContext { parent_hash: relay_parent, session_index }; - let validators = vec![ - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - Sr25519Keyring::One, - ]; - - let validator_public = validator_pubkeys(&validators); - let validator_groups = { - let mut validator_groups = HashMap::new(); - validator_groups.insert(para_id, vec![0, 1, 2, 3, 4, 5].into_iter().map(ValidatorIndex).collect()); - validator_groups - }; + fn retry_works() { + sp_tracing::try_init_simple(); + let test_state = TestState::default(); + test_harness(test_state.keystore.clone(), |test_harness| async move { + let TestHarness { mut virtual_overseer } = test_harness; - let table_context = TableContext { - signing_context, - validator: None, - groups: validator_groups, - validators: validator_public.clone(), - }; + test_startup(&mut virtual_overseer, &test_state).await; - let fake_attestation = |idx: u32| { - let candidate: CommittedCandidateReceipt = Default::default(); - let hash = candidate.hash(); - let mut data = vec![0; 64]; - data[0..32].copy_from_slice(hash.0.as_bytes()); - data[32..36].copy_from_slice(idx.encode().as_slice()); + let pov = PoV { + block_data: BlockData(vec![42, 43, 44]), + }; - let sig = ValidatorSignature::try_from(data).unwrap(); - statement_table::generic::ValidityAttestation::Implicit(sig) - }; + let pov_hash = pov.hash(); - let attested = TableAttestedCandidate { - candidate: Default::default(), - validity_votes: vec![ - (ValidatorIndex(5), fake_attestation(5)), - (ValidatorIndex(3), fake_attestation(3)), - (ValidatorIndex(1), fake_attestation(1)), - ], - group_id: para_id, - }; + let candidate = TestCandidateBuilder { + para_id: test_state.chain_ids[0], + relay_parent: test_state.relay_parent, + pov_hash, + erasure_root: make_erasure_root(&test_state, pov.clone()), + ..Default::default() + }.build(); - let backed = table_attested_to_backed(attested, &table_context).unwrap(); + let public2 = CryptoStore::sr25519_generate_new( + &*test_state.keystore, + ValidatorId::ID, Some(&test_state.validators[2].to_seed()) + ).await.expect("Insert key into keystore"); + let public3 = CryptoStore::sr25519_generate_new( + &*test_state.keystore, + ValidatorId::ID, + Some(&test_state.validators[3].to_seed()), + ).await.expect("Insert key into keystore"); + let public5 = CryptoStore::sr25519_generate_new( + &*test_state.keystore, + ValidatorId::ID, + Some(&test_state.validators[5].to_seed()), + ).await.expect("Insert key into keystore"); + let signed_a = SignedFullStatement::sign( + &test_state.keystore, + Statement::Seconded(candidate.clone()), + &test_state.signing_context, + ValidatorIndex(2), + &public2.into(), + ).await.ok().flatten().expect("should be signed"); + let signed_b = SignedFullStatement::sign( + &test_state.keystore, + Statement::Valid(candidate.hash()), + &test_state.signing_context, + ValidatorIndex(3), + &public3.into(), + ).await.ok().flatten().expect("should be signed"); + let signed_c = SignedFullStatement::sign( + &test_state.keystore, + Statement::Valid(candidate.hash()), + &test_state.signing_context, + ValidatorIndex(5), + &public5.into(), + ).await.ok().flatten().expect("should be signed"); - let expected_bitvec = { - let mut validator_indices = BitVec::::with_capacity(6); - 
validator_indices.resize(6, false); + // Send in a `Statement` with a candidate. + let statement = CandidateBackingMessage::Statement( + test_state.relay_parent, + signed_a.clone(), + ); + virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; - validator_indices.set(1, true); - validator_indices.set(3, true); - validator_indices.set(5, true); + // Subsystem requests PoV and requests validation. + // We cancel - should mean retry on next backing statement. + assert_matches!( + virtual_overseer.recv().await, + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + std::mem::drop(tx); + } + ); - validator_indices - }; + let statement = CandidateBackingMessage::Statement( + test_state.relay_parent, + signed_b.clone(), + ); + virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; - // Should be in bitfield order, which is opposite to the order provided to the function. - let expected_attestations = vec![ - fake_attestation(1).into(), - fake_attestation(3).into(), - fake_attestation(5).into(), - ]; + let statement = CandidateBackingMessage::Statement( + test_state.relay_parent, + signed_c.clone(), + ); + virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; + + // There are already enough votes for the candidate to be backed: + assert_matches!( + virtual_overseer.recv().await, + AllMessages::Provisioner( + ProvisionerMessage::ProvisionableData( + _, + ProvisionableData::BackedCandidate(CandidateReceipt { + descriptor, + .. + }) + ) + ) if descriptor == candidate.descriptor + ); + + // Subsystem requests PoV and requests validation. + // We cancel once more. + assert_matches!( + virtual_overseer.recv().await, + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + std::mem::drop(tx); + } + ); + + // Subsystem requests PoV and requests validation. + // Now we pass. + assert_matches!( + virtual_overseer.recv().await, + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + tx.send(pov.clone()).unwrap(); + } + ); - assert_eq!(backed.validator_indices, expected_bitvec); - assert_eq!(backed.validity_votes, expected_attestations); + assert_matches!( + virtual_overseer.recv().await, + AllMessages::CandidateValidation( + CandidateValidationMessage::ValidateFromChainState( + c, + pov, + _tx, + ) + ) if pov == pov && &c == candidate.descriptor() + ); + }); } } From eb47465aaded3e73518a5d28d75a20b0fe6404f4 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sat, 27 Mar 2021 23:42:12 +0100 Subject: [PATCH 27/30] Fix flaky test. 
--- node/core/backing/src/lib.rs | 130 ++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 25 deletions(-) diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index dbd2b72ab8bf..7c3d90162b6f 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -2589,6 +2589,83 @@ mod tests { }); } + #[test] + fn candidate_backing_reorders_votes() { + use sp_core::Encode; + use std::convert::TryFrom; + + let relay_parent = [1; 32].into(); + let para_id = ParaId::from(10); + let session_index = 5; + let signing_context = SigningContext { parent_hash: relay_parent, session_index }; + let validators = vec![ + Sr25519Keyring::Alice, + Sr25519Keyring::Bob, + Sr25519Keyring::Charlie, + Sr25519Keyring::Dave, + Sr25519Keyring::Ferdie, + Sr25519Keyring::One, + ]; + + let validator_public = validator_pubkeys(&validators); + let validator_groups = { + let mut validator_groups = HashMap::new(); + validator_groups.insert(para_id, vec![0, 1, 2, 3, 4, 5].into_iter().map(ValidatorIndex).collect()); + validator_groups + }; + + let table_context = TableContext { + signing_context, + validator: None, + groups: validator_groups, + validators: validator_public.clone(), + }; + + let fake_attestation = |idx: u32| { + let candidate: CommittedCandidateReceipt = Default::default(); + let hash = candidate.hash(); + let mut data = vec![0; 64]; + data[0..32].copy_from_slice(hash.0.as_bytes()); + data[32..36].copy_from_slice(idx.encode().as_slice()); + + let sig = ValidatorSignature::try_from(data).unwrap(); + statement_table::generic::ValidityAttestation::Implicit(sig) + }; + + let attested = TableAttestedCandidate { + candidate: Default::default(), + validity_votes: vec![ + (ValidatorIndex(5), fake_attestation(5)), + (ValidatorIndex(3), fake_attestation(3)), + (ValidatorIndex(1), fake_attestation(1)), + ], + group_id: para_id, + }; + + let backed = table_attested_to_backed(attested, &table_context).unwrap(); + + let expected_bitvec = { + let mut validator_indices = BitVec::::with_capacity(6); + validator_indices.resize(6, false); + + validator_indices.set(1, true); + validator_indices.set(3, true); + validator_indices.set(5, true); + + validator_indices + }; + + // Should be in bitfield order, which is opposite to the order provided to the function. + let expected_attestations = vec![ + fake_attestation(1).into(), + fake_attestation(3).into(), + fake_attestation(5).into(), + ]; + + assert_eq!(backed.validator_indices, expected_bitvec); + assert_eq!(backed.validity_votes, expected_attestations); + } + // Test whether we retry on failed PoV fetching. #[test] fn retry_works() { @@ -2683,34 +2760,37 @@ mod tests { ); virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; - // There are already enough votes for the candidate to be backed: - assert_matches!( - virtual_overseer.recv().await, - AllMessages::Provisioner( - ProvisionerMessage::ProvisionableData( - _, - ProvisionableData::BackedCandidate(CandidateReceipt { - descriptor, + + // Not deterministic which message comes first: + for _ in 1..2 { + match virtual_overseer.recv().await { + AllMessages::Provisioner( + ProvisionerMessage::ProvisionableData( + _, + ProvisionableData::BackedCandidate(CandidateReceipt { + descriptor, + .. + }) + ) + ) => { + assert_eq!(descriptor, candidate.descriptor); + } + // Subsystem requests PoV and requests validation. + // We cancel once more: + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, .. 
- }) - ) - ) if descriptor == candidate.descriptor - ); - - // Subsystem requests PoV and requests validation. - // We cancel once more. - assert_matches!( - virtual_overseer.recv().await, - AllMessages::AvailabilityDistribution( - AvailabilityDistributionMessage::FetchPoV { - relay_parent, - tx, - .. + } + ) if relay_parent == test_state.relay_parent => { + std::mem::drop(tx); + } + msg => { + assert!(false, "Unexpected message: {:?}", msg); } - ) if relay_parent == test_state.relay_parent => { - std::mem::drop(tx); } - ); + } // Subsystem requests PoV and requests validation. // Now we pass. From 3fa57914a8aefcd9364bbffbc211bf6117e5f849 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sun, 28 Mar 2021 03:10:19 +0200 Subject: [PATCH 28/30] Spaces again. --- .../availability-distribution/src/pov_requester/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs index 16492430982e..2a3ed95e3b51 100644 --- a/node/network/availability-distribution/src/pov_requester/mod.rs +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -215,9 +215,9 @@ where { let info = runtime.get_session_info_by_index(ctx, parent, session).await?; if let ValidatorInfo { - our_index: Some(our_index), - our_group: Some(our_group) - } = &info.validator_info { + our_index: Some(our_index), + our_group: Some(our_group) + } = &info.validator_info { let indeces = info.session_info.validator_groups.get(our_group.0 as usize) .expect("Our group got retrieved from that session info, it must exist. qed.") @@ -236,7 +236,7 @@ where #[cfg(test)] mod tests { use assert_matches::assert_matches; - use futures::{executor, future}; + use futures::{executor, future}; use parity_scale_codec::Encode; use sp_core::testing::TaskExecutor; From 82d4a11ef7e8f5774e466e6685afcfaa85dcf03d Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sun, 28 Mar 2021 15:35:19 +0200 Subject: [PATCH 29/30] Guide updates. --- .../availability/availability-distribution.md | 72 +++++++++--- .../src/node/backing/candidate-backing.md | 16 +-- .../src/node/backing/pov-distribution.md | 110 ------------------ .../src/types/overseer-protocol.md | 28 +++-- 4 files changed, 82 insertions(+), 144 deletions(-) delete mode 100644 roadmap/implementers-guide/src/node/backing/pov-distribution.md diff --git a/roadmap/implementers-guide/src/node/availability/availability-distribution.md b/roadmap/implementers-guide/src/node/availability/availability-distribution.md index db7fdebe8293..da12cf4b33cb 100644 --- a/roadmap/implementers-guide/src/node/availability/availability-distribution.md +++ b/roadmap/implementers-guide/src/node/availability/availability-distribution.md @@ -1,49 +1,79 @@ # Availability Distribution -Distribute availability erasure-coded chunks to validators. - -After a candidate is backed, the availability of the PoV block must be confirmed -by 2/3+ of all validators. Backing nodes will serve chunks for a PoV block from -their [Availability Store](../utility/availability-store.md), all other -validators request their chunks from backing nodes and store those received chunks in -their local availability store. +This subsystem is responsible for distribution availability data to peers. +Availability data are chunks, `PoV`s and `AvailableData` (which is `PoV` + +`PersistedValidationData`). It does so via request response protocols. 
+
+In particular, this subsystem is responsible for:
+
+- Responding to network requests for availability data by querying the
+  [Availability Store](../utility/availability-store.md).
+- Requesting chunks from backing validators to put them in the local `Availability
+  Store` whenever we find an occupied core on the chain;
+  this is to ensure availability by at least 2/3+ of all validators and
+  happens after a candidate is backed.
+- Fetching `PoV`s from validators, when requested via the `FetchPoV` message from
+  backing (the `pov_requester` module).
+
+The backing subsystem is responsible for making available data available in the
+local `Availability Store` upon validation. This subsystem will serve any
+network requests by querying that store.
 
 ## Protocol
 
-This subsystem has no associated peer set right now, but instead relies on
-a request/response protocol, defined by `Protocol::ChunkFetching`.
+This subsystem does not handle any peer set messages, but the `pov_requester`
+does connect to validators of the same backing group on the validation peer
+set, to ensure fast propagation of statements between those validators and to
+keep connections established for requesting `PoV`s. Other than that,
+this subsystem drives request/response protocols.
 
 Input:
 
 - OverseerSignal::ActiveLeaves(`[ActiveLeavesUpdate]`)
 - AvailabilityDistributionMessage{msg: ChunkFetchingRequest}
+- AvailabilityDistributionMessage{msg: PoVFetchingRequest}
+- AvailabilityDistributionMessage{msg: FetchPoV}
 
 Output:
 
 - NetworkBridgeMessage::SendRequests(`[Requests]`, IfDisconnected::TryConnect)
 - AvailabilityStore::QueryChunk(candidate_hash, index, response_channel)
 - AvailabilityStore::StoreChunk(candidate_hash, chunk)
+- AvailabilityStore::QueryAvailableData(candidate_hash, response_channel)
 - RuntimeApiRequest::SessionIndexForChild
 - RuntimeApiRequest::SessionInfo
 - RuntimeApiRequest::AvailabilityCores
 
 ## Functionality
 
-### Requesting
+### PoV Requester
 
-This subsystems monitors currently occupied cores for all active leaves. For
-each occupied core it will spawn a task fetching the erasure chunk which has the
-`ValidatorIndex` of the node. For this an `ChunkFetchingRequest` is
-issued, via substrate's generic request/response protocol.
+The PoV requester in the `pov_requester` module takes care of staying connected
+to validators of our current backing group on the `Validation`
+peer set, and it handles `FetchPoV` requests by issuing network requests to
+those validators. It will check the hash of the received `PoV`, but will not do any
+further validation. That needs to be done by the original `FetchPoV` sender
+(the backing subsystem).
+
+### Chunk Requester
+
+After a candidate is backed, the availability of the PoV block must be confirmed
+by 2/3+ of all validators. The chunk requester is responsible for making that
+availability a reality.
+
+It does that by checking occupied cores for all active leaves. For each
+occupied core it will spawn a task fetching the erasure chunk which has the
+`ValidatorIndex` of the node. For this, a `ChunkFetchingRequest` is issued via
+substrate's generic request/response protocol.
 
 The spawned task will start trying to fetch the chunk from validators in
 responsible group of the occupied core, in a random order. For ensuring that we
-use already open TCP connections wherever possible, the subsystem maintains a
+use already open TCP connections wherever possible, the requester maintains a
 cache and preserves that random order for the entire session.
 
 Note however that, because not all validators in a group have to be actual
 backers, not all of them are required to have the needed chunk. This in turn
-could lead to low throughput, as we have to wait for a fetches to fail,
+could lead to low throughput, as we have to wait for fetches to fail,
 before reaching a validator finally having our chunk. We do rank back validators
 not delivering our chunk, but as backers could vary from block to block on a
 perfectly legitimate basis, this is still not ideal. See issues [2509](https://github.com/paritytech/polkadot/issues/2509) and [2512](https://github.com/paritytech/polkadot/issues/2512)
@@ -59,6 +89,10 @@ as we would like as many validators as possible to have their chunk. See this
 
 ### Serving
 
-On the other side the subsystem will listen for incoming
-`ChunkFetchingRequest`s from the network bridge and will respond to
-queries, by looking the requested chunk up in the availability store.
+On the other side the subsystem will listen for incoming `ChunkFetchingRequest`s
+and `PoVFetchingRequest`s from the network bridge and will respond to queries
+by looking the requested chunks and `PoV`s up in the availability store; this
+happens in the `responder` module.
+
+We rely on the backing subsystem to make available data available locally in the
+`Availability Store` after it has validated it.
diff --git a/roadmap/implementers-guide/src/node/backing/candidate-backing.md b/roadmap/implementers-guide/src/node/backing/candidate-backing.md
index 016c50967494..c44f5e1f09b1 100644
--- a/roadmap/implementers-guide/src/node/backing/candidate-backing.md
+++ b/roadmap/implementers-guide/src/node/backing/candidate-backing.md
@@ -18,7 +18,7 @@ Output:
 - [`RuntimeApiMessage`][RAM]
 - [`CandidateSelectionMessage`][CSM]
 - [`ProvisionerMessage`][PM]
-- [`PoVDistributionMessage`][PDM]
+- [`AvailabilityDistributionMessage`][ADM]
 - [`StatementDistributionMessage`][SDM]
 
 ## Functionality
@@ -39,8 +39,11 @@ The subsystem should maintain a set of handles to Candidate Backing Jobs that ar
 
 ### On Receiving `CandidateBackingMessage`
 
 * If the message is a [`CandidateBackingMessage`][CBM]`::GetBackedCandidates`, get all backable candidates from the statement table and send them back.
-* If the message is a [`CandidateBackingMessage`][CBM]`::Second`, sign and dispatch a `Seconded` statement only if we have not seconded any other candidate and have not signed a `Valid` statement for the requested candidate. Signing both a `Seconded` and `Valid` message is a double-voting misbehavior with a heavy penalty, and this could occur if another validator has seconded the same candidate and we've received their message before the internal seconding request. After successfully dispatching the `Seconded` statement we have to distribute the PoV.
-* If the message is a [`CandidateBackingMessage`][CBM]`::Statement`, count the statement to the quorum. If the statement in the message is `Seconded` and it contains a candidate that belongs to our assignment, request the corresponding `PoV` from the `PoVDistribution` and launch validation. Issue our own `Valid` or `Invalid` statement as a result.
+* If the message is a [`CandidateBackingMessage`][CBM]`::Second`, sign and dispatch a `Seconded` statement only if we have not seconded any other candidate and have not signed a `Valid` statement for the requested candidate. Signing both a `Seconded` and `Valid` message is a double-voting misbehavior with a heavy penalty, and this could occur if another validator has seconded the same candidate and we've received their message before the internal seconding request.
+* If the message is a [`CandidateBackingMessage`][CBM]`::Statement`, count the statement towards the quorum. If the statement in the message is `Seconded` and it contains a candidate that belongs to our assignment, request the corresponding `PoV` from the backing node via `AvailabilityDistribution` and launch validation. Issue our own `Valid` or `Invalid` statement as a result.
+
+If the seconding node did not provide us with the `PoV`, we will retry fetching it from other backing validators.
+
 
 > big TODO: "contextual execution"
 >
@@ -112,11 +115,8 @@ fn spawn_validation_work(candidate, parachain head, validation function) {
 
 ### Fetch Pov Block
 
 Create a `(sender, receiver)` pair.
-Dispatch a [`PoVDistributionMessage`][PDM]`::FetchPoV(relay_parent, candidate_hash, sender)` and listen on the receiver for a response.
-
-### Distribute Pov Block
+Dispatch an [`AvailabilityDistributionMessage`][ADM]`::FetchPoV{ validator_index, pov_hash, candidate_hash, tx }` and listen on the passed receiver for a response. Availability distribution will send the request to the validator specified by `validator_index`, which might not be serving it for whatever reasons, therefore we need to retry with other backing validators in that case.
 
-Dispatch a [`PoVDistributionMessage`][PDM]`::DistributePoV(relay_parent, candidate_descriptor, pov)`.
 
 ### Validate PoV Block
 
@@ -135,7 +135,7 @@ Dispatch a [`StatementDistributionMessage`][PDM]`::Share(relay_parent, SignedFul
 [CVM]: ../../types/overseer-protocol.md#validation-request-type
 [PM]: ../../types/overseer-protocol.md#provisioner-message
 [CBM]: ../../types/overseer-protocol.md#candidate-backing-message
-[PDM]: ../../types/overseer-protocol.md#pov-distribution-message
+[ADM]: ../../types/overseer-protocol.md#availability-distribution-message
 [SDM]: ../../types/overseer-protocol.md#statement-distribution-message
 
 [CS]: candidate-selection.md
diff --git a/roadmap/implementers-guide/src/node/backing/pov-distribution.md b/roadmap/implementers-guide/src/node/backing/pov-distribution.md
deleted file mode 100644
index 3cf6dd995aa5..000000000000
--- a/roadmap/implementers-guide/src/node/backing/pov-distribution.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# PoV Distribution
-
-This subsystem is responsible for distributing PoV blocks. For now, unified with [Statement Distribution subsystem](statement-distribution.md).
-
-## Protocol
-
-`PeerSet`: `Validation`
-
-Input: [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message)
-
-
-Output:
-
-- NetworkBridge::SendMessage(`[PeerId]`, message)
-- NetworkBridge::ReportPeer(PeerId, cost_or_benefit)
-
-
-## Functionality
-
-This network protocol is responsible for distributing [`PoV`s](../../types/availability.md#proof-of-validity) by gossip. Since PoVs are heavy in practice, gossip is far from the most efficient way to distribute them. In the future, this should be replaced by a better network protocol that finds validators who have validated the block and connects to them directly. This protocol is described.
- -This protocol is described in terms of "us" and our peers, with the understanding that this is the procedure that any honest node will run. It has the following goals: - - We never have to buffer an unbounded amount of data - - PoVs will flow transitively across a network of honest nodes, stemming from the validators that originally seconded candidates requiring those PoVs. - -As we are gossiping, we need to track which PoVs our peers are waiting for to avoid sending them data that they are not expecting. It is not reasonable to expect our peers to buffer unexpected PoVs, just as we will not buffer unexpected PoVs. So notifying our peers about what is being awaited is key. However it is important that the notifications system is also bounded. - -For this, in order to avoid reaching into the internals of the [Statement Distribution](statement-distribution.md) Subsystem, we can rely on an expected property of candidate backing: that each validator can second up to 2 candidates per chain head. This will typically be only one, because they are only supposed to issue one, but they can equivocate if they are willing to be slashed. So we can set a cap on the number of PoVs each peer is allowed to notify us that they are waiting for at a given relay-parent. This cap will be twice the number of validators at that relay-parent. In practice, this is a very lax upper bound that can be reduced much further if desired. - -The view update mechanism of the [Network Bridge](../utility/network-bridge.md) ensures that peers are only allowed to consider a certain set of relay-parents as live. So this bounding mechanism caps the amount of data we need to store per peer at any time at `sum({ 2 * n_validators_at_head(head) * sizeof(hash) for head in view_heads })`. Additionally, peers should only be allowed to notify us of PoV hashes they are waiting for in the context of relay-parents in our own local view, which means that `n_validators_at_head` is implied to be `0` for relay-parents not in our own local view. - -View updates from peers and our own view updates are received from the network bridge. These will lag somewhat behind the `ActiveLeavesUpdate` messages received from the overseer, which will influence the actual data we store. The `OurViewUpdate`s from the [`NetworkBridgeEvent`](../../types/overseer-protocol.md#network-bridge-update) must be considered canonical in terms of our peers' perception of us. - -Lastly, the system needs to be bootstrapped with our own perception of which PoVs we are cognizant of but awaiting data for. This is done by receipt of the [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message)::FetchPoV variant. Proper operation of this subsystem depends on the descriptors passed faithfully representing candidates which have been seconded by other validators. - -## Formal Description - -This protocol can be implemented as a state machine with the following state: - -```rust -struct State { - relay_parent_state: Map, - peer_state: Map, - our_view: View, -} - -struct BlockBasedState { - known: Map, // should be a shared PoV in practice. these things are heavy. 
- fetching: Map]>, - n_validators: usize, -} - -struct PeerState { - awaited: Map>, -} -``` - -We also use the [`PoVDistributionV1Message`](../../types/network.md#pov-distribution) as our `NetworkMessage`, which are sent and received by the [Network Bridge](../utility/network-bridge.md) - -Here is the logic of the state machine: - -*Overseer Signals* -- On `ActiveLeavesUpdate(relay_parent)`: - - For each relay-parent in the `activated` list: - - Get the number of validators at that relay parent by querying the [Runtime API](../utility/runtime-api.md) for the validators and then counting them. - - Create a blank entry in `relay_parent_state` under `relay_parent` with correct `n_validators` set. - - For each relay-parent in the `deactivated` list: - - Remove the entry for `relay_parent` from `relay_parent_state`. -- On `Conclude`: conclude. - -*PoV Distribution Messages* -- On `FetchPoV(relay_parent, descriptor, response_channel)` - - If there is no entry in `relay_parent_state` under `relay_parent`, ignore. - - If there is a PoV under `descriptor.pov_hash` in the `known` map, send that PoV on the channel and return. - - Otherwise, place the `response_channel` in the `fetching` map under `descriptor.pov_hash`. - - If the `pov_hash` had no previous entry in `fetching` and there are `2 * n_validators` or fewer entries in the `fetching` set, send `NetworkMessage::Awaiting(relay_parent, vec![pov_hash])` to all peers. -- On `DistributePoV(relay_parent, descriptor, PoV)` - - If there is no entry in `relay_parent_state` under `relay_parent`, ignore. - - Complete and remove any channels under `descriptor.pov_hash` in the `fetching` map. - - Send `NetworkMessage::SendPoV(relay_parent, descriptor.pov_hash, PoV)` to all peers who have the `descriptor.pov_hash` in the set under `relay_parent` in the `peer.awaited` map and remove the entry from `peer.awaited`. - - Note the PoV under `descriptor.pov_hash` in `known`. - -*Network Bridge Updates* -- On `PeerConnected(peer_id, observed_role)` - - Make a fresh entry in the `peer_state` map for the `peer_id`. -- On `PeerDisconnected(peer_id)` - - Remove the entry for `peer_id` from the `peer_state` map. -- On `PeerMessage(peer_id, bytes)` - - If the bytes do not decode to a `NetworkMessage` or the `peer_id` has no entry in the `peer_state` map, report and ignore. - - If this is `NetworkMessage::Awaiting(relay_parent, pov_hashes)`: - - If there is no entry under `peer_state.awaited` for the `relay_parent`, report and ignore. - - If `relay_parent` is not contained within `our_view`, report and ignore. - - Otherwise, if the peer's `awaited` map combined with the `pov_hashes` would have more than ` 2 * relay_parent_state[relay_parent].n_validators` entries, report and ignore. Note that we are leaning on the property of the network bridge that it sets our view based on `activated` heads in `ActiveLeavesUpdate` signals. - - For each new `pov_hash` in `pov_hashes`, if there is a `pov` under `pov_hash` in the `known` map, send the peer a `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`. - - Otherwise, add the `pov_hash` to the `awaited` map - - If this is `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`: - - If there is no entry under `relay_parent` in `relay_parent_state` or no entry under `pov_hash` in our `fetching` map for that `relay_parent`, report and ignore. - - If the blake2-256 hash of the pov doesn't equal `pov_hash`, report and ignore. - - Complete and remove any listeners in the `fetching` map under `pov_hash`. 
However, leave an empty set of listeners in the `fetching` map to denote that this was something we once awaited. This will allow us to recognize peers who have sent us something we were expecting, but just a little late. - - Add to `known` map. - - Remove the `pov_hash` from the `peer.awaited` map, if any. - - Send `NetworkMessage::SendPoV(relay_parent, descriptor.pov_hash, PoV)` to all peers who have the `descriptor.pov_hash` in the set under `relay_parent` in the `peer.awaited` map and remove the entry from `peer.awaited`. -- On `PeerViewChange(peer_id, view)` - - If Peer is unknown, ignore. - - Ensure there is an entry under `relay_parent` for each `relay_parent` in `view` within the `peer.awaited` map, creating blank `awaited` lists as necessary. - - Remove all entries under `peer.awaited` that are not within `view`. - - For all hashes in `view` but were not within the old, send the peer all the keys in our `fetching` map under the block-based state for that hash - i.e. notify the peer of everything we are awaiting at that hash. -- On `OurViewChange(view)` - - Update `our_view` to `view` - diff --git a/roadmap/implementers-guide/src/types/overseer-protocol.md b/roadmap/implementers-guide/src/types/overseer-protocol.md index cedf01d3f881..2eb94c42fdce 100644 --- a/roadmap/implementers-guide/src/types/overseer-protocol.md +++ b/roadmap/implementers-guide/src/types/overseer-protocol.md @@ -136,13 +136,27 @@ This is a network protocol that receives messages of type [`AvailabilityDistribu ```rust enum AvailabilityDistributionMessage { - /// Distribute an availability chunk to other validators. - DistributeChunk(Hash, ErasureChunk), - /// Fetch an erasure chunk from network by candidate hash and chunk index. - FetchChunk(Hash, u32), - /// Event from the network. - /// An update on network state from the network bridge. - NetworkBridgeUpdateV1(NetworkBridgeEvent), + /// Incoming network request for an availability chunk. + ChunkFetchingRequest(IncomingRequest), + /// Incoming network request for a seconded PoV. + PoVFetchingRequest(IncomingRequest), + /// Instruct availability distribution to fetch a remote PoV. + /// + /// NOTE: The result of this fetch is not yet locally validated and could be bogus. + FetchPoV { + /// The relay parent giving the necessary context. + relay_parent: Hash, + /// Validator to fetch the PoV from. + from_validator: ValidatorIndex, + /// Candidate hash to fetch the PoV for. + candidate_hash: CandidateHash, + /// Expected hash of the PoV, a PoV not matching this hash will be rejected. + pov_hash: Hash, + /// Sender for getting back the result of this fetch. + /// + /// The sender will be canceled if the fetching failed for some reason. + tx: oneshot::Sender, + }, } ``` From a0609e7cbab3dd77e185d7a739747a4eabaec4a3 Mon Sep 17 00:00:00 2001 From: Robert Klotzner Date: Sun, 28 Mar 2021 16:05:53 +0200 Subject: [PATCH 30/30] Spaces. --- node/subsystem/src/messages.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index 6fbf5f89d56a..a8fcf9c3e85a 100644 --- a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -755,7 +755,7 @@ impl From> for AllMessages } } impl From> for AllMessages { - fn from(req: IncomingRequest) -> Self { - From::::from(From::from(req)) - } + fn from(req: IncomingRequest) -> Self { + From::::from(From::from(req)) + } }