diff --git a/Cargo.lock b/Cargo.lock index 55362d0a0147..3ed9d4ca2b2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5627,6 +5627,7 @@ dependencies = [ "sp-core", "sp-keyring", "sp-keystore", + "sp-tracing", "thiserror", "tracing", ] @@ -5778,6 +5779,7 @@ dependencies = [ "polkadot-primitives", "sc-network", "strum", + "thiserror", ] [[package]] @@ -5926,25 +5928,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "polkadot-pov-distribution" -version = "0.1.0" -dependencies = [ - "assert_matches", - "env_logger 0.8.2", - "futures 0.3.13", - "log", - "polkadot-node-network-protocol", - "polkadot-node-subsystem", - "polkadot-node-subsystem-test-helpers", - "polkadot-node-subsystem-util", - "polkadot-primitives", - "sp-core", - "sp-keyring", - "thiserror", - "tracing", -] - [[package]] name = "polkadot-primitives" version = "0.8.30" @@ -6231,7 +6214,6 @@ dependencies = [ "polkadot-node-subsystem-util", "polkadot-overseer", "polkadot-parachain", - "polkadot-pov-distribution", "polkadot-primitives", "polkadot-rpc", "polkadot-runtime", diff --git a/Cargo.toml b/Cargo.toml index cea3037cf4e6..7d97405d09e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,6 @@ members = [ "node/core/runtime-api", "node/network/approval-distribution", "node/network/bridge", - "node/network/pov-distribution", "node/network/protocol", "node/network/statement-distribution", "node/network/bitfield-distribution", diff --git a/node/core/backing/Cargo.toml b/node/core/backing/Cargo.toml index a6b04c03ae92..d912143b9bcd 100644 --- a/node/core/backing/Cargo.toml +++ b/node/core/backing/Cargo.toml @@ -22,6 +22,7 @@ sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } sp-application-crypto = { git = "https://github.com/paritytech/substrate", branch = "master" } sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" } sc-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-tracing = { git = "https://github.com/paritytech/substrate", branch = "master" } futures = { version = "0.3.12", features = ["thread-pool"] } assert_matches = "1.4.0" polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } diff --git a/node/core/backing/src/lib.rs b/node/core/backing/src/lib.rs index 0e1a8ec8f17e..8121f0dd83ec 100644 --- a/node/core/backing/src/lib.rs +++ b/node/core/backing/src/lib.rs @@ -35,13 +35,13 @@ use polkadot_node_primitives::{ Statement, SignedFullStatement, ValidationResult, }; use polkadot_subsystem::{ - PerLeafSpan, Stage, - jaeger, + PerLeafSpan, Stage, jaeger, messages::{ - AllMessages, AvailabilityStoreMessage, CandidateBackingMessage, CandidateSelectionMessage, - CandidateValidationMessage, PoVDistributionMessage, ProvisionableData, - ProvisionerMessage, StatementDistributionMessage, ValidationFailed, RuntimeApiRequest, - }, + AllMessages, AvailabilityDistributionMessage, AvailabilityStoreMessage, + CandidateBackingMessage, CandidateSelectionMessage, CandidateValidationMessage, + ProvisionableData, ProvisionerMessage, RuntimeApiRequest, + StatementDistributionMessage, ValidationFailed + } }; use polkadot_node_subsystem_util::{ self as util, @@ -76,8 +76,8 @@ enum Error { InvalidSignature, #[error("Failed to send candidates {0:?}")] Send(Vec), - #[error("FetchPoV channel closed before receipt")] - FetchPoV(#[source] oneshot::Canceled), + #[error("FetchPoV failed")] + FetchPoV, #[error("ValidateFromChainState channel closed before receipt")] ValidateFromChainState(#[source] oneshot::Canceled), 
#[error("StoreAvailableData channel closed before receipt")] @@ -94,11 +94,25 @@ enum Error { UtilError(#[from] util::Error), } +/// PoV data to validate. +enum PoVData { + /// Allready available (from candidate selection). + Ready(Arc), + /// Needs to be fetched from validator (we are checking a signed statement). + FetchFromValidator { + from_validator: ValidatorIndex, + candidate_hash: CandidateHash, + pov_hash: Hash, + }, +} + enum ValidatedCandidateCommand { - // We were instructed to second the candidate. + // We were instructed to second the candidate that has been already validated. Second(BackgroundValidationResult), // We were instructed to validate the candidate. Attest(BackgroundValidationResult), + // We were not able to `Attest` because backing validator did not send us the PoV. + AttestNoPoV(CandidateHash), } impl std::fmt::Debug for ValidatedCandidateCommand { @@ -109,6 +123,8 @@ impl std::fmt::Debug for ValidatedCandidateCommand { write!(f, "Second({})", candidate_hash), ValidatedCandidateCommand::Attest(_) => write!(f, "Attest({})", candidate_hash), + ValidatedCandidateCommand::AttestNoPoV(_) => + write!(f, "Attest({})", candidate_hash), } } } @@ -120,6 +136,7 @@ impl ValidatedCandidateCommand { ValidatedCandidateCommand::Second(Err(ref candidate)) => candidate.hash(), ValidatedCandidateCommand::Attest(Ok((ref candidate, _, _))) => candidate.hash(), ValidatedCandidateCommand::Attest(Err(ref candidate)) => candidate.hash(), + ValidatedCandidateCommand::AttestNoPoV(candidate_hash) => candidate_hash, } } } @@ -140,6 +157,8 @@ struct CandidateBackingJob { issued_statements: HashSet, /// These candidates are undergoing validation in the background. awaiting_validation: HashSet, + /// Data needed for retrying in case of `ValidatedCandidateCommand::AttestNoPoV`. + fallbacks: HashMap)>, /// `Some(h)` if this job has already issued `Seconded` statement for some candidate with `h` hash. seconded: Option, /// The candidates that are includable, by hash. Each entry here indicates @@ -153,6 +172,23 @@ struct CandidateBackingJob { metrics: Metrics, } +/// In case a backing validator does not provide a PoV, we need to retry with other backing +/// validators. +/// +/// This is the data needed to accomplish this. Basically all the data needed for spawning a +/// validation job and a list of backing validators, we can try. +#[derive(Clone)] +struct AttestingData { + /// The candidate to attest. + candidate: CandidateReceipt, + /// Hash of the PoV we need to fetch. + pov_hash: Hash, + /// Validator we are currently trying to get the PoV from. + from_validator: ValidatorIndex, + /// Other backing validators we can try in case `from_validator` failed. 
+ backing: Vec, +} + const fn group_quorum(n_validators: usize) -> usize { (n_validators / 2) + 1 } @@ -336,18 +372,27 @@ async fn make_pov_available( Ok(Ok(())) } -async fn request_pov_from_distribution( +async fn request_pov( tx_from: &mut mpsc::Sender, - parent: Hash, - descriptor: CandidateDescriptor, + relay_parent: Hash, + from_validator: ValidatorIndex, + candidate_hash: CandidateHash, + pov_hash: Hash, ) -> Result, Error> { - let (tx, rx) = oneshot::channel(); - tx_from.send(AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(parent, descriptor, tx) - ).into()).await?; + let (tx, rx) = oneshot::channel(); + tx_from.send(FromJobCommand::SendMessage(AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + from_validator, + candidate_hash, + pov_hash, + tx, + } + ))).await?; - rx.await.map_err(Error::FetchPoV) + let pov = rx.await.map_err(|_| Error::FetchPoV)?; + Ok(Arc::new(pov)) } async fn request_candidate_validation( @@ -380,7 +425,7 @@ struct BackgroundValidationParams { tx_command: mpsc::Sender, candidate: CandidateReceipt, relay_parent: Hash, - pov: Option>, + pov: PoVData, validator_index: Option, n_validators: usize, span: Option, @@ -403,14 +448,27 @@ async fn validate_and_make_available( } = params; let pov = match pov { - Some(pov) => pov, - None => { + PoVData::Ready(pov) => pov, + PoVData::FetchFromValidator { + from_validator, + candidate_hash, + pov_hash, + } => { let _span = span.as_ref().map(|s| s.child("request-pov")); - request_pov_from_distribution( - &mut tx_from, - relay_parent, - candidate.descriptor.clone(), - ).await? + match request_pov( + &mut tx_from, + relay_parent, + from_validator, + candidate_hash, + pov_hash, + ).await { + Err(Error::FetchPoV) => { + tx_command.send(ValidatedCandidateCommand::AttestNoPoV(candidate.hash())).await.map_err(Error::Mpsc)?; + return Ok(()) + } + Err(err) => return Err(err), + Ok(pov) => pov, + } } }; @@ -526,7 +584,7 @@ impl CandidateBackingJob { match command { ValidatedCandidateCommand::Second(res) => { match res { - Ok((candidate, commitments, pov)) => { + Ok((candidate, commitments, _)) => { // sanity check. if self.seconded.is_none() && !self.issued_statements.contains(&candidate_hash) { self.seconded = Some(candidate_hash); @@ -543,7 +601,6 @@ impl CandidateBackingJob { ).await? { self.issue_candidate_seconded_message(stmt).await?; } - self.distribute_pov(candidate.descriptor, pov).await?; } } Err(candidate) => { @@ -552,6 +609,8 @@ impl CandidateBackingJob { } } ValidatedCandidateCommand::Attest(res) => { + // We are done - avoid new validation spawns: + self.fallbacks.remove(&candidate_hash); // sanity check. if !self.issued_statements.contains(&candidate_hash) { if res.is_ok() { @@ -561,6 +620,24 @@ impl CandidateBackingJob { self.issued_statements.insert(candidate_hash); } } + ValidatedCandidateCommand::AttestNoPoV(candidate_hash) => { + if let Some((attesting, span)) = self.fallbacks.get_mut(&candidate_hash) { + if let Some(index) = attesting.backing.pop() { + attesting.from_validator = index; + // Ok, another try: + let c_span = span.as_ref().map(|s| s.child("try")); + let attesting = attesting.clone(); + self.kick_off_validation_work(attesting, c_span).await? + } + + } else { + tracing::warn!( + target: LOG_TARGET, + "AttestNoPoV was triggered without fallback being available." 
+ ); + debug_assert!(false); + } + } } Ok(()) @@ -574,12 +651,11 @@ impl CandidateBackingJob { >, ) -> Result<(), Error> { let candidate_hash = params.candidate.hash(); - if self.awaiting_validation.insert(candidate_hash) { // spawn background task. let bg = async move { if let Err(e) = validate_and_make_available(params).await { - tracing::error!("Failed to validate and make available: {:?}", e); + tracing::error!(target: LOG_TARGET, "Failed to validate and make available: {:?}", e); } }; self.tx_from.send(FromJobCommand::Spawn("Backing Validation", bg.boxed())).await?; @@ -644,7 +720,7 @@ impl CandidateBackingJob { tx_command: self.background_validation_tx.clone(), candidate: candidate.clone(), relay_parent: self.parent, - pov: Some(pov), + pov: PoVData::Ready(pov), validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, @@ -817,28 +893,23 @@ impl CandidateBackingJob { } /// Kick off validation work and distribute the result as a signed statement. - #[tracing::instrument(level = "trace", skip(self, span), fields(subsystem = LOG_TARGET))] + #[tracing::instrument(level = "trace", skip(self, attesting, span), fields(subsystem = LOG_TARGET))] async fn kick_off_validation_work( &mut self, - summary: TableSummary, + attesting: AttestingData, span: Option, ) -> Result<(), Error> { - let candidate_hash = summary.candidate; - + let candidate_hash = attesting.candidate.hash(); if self.issued_statements.contains(&candidate_hash) { return Ok(()) } - // We clone the commitments here because there are borrowck - // errors relating to this being a struct and methods borrowing the entirety of self - // and not just those things that the function uses. - let candidate = self.table.get_candidate(&candidate_hash).ok_or(Error::CandidateNotFound)?.to_plain(); - let descriptor = candidate.descriptor().clone(); + let descriptor = attesting.candidate.descriptor().clone(); tracing::debug!( target: LOG_TARGET, candidate_hash = ?candidate_hash, - candidate_receipt = ?candidate, + candidate_receipt = ?attesting.candidate, "Kicking off validation", ); @@ -854,12 +925,18 @@ impl CandidateBackingJob { return Ok(()); } + let pov = PoVData::FetchFromValidator { + from_validator: attesting.from_validator, + candidate_hash, + pov_hash: attesting.pov_hash, + }; + self.background_validate_and_make_available(BackgroundValidationParams { tx_from: self.tx_from.clone(), tx_command: self.background_validation_tx.clone(), - candidate, + candidate: attesting.candidate, relay_parent: self.parent, - pov: None, + pov, validator_index: self.table_context.validator.as_ref().map(|v| v.index()), n_validators: self.table_context.validators.len(), span, @@ -876,19 +953,57 @@ impl CandidateBackingJob { statement: SignedFullStatement, ) -> Result<(), Error> { if let Some(summary) = self.import_statement(&statement, parent_span).await? 
{ - if let Statement::Seconded(_) = statement.payload() { - if Some(summary.group_id) == self.assignment { + if Some(summary.group_id) != self.assignment { + return Ok(()) + } + let (attesting, span) = match statement.payload() { + Statement::Seconded(receipt) => { + let candidate_hash = summary.candidate; + let span = self.get_unbacked_validation_child( root_span, summary.candidate, summary.group_id, ); - self.kick_off_validation_work(summary, span).await?; + let attesting = AttestingData { + candidate: self.table.get_candidate(&candidate_hash).ok_or(Error::CandidateNotFound)?.to_plain(), + pov_hash: receipt.descriptor.pov_hash, + from_validator: statement.validator_index(), + backing: Vec::new(), + }; + let child = span.as_ref().map(|s| s.child("try")); + self.fallbacks.insert(summary.candidate, (attesting.clone(), span)); + (attesting, child) } - } - } + Statement::Valid(candidate_hash) => { + if let Some((attesting, span)) = self.fallbacks.get_mut(candidate_hash) { + + let our_index = self.table_context.validator.as_ref().map(|v| v.index()); + if our_index == Some(statement.validator_index()) { + return Ok(()) + } + + if self.awaiting_validation.contains(candidate_hash) { + // Job already running: + attesting.backing.push(statement.validator_index()); + return Ok(()) + } else { + // No job, so start another try with current validator: + attesting.from_validator = statement.validator_index(); + (attesting.clone(), span.as_ref().map(|s| s.child("try"))) + } + } else { + return Ok(()) + } + } + }; + self.kick_off_validation_work( + attesting, + span, + ).await?; + } Ok(()) } @@ -981,16 +1096,6 @@ impl CandidateBackingJob { Ok(()) } - async fn distribute_pov( - &mut self, - descriptor: CandidateDescriptor, - pov: Arc, - ) -> Result<(), Error> { - self.tx_from.send(AllMessages::from( - PoVDistributionMessage::DistributePoV(self.parent, descriptor, pov), - ).into()).await.map_err(Into::into) - } - async fn distribute_signed_statement(&mut self, s: SignedFullStatement) -> Result<(), Error> { let smsg = StatementDistributionMessage::Share(self.parent, s); @@ -1131,6 +1236,7 @@ impl util::JobTrait for CandidateBackingJob { required_collator, issued_statements: HashSet::new(), awaiting_validation: HashSet::new(), + fallbacks: HashMap::new(), seconded: None, unbacked_candidates: HashMap::new(), backed: HashSet::new(), @@ -1565,15 +1671,6 @@ mod tests { } ); - assert_matches!( - virtual_overseer.recv().await, - AllMessages::PoVDistribution(PoVDistributionMessage::DistributePoV(hash, descriptor, pov_received)) => { - assert_eq!(test_state.relay_parent, hash); - assert_eq!(candidate.descriptor, descriptor); - assert_eq!(pov, *pov_received); - } - ); - virtual_overseer.send(FromOverseer::Signal( OverseerSignal::ActiveLeaves(ActiveLeavesUpdate::stop_work(test_state.relay_parent))) ).await; @@ -1644,14 +1741,17 @@ mod tests { virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; // Sending a `Statement::Seconded` for our assignment will start - // validation process. The first thing requested is PoV from the - // `PoVDistribution`. + // validation process. The first thing requested is the PoV. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. 
+ } ) if relay_parent == test_state.relay_parent => { - tx.send(Arc::new(pov.clone())).unwrap(); + tx.send(pov.clone()).unwrap(); } ); @@ -1797,10 +1897,14 @@ mod tests { // `PoVDistribution`. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } ) if relay_parent == test_state.relay_parent => { - tx.send(Arc::new(pov.clone())).unwrap(); + tx.send(pov.clone()).unwrap(); } ); @@ -1936,10 +2040,14 @@ mod tests { assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } ) if relay_parent == test_state.relay_parent => { - tx.send(Arc::new(pov.clone())).unwrap(); + tx.send(pov.clone()).unwrap(); } ); @@ -2211,11 +2319,14 @@ mod tests { // Subsystem requests PoV and requests validation. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) - ) => { - assert_eq!(relay_parent, test_state.relay_parent); - tx.send(Arc::new(pov.clone())).unwrap(); + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + tx.send(pov.clone()).unwrap(); } ); @@ -2331,11 +2442,14 @@ mod tests { // Subsystem requests PoV and requests validation. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::FetchPoV(relay_parent, _, tx) - ) => { - assert_eq!(relay_parent, test_state.relay_parent); - tx.send(Arc::new(pov.clone())).unwrap(); + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + tx.send(pov.clone()).unwrap(); } ); @@ -2552,4 +2666,157 @@ mod tests { assert_eq!(backed.validator_indices, expected_bitvec); assert_eq!(backed.validity_votes, expected_attestations); } + + // Test whether we retry on failed PoV fetching. 
+ #[test] + fn retry_works() { + sp_tracing::try_init_simple(); + let test_state = TestState::default(); + test_harness(test_state.keystore.clone(), |test_harness| async move { + let TestHarness { mut virtual_overseer } = test_harness; + + test_startup(&mut virtual_overseer, &test_state).await; + + let pov = PoV { + block_data: BlockData(vec![42, 43, 44]), + }; + + let pov_hash = pov.hash(); + + let candidate = TestCandidateBuilder { + para_id: test_state.chain_ids[0], + relay_parent: test_state.relay_parent, + pov_hash, + erasure_root: make_erasure_root(&test_state, pov.clone()), + ..Default::default() + }.build(); + + let public2 = CryptoStore::sr25519_generate_new( + &*test_state.keystore, + ValidatorId::ID, Some(&test_state.validators[2].to_seed()) + ).await.expect("Insert key into keystore"); + let public3 = CryptoStore::sr25519_generate_new( + &*test_state.keystore, + ValidatorId::ID, + Some(&test_state.validators[3].to_seed()), + ).await.expect("Insert key into keystore"); + let public5 = CryptoStore::sr25519_generate_new( + &*test_state.keystore, + ValidatorId::ID, + Some(&test_state.validators[5].to_seed()), + ).await.expect("Insert key into keystore"); + let signed_a = SignedFullStatement::sign( + &test_state.keystore, + Statement::Seconded(candidate.clone()), + &test_state.signing_context, + ValidatorIndex(2), + &public2.into(), + ).await.ok().flatten().expect("should be signed"); + let signed_b = SignedFullStatement::sign( + &test_state.keystore, + Statement::Valid(candidate.hash()), + &test_state.signing_context, + ValidatorIndex(3), + &public3.into(), + ).await.ok().flatten().expect("should be signed"); + let signed_c = SignedFullStatement::sign( + &test_state.keystore, + Statement::Valid(candidate.hash()), + &test_state.signing_context, + ValidatorIndex(5), + &public5.into(), + ).await.ok().flatten().expect("should be signed"); + + // Send in a `Statement` with a candidate. + let statement = CandidateBackingMessage::Statement( + test_state.relay_parent, + signed_a.clone(), + ); + virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; + + // Subsystem requests PoV and requests validation. + // We cancel - should mean retry on next backing statement. + assert_matches!( + virtual_overseer.recv().await, + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + std::mem::drop(tx); + } + ); + + let statement = CandidateBackingMessage::Statement( + test_state.relay_parent, + signed_b.clone(), + ); + virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; + + let statement = CandidateBackingMessage::Statement( + test_state.relay_parent, + signed_c.clone(), + ); + virtual_overseer.send(FromOverseer::Communication{ msg: statement }).await; + + // Not deterministic which message comes first: + for _ in 0..2 { + match virtual_overseer.recv().await { + AllMessages::Provisioner( + ProvisionerMessage::ProvisionableData( + _, + ProvisionableData::BackedCandidate(CandidateReceipt { + descriptor, + .. + }) + ) + ) => { + assert_eq!(descriptor, candidate.descriptor); + } + // Subsystem requests PoV and requests validation. + // We cancel once more: + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. 
+ } + ) if relay_parent == test_state.relay_parent => { + std::mem::drop(tx); + } + msg => { + assert!(false, "Unexpected message: {:?}", msg); + } + } + } + + // Subsystem requests PoV and requests validation. + // Now we pass. + assert_matches!( + virtual_overseer.recv().await, + AllMessages::AvailabilityDistribution( + AvailabilityDistributionMessage::FetchPoV { + relay_parent, + tx, + .. + } + ) if relay_parent == test_state.relay_parent => { + tx.send(pov.clone()).unwrap(); + } + ); + + assert_matches!( + virtual_overseer.recv().await, + AllMessages::CandidateValidation( + CandidateValidationMessage::ValidateFromChainState( + c, + pov, + _tx, + ) + ) if pov == pov && &c == candidate.descriptor() + ); + }); + } } diff --git a/node/network/availability-distribution/src/error.rs b/node/network/availability-distribution/src/error.rs index 9960e7f1d7d0..c38294542007 100644 --- a/node/network/availability-distribution/src/error.rs +++ b/node/network/availability-distribution/src/error.rs @@ -17,20 +17,26 @@ //! Error handling related code and Error/Result definitions. +use polkadot_node_network_protocol::request_response::request::RequestError; use thiserror::Error; use futures::channel::oneshot; use polkadot_node_subsystem_util::Error as UtilError; -use polkadot_primitives::v1::SessionIndex; +use polkadot_primitives::v1::{CompressedPoVError, SessionIndex}; use polkadot_subsystem::{errors::RuntimeApiError, SubsystemError}; +use crate::LOG_TARGET; + /// Errors of this subsystem. #[derive(Debug, Error)] pub enum Error { - #[error("Response channel to obtain QueryChunk failed")] + #[error("Response channel to obtain chunk failed")] QueryChunkResponseChannel(#[source] oneshot::Canceled), + #[error("Response channel to obtain available data failed")] + QueryAvailableDataResponseChannel(#[source] oneshot::Canceled), + #[error("Receive channel closed")] IncomingMessageChannel(#[source] SubsystemError), @@ -53,24 +59,43 @@ pub enum Error { /// Sending response failed. #[error("Sending a request's response failed.")] SendResponse, -} -/// Error that we should handle gracefully by logging it. -#[derive(Debug)] -pub enum NonFatalError { /// Some request to utility functions failed. /// This can be either `RuntimeRequestCanceled` or `RuntimeApiError`. + #[error("Utility request failed")] UtilRequest(UtilError), /// Runtime API subsystem is down, which means we're shutting down. + #[error("Runtime request canceled")] RuntimeRequestCanceled(oneshot::Canceled), /// Some request to the runtime failed. /// For example if we prune a block we're requesting info about. + #[error("Runtime API error")] RuntimeRequest(RuntimeApiError), /// We tried fetching a session info which was not available. + #[error("There was no session with the given index")] NoSuchSession(SessionIndex), + + /// Decompressing PoV failed. + #[error("PoV could not be decompressed")] + PoVDecompression(CompressedPoVError), + + /// Fetching PoV failed with `RequestError`. + #[error("FetchPoV request error")] + FetchPoV(#[source] RequestError), + + /// Fetching PoV failed as the received PoV did not match the expected hash. + #[error("Fetched PoV does not match expected hash")] + UnexpectedPoV, + + #[error("Remote responded with `NoSuchPoV`")] + NoSuchPoV, + + /// No validator with the index could be found in current session. 
+ #[error("Given validator index could not be found")] + InvalidValidatorIndex, } pub type Result = std::result::Result; @@ -87,9 +112,20 @@ pub(crate) async fn recv_runtime( oneshot::Receiver>, UtilError, >, -) -> std::result::Result { - r.map_err(NonFatalError::UtilRequest)? +) -> std::result::Result { + r.map_err(Error::UtilRequest)? .await - .map_err(NonFatalError::RuntimeRequestCanceled)? - .map_err(NonFatalError::RuntimeRequest) + .map_err(Error::RuntimeRequestCanceled)? + .map_err(Error::RuntimeRequest) +} + + +/// Utility for eating top level errors and log them. +/// +/// We basically always want to try and continue on error. This utility function is meant to +/// consume top-level errors by simply logging them +pub fn log_error(result: Result<()>, ctx: &'static str) { + if let Err(error) = result { + tracing::warn!(target: LOG_TARGET, error = ?error, ctx); + } } diff --git a/node/network/availability-distribution/src/lib.rs b/node/network/availability-distribution/src/lib.rs index 3dd7b4e69c0d..775ad6f793c1 100644 --- a/node/network/availability-distribution/src/lib.rs +++ b/node/network/availability-distribution/src/lib.rs @@ -26,15 +26,23 @@ use polkadot_subsystem::{ /// Error and [`Result`] type for this subsystem. mod error; pub use error::Error; -use error::Result; +use error::{Result, log_error}; + +/// Runtime requests. +mod runtime; +use runtime::Runtime; /// `Requester` taking care of requesting chunks for candidates pending availability. mod requester; use requester::Requester; +/// Handing requests for PoVs during backing. +mod pov_requester; +use pov_requester::PoVRequester; + /// Responding to erasure chunk requests: mod responder; -use responder::answer_request_log; +use responder::{answer_chunk_request_log, answer_pov_request_log}; /// Cache for session information. mod session_cache; @@ -52,6 +60,8 @@ const LOG_TARGET: &'static str = "parachain::availability-distribution"; pub struct AvailabilityDistributionSubsystem { /// Pointer to a keystore, which is required for determining this nodes validator index. keystore: SyncCryptoStorePtr, + /// Easy and efficient runtime access for this subsystem. + runtime: Runtime, /// Prometheus metrics. metrics: Metrics, } @@ -74,17 +84,20 @@ where } impl AvailabilityDistributionSubsystem { + /// Create a new instance of the availability distribution. pub fn new(keystore: SyncCryptoStorePtr, metrics: Metrics) -> Self { - Self { keystore, metrics } + let runtime = Runtime::new(keystore.clone()); + Self { keystore, runtime, metrics } } /// Start processing work as passed on from the Overseer. - async fn run(self, mut ctx: Context) -> Result<()> + async fn run(mut self, mut ctx: Context) -> Result<()> where Context: SubsystemContext + Sync + Send, { let mut requester = Requester::new(self.keystore.clone(), self.metrics.clone()).fuse(); + let mut pov_requester = PoVRequester::new(); loop { let action = { let mut subsystem_next = ctx.recv().fuse(); @@ -107,14 +120,14 @@ impl AvailabilityDistributionSubsystem { }; match message { FromOverseer::Signal(OverseerSignal::ActiveLeaves(update)) => { - // Update the relay chain heads we are fetching our pieces for: - if let Some(e) = requester - .get_mut() - .update_fetching_heads(&mut ctx, update) - .await? 
- { - tracing::debug!(target: LOG_TARGET, "Error processing ActiveLeavesUpdate: {:?}", e); - } + log_error( + pov_requester.update_connected_validators(&mut ctx, &mut self.runtime, &update).await, + "PoVRequester::update_connected_validators" + ); + log_error( + requester.get_mut().update_fetching_heads(&mut ctx, update).await, + "Error in Requester::update_fetching_heads" + ); } FromOverseer::Signal(OverseerSignal::BlockFinalized(..)) => {} FromOverseer::Signal(OverseerSignal::Conclude) => { @@ -123,7 +136,34 @@ impl AvailabilityDistributionSubsystem { FromOverseer::Communication { msg: AvailabilityDistributionMessage::ChunkFetchingRequest(req), } => { - answer_request_log(&mut ctx, req, &self.metrics).await + answer_chunk_request_log(&mut ctx, req, &self.metrics).await + } + FromOverseer::Communication { + msg: AvailabilityDistributionMessage::PoVFetchingRequest(req), + } => { + answer_pov_request_log(&mut ctx, req, &self.metrics).await + } + FromOverseer::Communication { + msg: AvailabilityDistributionMessage::FetchPoV { + relay_parent, + from_validator, + candidate_hash, + pov_hash, + tx, + }, + } => { + log_error( + pov_requester.fetch_pov( + &mut ctx, + &mut self.runtime, + relay_parent, + from_validator, + candidate_hash, + pov_hash, + tx, + ).await, + "PoVRequester::fetch_pov" + ); } } } diff --git a/node/network/availability-distribution/src/metrics.rs b/node/network/availability-distribution/src/metrics.rs index c07500996fa2..925bbc8fe430 100644 --- a/node/network/availability-distribution/src/metrics.rs +++ b/node/network/availability-distribution/src/metrics.rs @@ -24,7 +24,7 @@ pub const SUCCEEDED: &'static str = "succeeded"; /// Label for fail counters. pub const FAILED: &'static str = "failed"; -/// Label for chunks that could not be served, because they were not available. +/// Label for chunks/PoVs that could not be served, because they were not available. pub const NOT_FOUND: &'static str = "not-found"; /// Availability Distribution metrics. @@ -47,6 +47,12 @@ struct MetricsInner { /// to a chunk request. This includes `NoSuchChunk` responses. served_chunks: CounterVec, + /// Number of PoVs served. + /// + /// Note: Right now, `Succeeded` gets incremented whenever we were able to successfully respond + /// to a PoV request. This includes `NoSuchPoV` responses. + served_povs: CounterVec, + /// Number of times our first set of validators did not provide the needed chunk and we had to /// query further validators. retries: Counter, @@ -66,12 +72,19 @@ impl Metrics { } /// Increment counter on served chunks. - pub fn on_served(&self, label: &'static str) { + pub fn on_served_chunk(&self, label: &'static str) { if let Some(metrics) = &self.0 { metrics.served_chunks.with_label_values(&[label]).inc() } } + /// Increment counter on served PoVs. + pub fn on_served_pov(&self, label: &'static str) { + if let Some(metrics) = &self.0 { + metrics.served_povs.with_label_values(&[label]).inc() + } + } + /// Increment retry counter. 
pub fn on_retry(&self) { if let Some(metrics) = &self.0 { @@ -103,6 +116,16 @@ impl metrics::Metrics for Metrics { )?, registry, )?, + served_povs: prometheus::register( + CounterVec::new( + Opts::new( + "parachain_served_povs_total", + "Total number of povs served by this backer.", + ), + &["success"] + )?, + registry, + )?, retries: prometheus::register( Counter::new( "parachain_fetch_retries_total", diff --git a/node/network/availability-distribution/src/pov_requester/mod.rs b/node/network/availability-distribution/src/pov_requester/mod.rs new file mode 100644 index 000000000000..2a3ed95e3b51 --- /dev/null +++ b/node/network/availability-distribution/src/pov_requester/mod.rs @@ -0,0 +1,333 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! PoV requester takes care of requesting PoVs from validators of a backing group. + +use futures::{FutureExt, channel::{mpsc, oneshot}, future::BoxFuture}; +use lru::LruCache; + +use polkadot_subsystem::jaeger; +use polkadot_node_network_protocol::{ + PeerId, peer_set::PeerSet, + request_response::{OutgoingRequest, Recipient, request::{RequestError, Requests}, + v1::{PoVFetchingRequest, PoVFetchingResponse}} +}; +use polkadot_primitives::v1::{ + AuthorityDiscoveryId, CandidateHash, Hash, PoV, SessionIndex, ValidatorIndex +}; +use polkadot_subsystem::{ + ActiveLeavesUpdate, SubsystemContext, ActivatedLeaf, + messages::{AllMessages, NetworkBridgeMessage, IfDisconnected} +}; + +use crate::{error::{Error, log_error}, runtime::{Runtime, ValidatorInfo}}; + +/// Number of sessions we want to keep in the LRU. +const NUM_SESSIONS: usize = 2; + +pub struct PoVRequester { + /// We only ever care about being connected to validators of at most two sessions. + /// + /// So we keep an LRU for managing connection requests of size 2. + /// Cache will contain `None` if we are not a validator in that session. + connected_validators: LruCache>>, +} + +impl PoVRequester { + /// Create a new requester for PoVs. + pub fn new() -> Self { + Self { + connected_validators: LruCache::new(NUM_SESSIONS), + } + } + + /// Make sure we are connected to the right set of validators. + /// + /// On every `ActiveLeavesUpdate`, we check whether we are connected properly to our current + /// validator group. + pub async fn update_connected_validators( + &mut self, + ctx: &mut Context, + runtime: &mut Runtime, + update: &ActiveLeavesUpdate, + ) -> super::Result<()> + where + Context: SubsystemContext, + { + let activated = update.activated.iter().map(|ActivatedLeaf { hash: h, .. 
}| h); + let activated_sessions = + get_activated_sessions(ctx, runtime, activated).await?; + + for (parent, session_index) in activated_sessions { + if self.connected_validators.contains(&session_index) { + continue + } + let rx = connect_to_relevant_validators(ctx, runtime, parent, session_index).await?; + self.connected_validators.put(session_index, rx); + } + Ok(()) + } + + /// Start background worker for taking care of fetching the requested `PoV` from the network. + pub async fn fetch_pov( + &self, + ctx: &mut Context, + runtime: &mut Runtime, + parent: Hash, + from_validator: ValidatorIndex, + candidate_hash: CandidateHash, + pov_hash: Hash, + tx: oneshot::Sender + ) -> super::Result<()> + where + Context: SubsystemContext, + { + let info = &runtime.get_session_info(ctx, parent).await?.session_info; + let authority_id = info.discovery_keys.get(from_validator.0 as usize) + .ok_or(Error::InvalidValidatorIndex)? + .clone(); + let (req, pending_response) = OutgoingRequest::new( + Recipient::Authority(authority_id), + PoVFetchingRequest { + candidate_hash, + }, + ); + let full_req = Requests::PoVFetching(req); + + ctx.send_message( + AllMessages::NetworkBridge( + NetworkBridgeMessage::SendRequests( + vec![full_req], + // We are supposed to be connected to validators of our group via `PeerSet`, + // but at session boundaries that is kind of racy, in case a connection takes + // longer to get established, so we try to connect in any case. + IfDisconnected::TryConnect + ) + )).await; + + let span = jaeger::Span::new(candidate_hash, "fetch-pov") + .with_validator_index(from_validator); + ctx.spawn("pov-fetcher", fetch_pov_job(pov_hash, pending_response.boxed(), span, tx).boxed()) + .await + .map_err(|e| Error::SpawnTask(e)) + } +} + +/// Future to be spawned for taking care of handling reception and sending of PoV. +async fn fetch_pov_job( + pov_hash: Hash, + pending_response: BoxFuture<'static, Result>, + span: jaeger::Span, + tx: oneshot::Sender, +) { + log_error( + do_fetch_pov(pov_hash, pending_response, span, tx).await, + "fetch_pov_job", + ) +} + +/// Do the actual work of waiting for the response. +async fn do_fetch_pov( + pov_hash: Hash, + pending_response: BoxFuture<'static, Result>, + _span: jaeger::Span, + tx: oneshot::Sender, +) + -> super::Result<()> +{ + let response = pending_response.await.map_err(Error::FetchPoV)?; + let pov = match response { + PoVFetchingResponse::PoV(compressed) => { + compressed.decompress().map_err(Error::PoVDecompression)? + } + PoVFetchingResponse::NoSuchPoV => { + return Err(Error::NoSuchPoV) + } + }; + if pov.hash() == pov_hash { + tx.send(pov).map_err(|_| Error::SendResponse) + } else { + Err(Error::UnexpectedPoV) + } +} + +/// Get the session indeces for the given relay chain parents. +async fn get_activated_sessions(ctx: &mut Context, runtime: &mut Runtime, new_heads: impl Iterator) + -> super::Result> +where + Context: SubsystemContext, +{ + let mut sessions = Vec::new(); + for parent in new_heads { + sessions.push((*parent, runtime.get_session_index(ctx, *parent).await?)); + } + Ok(sessions.into_iter()) +} + +/// Connect to validators of our validator group. +async fn connect_to_relevant_validators( + ctx: &mut Context, + runtime: &mut Runtime, + parent: Hash, + session: SessionIndex +) + -> super::Result>> +where + Context: SubsystemContext, +{ + if let Some(validator_ids) = determine_relevant_validators(ctx, runtime, parent, session).await? 
{ + // We don't actually care about `PeerId`s, just keeping receiver so we stay connected: + let (tx, rx) = mpsc::channel(0); + ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ConnectToValidators { + validator_ids, peer_set: PeerSet::Validation, connected: tx + })).await; + Ok(Some(rx)) + } else { + Ok(None) + } +} + +/// Get the validators in our validator group. +/// +/// Return: `None` if not a validator. +async fn determine_relevant_validators( + ctx: &mut Context, + runtime: &mut Runtime, + parent: Hash, + session: SessionIndex, +) + -> super::Result>> +where + Context: SubsystemContext, +{ + let info = runtime.get_session_info_by_index(ctx, parent, session).await?; + if let ValidatorInfo { + our_index: Some(our_index), + our_group: Some(our_group) + } = &info.validator_info { + + let indeces = info.session_info.validator_groups.get(our_group.0 as usize) + .expect("Our group got retrieved from that session info, it must exist. qed.") + .clone(); + Ok(Some( + indeces.into_iter() + .filter(|i| *i != *our_index) + .map(|i| info.session_info.discovery_keys[i.0 as usize].clone()) + .collect() + )) + } else { + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use futures::{executor, future}; + + use parity_scale_codec::Encode; + use sp_core::testing::TaskExecutor; + + use polkadot_primitives::v1::{BlockData, CandidateHash, CompressedPoV, Hash, ValidatorIndex}; + use polkadot_subsystem_testhelpers as test_helpers; + use polkadot_subsystem::messages::{AvailabilityDistributionMessage, RuntimeApiMessage, RuntimeApiRequest}; + + use super::*; + use crate::LOG_TARGET; + use crate::tests::mock::{make_session_info, make_ferdie_keystore}; + + #[test] + fn rejects_invalid_pov() { + sp_tracing::try_init_simple(); + let pov = PoV { + block_data: BlockData(vec![1,2,3,4,5,6]), + }; + test_run(Hash::default(), pov); + } + + #[test] + fn accepts_valid_pov() { + sp_tracing::try_init_simple(); + let pov = PoV { + block_data: BlockData(vec![1,2,3,4,5,6]), + }; + test_run(pov.hash(), pov); + } + + fn test_run(pov_hash: Hash, pov: PoV) { + let requester = PoVRequester::new(); + let pool = TaskExecutor::new(); + let (mut context, mut virtual_overseer) = + test_helpers::make_subsystem_context::(pool.clone()); + let keystore = make_ferdie_keystore(); + let mut runtime = crate::runtime::Runtime::new(keystore); + + let (tx, rx) = oneshot::channel(); + let testee = async { + requester.fetch_pov( + &mut context, + &mut runtime, + Hash::default(), + ValidatorIndex(0), + CandidateHash::default(), + pov_hash, + tx, + ).await.expect("Should succeed"); + }; + + let tester = async move { + loop { + match virtual_overseer.recv().await { + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionIndexForChild(tx) + ) + ) => { + tx.send(Ok(0)).unwrap(); + } + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionInfo(_, tx) + ) + ) => { + tx.send(Ok(Some(make_session_info()))).unwrap(); + } + AllMessages::NetworkBridge(NetworkBridgeMessage::SendRequests(mut reqs, _)) => { + let req = assert_matches!( + reqs.pop(), + Some(Requests::PoVFetching(outgoing)) => {outgoing} + ); + req.pending_response.send(Ok(PoVFetchingResponse::PoV( + CompressedPoV::compress(&pov).unwrap()).encode() + )).unwrap(); + break + }, + msg => tracing::debug!(target: LOG_TARGET, msg = ?msg, "Received msg"), + } + } + if pov.hash() == pov_hash { + assert_eq!(rx.await, Ok(pov)); + } else { + assert_eq!(rx.await, Err(oneshot::Canceled)); + } 
+ }; + futures::pin_mut!(testee); + futures::pin_mut!(tester); + executor::block_on(future::join(testee, tester)); + } +} diff --git a/node/network/availability-distribution/src/requester/fetch_task/mod.rs b/node/network/availability-distribution/src/requester/fetch_task/mod.rs index c4e539ab23da..7b9c39e52b28 100644 --- a/node/network/availability-distribution/src/requester/fetch_task/mod.rs +++ b/node/network/availability-distribution/src/requester/fetch_task/mod.rs @@ -138,7 +138,7 @@ impl FetchTaskConfig { let live_in = vec![leaf].into_iter().collect(); // Don't run tasks for our backing group: - if session_info.our_group == core.group_responsible { + if session_info.our_group == Some(core.group_responsible) { return FetchTaskConfig { live_in, prepared_running: None, diff --git a/node/network/availability-distribution/src/requester/mod.rs b/node/network/availability-distribution/src/requester/mod.rs index ad7b1f036b0c..5516340df3b9 100644 --- a/node/network/availability-distribution/src/requester/mod.rs +++ b/node/network/availability-distribution/src/requester/mod.rs @@ -39,7 +39,7 @@ use polkadot_subsystem::{ }; use super::{error::recv_runtime, session_cache::SessionCache, LOG_TARGET, Metrics}; -use crate::error::NonFatalError; +use crate::error::Error; /// A task fetching a particular chunk. mod fetch_task; @@ -96,7 +96,7 @@ impl Requester { &mut self, ctx: &mut Context, update: ActiveLeavesUpdate, - ) -> super::Result> + ) -> super::Result<()> where Context: SubsystemContext, { @@ -111,9 +111,9 @@ impl Requester { } = update; // Order important! We need to handle activated, prior to deactivated, otherwise we might // cancel still needed jobs. - let err = self.start_requesting_chunks(ctx, activated.into_iter()).await?; + self.start_requesting_chunks(ctx, activated.into_iter()).await?; self.stop_requesting_chunks(deactivated.into_iter()); - Ok(err) + Ok(()) } /// Start requesting chunks for newly imported heads. @@ -121,25 +121,20 @@ impl Requester { &mut self, ctx: &mut Context, new_heads: impl Iterator, - ) -> super::Result> + ) -> super::Result<()> where Context: SubsystemContext, { for ActivatedLeaf { hash: leaf, .. } in new_heads { - let cores = match query_occupied_cores(ctx, leaf).await { - Err(err) => return Ok(Some(err)), - Ok(cores) => cores, - }; + let cores = query_occupied_cores(ctx, leaf).await?; tracing::trace!( target: LOG_TARGET, occupied_cores = ?cores, "Query occupied core" ); - if let Some(err) = self.add_cores(ctx, leaf, cores).await? { - return Ok(Some(err)); - } + self.add_cores(ctx, leaf, cores).await?; } - Ok(None) + Ok(()) } /// Stop requesting chunks for obsolete heads. 
@@ -164,7 +159,7 @@ impl Requester { ctx: &mut Context, leaf: Hash, cores: impl IntoIterator, - ) -> super::Result> + ) -> super::Result<()> where Context: SubsystemContext, { @@ -179,7 +174,7 @@ impl Requester { let tx = self.tx.clone(); let metrics = self.metrics.clone(); - let task_cfg = match self + let task_cfg = self .session_cache .with_session_info( ctx, @@ -189,11 +184,7 @@ impl Requester { leaf, |info| FetchTaskConfig::new(leaf, &core, tx, metrics, info), ) - .await - { - Err(err) => return Ok(Some(err)), - Ok(task_cfg) => task_cfg, - }; + .await?; if let Some(task_cfg) = task_cfg { e.insert(FetchTask::start(task_cfg, ctx).await?); @@ -202,7 +193,7 @@ impl Requester { } } } - Ok(None) + Ok(()) } } @@ -237,7 +228,7 @@ impl Stream for Requester { async fn query_occupied_cores( ctx: &mut Context, relay_parent: Hash, -) -> Result, NonFatalError> +) -> Result, Error> where Context: SubsystemContext, { diff --git a/node/network/availability-distribution/src/responder.rs b/node/network/availability-distribution/src/responder.rs index 8e37c6cf2f9e..3fcdbf2686bf 100644 --- a/node/network/availability-distribution/src/responder.rs +++ b/node/network/availability-distribution/src/responder.rs @@ -19,7 +19,7 @@ use futures::channel::oneshot; use polkadot_node_network_protocol::request_response::{request::IncomingRequest, v1}; -use polkadot_primitives::v1::{CandidateHash, ErasureChunk, ValidatorIndex}; +use polkadot_primitives::v1::{AvailableData, CandidateHash, CompressedPoV, ErasureChunk, ValidatorIndex}; use polkadot_subsystem::{ messages::{AllMessages, AvailabilityStoreMessage}, SubsystemContext, jaeger, @@ -28,10 +28,36 @@ use polkadot_subsystem::{ use crate::error::{Error, Result}; use crate::{LOG_TARGET, metrics::{Metrics, SUCCEEDED, FAILED, NOT_FOUND}}; -/// Variant of `answer_request` that does Prometheus metric and logging on errors. +/// Variant of `answer_pov_request` that does Prometheus metric and logging on errors. +/// +/// Any errors of `answer_pov_request` will simply be logged. +pub async fn answer_pov_request_log( + ctx: &mut Context, + req: IncomingRequest, + metrics: &Metrics, +) +where + Context: SubsystemContext, +{ + let res = answer_pov_request(ctx, req).await; + match res { + Ok(result) => + metrics.on_served_pov(if result {SUCCEEDED} else {NOT_FOUND}), + Err(err) => { + tracing::warn!( + target: LOG_TARGET, + err= ?err, + "Serving PoV failed with error" + ); + metrics.on_served_pov(FAILED); + } + } +} + +/// Variant of `answer_chunk_request` that does Prometheus metric and logging on errors. /// /// Any errors of `answer_request` will simply be logged. -pub async fn answer_request_log( +pub async fn answer_chunk_request_log( ctx: &mut Context, req: IncomingRequest, metrics: &Metrics, @@ -39,33 +65,71 @@ pub async fn answer_request_log( where Context: SubsystemContext, { - let res = answer_request(ctx, req).await; + let res = answer_chunk_request(ctx, req).await; match res { Ok(result) => - metrics.on_served(if result {SUCCEEDED} else {NOT_FOUND}), + metrics.on_served_chunk(if result {SUCCEEDED} else {NOT_FOUND}), Err(err) => { tracing::warn!( target: LOG_TARGET, err= ?err, "Serving chunk failed with error" ); - metrics.on_served(FAILED); + metrics.on_served_chunk(FAILED); } } } +/// Answer an incoming PoV fetch request by querying the av store. +/// +/// Returns: Ok(true) if chunk was found and served. 
+pub async fn answer_pov_request( + ctx: &mut Context, + req: IncomingRequest, +) -> Result +where + Context: SubsystemContext, +{ + let _span = jaeger::Span::new(req.payload.candidate_hash, "answer-pov-request"); + + let av_data = query_available_data(ctx, req.payload.candidate_hash).await?; + + let result = av_data.is_some(); + + let response = match av_data { + None => v1::PoVFetchingResponse::NoSuchPoV, + Some(av_data) => { + let pov = match CompressedPoV::compress(&av_data.pov) { + Ok(pov) => pov, + Err(error) => { + tracing::error!( + target: LOG_TARGET, + error = ?error, + "Failed to create `CompressedPov`", + ); + // this should really not happen, let this request time out: + return Err(Error::PoVDecompression(error)) + } + }; + v1::PoVFetchingResponse::PoV(pov) + } + }; + + req.send_response(response).map_err(|_| Error::SendResponse)?; + Ok(result) +} + /// Answer an incoming chunk request by querying the av store. /// /// Returns: Ok(true) if chunk was found and served. -pub async fn answer_request( +pub async fn answer_chunk_request( ctx: &mut Context, req: IncomingRequest, ) -> Result where Context: SubsystemContext, { - let span = jaeger::Span::new(req.payload.candidate_hash, "answer-request") - .with_stage(jaeger::Stage::AvailabilityDistribution); + let span = jaeger::Span::new(req.payload.candidate_hash, "answer-chunk-request"); let _child_span = span.child("answer-chunk-request") .with_chunk_index(req.payload.index.0); @@ -119,3 +183,21 @@ where Error::QueryChunkResponseChannel(e) }) } + +/// Query PoV from the availability store. +#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))] +async fn query_available_data( + ctx: &mut Context, + candidate_hash: CandidateHash, +) -> Result> +where + Context: SubsystemContext, +{ + let (tx, rx) = oneshot::channel(); + ctx.send_message(AllMessages::AvailabilityStore( + AvailabilityStoreMessage::QueryAvailableData(candidate_hash, tx), + )) + .await; + + rx.await.map_err(|e| Error::QueryAvailableDataResponseChannel(e)) +} diff --git a/node/network/availability-distribution/src/runtime.rs b/node/network/availability-distribution/src/runtime.rs new file mode 100644 index 000000000000..0bc81d3e70f2 --- /dev/null +++ b/node/network/availability-distribution/src/runtime.rs @@ -0,0 +1,197 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Convenient interface to the runtime. 
+ +use lru::LruCache; + +use sp_application_crypto::AppKey; +use sp_core::crypto::Public; +use sp_keystore::{CryptoStore, SyncCryptoStorePtr}; + +use polkadot_node_subsystem_util::{ + request_session_index_for_child_ctx, request_session_info_ctx, +}; +use polkadot_primitives::v1::{GroupIndex, Hash, SessionIndex, SessionInfo, ValidatorId, ValidatorIndex}; +use polkadot_subsystem::SubsystemContext; + +use super::{ + error::recv_runtime, + Error, +}; + +/// Caching of session info as needed by availability distribution. +/// +/// It should be ensured that a cached session stays live in the cache as long as we might need it. +pub struct Runtime { + /// Get the session index for a given relay parent. + /// + /// We query this up to a 100 times per block, so caching it here without roundtrips over the + /// overseer seems sensible. + session_index_cache: LruCache, + + /// Look up cached sessions by SessionIndex. + session_info_cache: LruCache, + + /// Key store for determining whether we are a validator and what `ValidatorIndex` we have. + keystore: SyncCryptoStorePtr, +} + +/// SessionInfo with additional useful data for validator nodes. +pub struct ExtendedSessionInfo { + /// Actual session info as fetched from the runtime. + pub session_info: SessionInfo, + /// Contains useful information about ourselves, in case this node is a validator. + pub validator_info: ValidatorInfo, +} + +/// Information about ourself, in case we are an `Authority`. +/// +/// This data is derived from the `SessionInfo` and our key as found in the keystore. +pub struct ValidatorInfo { + /// The index this very validator has in `SessionInfo` vectors, if any. + pub our_index: Option, + /// The group we belong to, if any. + pub our_group: Option, +} + +impl Runtime { + /// Create a new `Runtime` for convenient runtime fetches. + pub fn new(keystore: SyncCryptoStorePtr) -> Self { + Self { + // 5 relatively conservative, 1 to 2 should suffice: + session_index_cache: LruCache::new(5), + // We need to cache the current and the last session the most: + session_info_cache: LruCache::new(2), + keystore, + } + } + + /// Retrieve the current session index. + pub async fn get_session_index( + &mut self, + ctx: &mut Context, + parent: Hash, + ) -> Result + where + Context: SubsystemContext, + { + match self.session_index_cache.get(&parent) { + Some(index) => Ok(*index), + None => { + let index = + recv_runtime(request_session_index_for_child_ctx(parent, ctx).await) + .await?; + self.session_index_cache.put(parent, index); + Ok(index) + } + } + } + + /// Get `ExtendedSessionInfo` by relay parent hash. + pub async fn get_session_info<'a, Context>( + &'a mut self, + ctx: &mut Context, + parent: Hash, + ) -> Result<&'a ExtendedSessionInfo, Error> + where + Context: SubsystemContext, + { + let session_index = self.get_session_index(ctx, parent).await?; + + self.get_session_info_by_index(ctx, parent, session_index).await + } + + /// Get `ExtendedSessionInfo` by session index. + /// + /// `request_session_info_ctx` still requires the parent to be passed in, so we take the parent + /// in addition to the `SessionIndex`. + pub async fn get_session_info_by_index<'a, Context>( + &'a mut self, + ctx: &mut Context, + parent: Hash, + session_index: SessionIndex, + ) -> Result<&'a ExtendedSessionInfo, Error> + where + Context: SubsystemContext, + { + if !self.session_info_cache.contains(&session_index) { + let session_info = + recv_runtime(request_session_info_ctx(parent, session_index, ctx).await) + .await? 
+ .ok_or(Error::NoSuchSession(session_index))?; + let validator_info = self.get_validator_info(&session_info).await?; + + let full_info = ExtendedSessionInfo { + session_info, + validator_info, + }; + + self.session_info_cache.put(session_index, full_info); + } + Ok( + self.session_info_cache.get(&session_index) + .expect("We just put the value there. qed.") + ) + } + + /// Build `ValidatorInfo` for the current session. + /// + /// + /// Returns: `None` if not a validator. + async fn get_validator_info( + &self, + session_info: &SessionInfo, + ) -> Result + { + if let Some(our_index) = self.get_our_index(&session_info.validators).await { + // Get our group index: + let our_group = session_info.validator_groups + .iter() + .enumerate() + .find_map(|(i, g)| { + g.iter().find_map(|v| { + if *v == our_index { + Some(GroupIndex(i as u32)) + } else { + None + } + }) + } + ); + let info = ValidatorInfo { + our_index: Some(our_index), + our_group, + }; + return Ok(info) + } + return Ok(ValidatorInfo { our_index: None, our_group: None }) + } + + /// Get our `ValidatorIndex`. + /// + /// Returns: None if we are not a validator. + async fn get_our_index(&self, validators: &[ValidatorId]) -> Option { + for (i, v) in validators.iter().enumerate() { + if CryptoStore::has_keys(&*self.keystore, &[(v.to_raw_vec(), ValidatorId::ID)]) + .await + { + return Some(ValidatorIndex(i as u32)); + } + } + None + } +} diff --git a/node/network/availability-distribution/src/session_cache.rs b/node/network/availability-distribution/src/session_cache.rs index a18bf8bcee29..e4e2bce41fc5 100644 --- a/node/network/availability-distribution/src/session_cache.rs +++ b/node/network/availability-distribution/src/session_cache.rs @@ -33,8 +33,7 @@ use polkadot_primitives::v1::{ use polkadot_subsystem::SubsystemContext; use super::{ - error::{recv_runtime, NonFatalError}, - Error, + error::{recv_runtime, Error}, LOG_TARGET, }; @@ -82,7 +81,9 @@ pub struct SessionInfo { /// Remember to which group we belong, so we won't start fetching chunks for candidates with /// our group being responsible. (We should have that chunk already.) - pub our_group: GroupIndex, + /// + /// `None`, if we are not in fact part of any group. + pub our_group: Option, } /// Report of bad validators. @@ -122,7 +123,7 @@ impl SessionCache { ctx: &mut Context, parent: Hash, with_info: F, - ) -> Result, NonFatalError> + ) -> Result, Error> where Context: SubsystemContext, F: FnOnce(&SessionInfo) -> R, @@ -219,7 +220,7 @@ impl SessionCache { ctx: &mut Context, parent: Hash, session_index: SessionIndex, - ) -> Result, NonFatalError> + ) -> Result, Error> where Context: SubsystemContext, { @@ -230,7 +231,7 @@ impl SessionCache { .. } = recv_runtime(request_session_info_ctx(parent, session_index, ctx).await) .await? - .ok_or(NonFatalError::NoSuchSession(session_index))?; + .ok_or(Error::NoSuchSession(session_index))?; if let Some(our_index) = self.get_our_index(validators).await { // Get our group index: @@ -245,8 +246,8 @@ impl SessionCache { None } }) - }) - .expect("Every validator should be in a validator group. qed."); + } + ); // Shuffle validators in groups: let mut rng = thread_rng(); @@ -274,9 +275,9 @@ impl SessionCache { session_index, our_group, }; - return Ok(Some(info)); + return Ok(Some(info)) } - return Ok(None); + return Ok(None) } /// Get our `ValidatorIndex`. 
diff --git a/node/network/availability-distribution/src/tests/mock.rs b/node/network/availability-distribution/src/tests/mock.rs index 9d9896d8fd59..91eff4b4075d 100644 --- a/node/network/availability-distribution/src/tests/mock.rs +++ b/node/network/availability-distribution/src/tests/mock.rs @@ -19,14 +19,29 @@ use std::sync::Arc; +use sc_keystore::LocalKeystore; use sp_keyring::Sr25519Keyring; +use sp_application_crypto::AppKey; use polkadot_erasure_coding::{branches, obtain_chunks_v1 as obtain_chunks}; -use polkadot_primitives::v1::{AvailableData, BlockData, CandidateCommitments, CandidateDescriptor, - CandidateHash, CommittedCandidateReceipt, ErasureChunk, GroupIndex, Hash, HeadData, Id - as ParaId, OccupiedCore, PersistedValidationData, PoV, SessionInfo, - ValidatorIndex +use polkadot_primitives::v1::{ + AvailableData, BlockData, CandidateCommitments, CandidateDescriptor, CandidateHash, + CommittedCandidateReceipt, ErasureChunk, GroupIndex, Hash, HeadData, Id as ParaId, + OccupiedCore, PersistedValidationData, PoV, SessionInfo, ValidatorId, ValidatorIndex }; +use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr}; + +/// Get mock keystore with `Ferdie` key. +pub fn make_ferdie_keystore() -> SyncCryptoStorePtr { + let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory()); + SyncCryptoStore::sr25519_generate_new( + &*keystore, + ValidatorId::ID, + Some(&Sr25519Keyring::Ferdie.to_seed()), + ) + .expect("Insert key into keystore"); + keystore +} /// Create dummy session info with two validator groups. pub fn make_session_info() -> SessionInfo { diff --git a/node/network/availability-distribution/src/tests/state.rs b/node/network/availability-distribution/src/tests/state.rs index 42cd72dff086..99b0b2c830db 100644 --- a/node/network/availability-distribution/src/tests/state.rs +++ b/node/network/availability-distribution/src/tests/state.rs @@ -23,10 +23,7 @@ use smallvec::smallvec; use futures::{FutureExt, channel::oneshot, SinkExt, channel::mpsc, StreamExt}; use futures_timer::Delay; -use sc_keystore::LocalKeystore; -use sp_application_crypto::AppKey; -use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr}; -use sp_keyring::Sr25519Keyring; +use sp_keystore::SyncCryptoStorePtr; use sp_core::{traits::SpawnNamed, testing::TaskExecutor}; use sc_network as network; use sc_network::IfDisconnected; @@ -39,7 +36,7 @@ use polkadot_subsystem::{ActiveLeavesUpdate, FromOverseer, OverseerSignal, Activ } }; use polkadot_primitives::v1::{CandidateHash, CoreState, ErasureChunk, GroupIndex, Hash, Id - as ParaId, ScheduledCore, SessionInfo, ValidatorId, + as ParaId, ScheduledCore, SessionInfo, ValidatorIndex }; use polkadot_node_network_protocol::{jaeger, @@ -48,7 +45,7 @@ use polkadot_node_network_protocol::{jaeger, use polkadot_subsystem_testhelpers as test_helpers; use test_helpers::SingleItemSink; -use super::mock::{make_session_info, OccupiedCoreBuilder, }; +use super::mock::{make_session_info, OccupiedCoreBuilder, make_ferdie_keystore}; use crate::LOG_TARGET; pub struct TestHarness { @@ -83,17 +80,10 @@ impl Default for TestState { let chain_ids = vec![chain_a, chain_b]; - let keystore: SyncCryptoStorePtr = Arc::new(LocalKeystore::in_memory()); + let keystore = make_ferdie_keystore(); let session_info = make_session_info(); - SyncCryptoStore::sr25519_generate_new( - &*keystore, - ValidatorId::ID, - Some(&Sr25519Keyring::Ferdie.to_seed()), - ) - .expect("Insert key into keystore"); - let (cores, chunks) = { let mut cores = HashMap::new(); let mut chunks = HashMap::new(); @@ -163,6 +153,9 
@@ impl TestState { /// This will simply advance through the simulated chain and examines whether the subsystem /// behaves as expected: It will succeed if all valid chunks of other backing groups get stored /// and no other. + /// + /// We try to be as agnostic about details as possible, how the subsystem achieves those goals + /// should not be a matter to this test suite. async fn run_inner(self, executor: TaskExecutor, virtual_overseer: TestSubsystemContextHandle) { // We skip genesis here (in reality ActiveLeavesUpdate can also skip a block: let updates = { @@ -258,15 +251,12 @@ impl TestState { } } _ => { - panic!("Unexpected message received: {:?}", msg); } } } } } - - async fn overseer_signal( mut tx: SingleItemSink>, msg: impl Into, diff --git a/node/network/bridge/src/lib.rs b/node/network/bridge/src/lib.rs index d8594722df16..6c79773bc15e 100644 --- a/node/network/bridge/src/lib.rs +++ b/node/network/bridge/src/lib.rs @@ -696,7 +696,6 @@ mod tests { use polkadot_subsystem::messages::{ ApprovalDistributionMessage, BitfieldDistributionMessage, - PoVDistributionMessage, StatementDistributionMessage }; use polkadot_node_subsystem_test_helpers::{ @@ -897,13 +896,6 @@ mod tests { ) if e == event.focus().expect("could not focus message") ); - assert_matches!( - virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::NetworkBridgeUpdateV1(e) - ) if e == event.focus().expect("could not focus message") - ); - assert_matches!( virtual_overseer.recv().await, AllMessages::ApprovalDistribution( @@ -1166,13 +1158,12 @@ mod tests { ).await; } - let pov_distribution_message = protocol_v1::PoVDistributionMessage::Awaiting( - [0; 32].into(), - vec![[1; 32].into()], + let approval_distribution_message = protocol_v1::ApprovalDistributionMessage::Approvals( + Vec::new() ); - let message = protocol_v1::ValidationProtocol::PoVDistribution( - pov_distribution_message.clone(), + let message = protocol_v1::ValidationProtocol::ApprovalDistribution( + approval_distribution_message.clone(), ); network_handle.peer_message( @@ -1183,18 +1174,18 @@ mod tests { network_handle.disconnect_peer(peer.clone(), PeerSet::Validation).await; - // PoV distribution message comes first, and the message is only sent to that subsystem. + // Approval distribution message comes first, and the message is only sent to that subsystem. // then a disconnection event arises that is sent to all validation networking subsystems. assert_matches!( virtual_overseer.recv().await, - AllMessages::PoVDistribution( - PoVDistributionMessage::NetworkBridgeUpdateV1( + AllMessages::ApprovalDistribution( + ApprovalDistributionMessage::NetworkBridgeUpdateV1( NetworkBridgeEvent::PeerMessage(p, m) ) ) => { assert_eq!(p, peer); - assert_eq!(m, pov_distribution_message); + assert_eq!(m, approval_distribution_message); } ); @@ -1563,13 +1554,12 @@ mod tests { // send a validation protocol message. 
{ - let pov_distribution_message = protocol_v1::PoVDistributionMessage::Awaiting( - [0; 32].into(), - vec![[1; 32].into()], + let approval_distribution_message = protocol_v1::ApprovalDistributionMessage::Approvals( + Vec::new() ); - let message = protocol_v1::ValidationProtocol::PoVDistribution( - pov_distribution_message.clone(), + let message = protocol_v1::ValidationProtocol::ApprovalDistribution( + approval_distribution_message.clone(), ); virtual_overseer.send(FromOverseer::Communication { @@ -1624,7 +1614,7 @@ mod tests { fn spread_event_to_subsystems_is_up_to_date() { // Number of subsystems expected to be interested in a network event, // and hence the network event broadcasted to. - const EXPECTED_COUNT: usize = 4; + const EXPECTED_COUNT: usize = 3; let mut cnt = 0_usize; for msg in AllMessages::dispatch_iter(NetworkBridgeEvent::PeerDisconnected(PeerId::random())) { @@ -1640,7 +1630,6 @@ mod tests { AllMessages::BitfieldDistribution(_) => { cnt += 1; } AllMessages::BitfieldSigning(_) => unreachable!("Not interested in network events"), AllMessages::Provisioner(_) => unreachable!("Not interested in network events"), - AllMessages::PoVDistribution(_) => { cnt += 1; } AllMessages::RuntimeApi(_) => unreachable!("Not interested in network events"), AllMessages::AvailabilityStore(_) => unreachable!("Not interested in network events"), AllMessages::NetworkBridge(_) => unreachable!("Not interested in network events"), diff --git a/node/network/bridge/src/multiplexer.rs b/node/network/bridge/src/multiplexer.rs index 7ff7187cf1d6..c71ba33a871b 100644 --- a/node/network/bridge/src/multiplexer.rs +++ b/node/network/bridge/src/multiplexer.rs @@ -141,6 +141,11 @@ fn multiplex_single( decode_with_peer::(peer, payload)?, pending_response, )), + Protocol::PoVFetching => From::from(IncomingRequest::new( + peer, + decode_with_peer::(peer, payload)?, + pending_response, + )), Protocol::AvailableDataFetching => From::from(IncomingRequest::new( peer, decode_with_peer::(peer, payload)?, diff --git a/node/network/bridge/src/network.rs b/node/network/bridge/src/network.rs index d85e4819de27..04edb7c2c0e3 100644 --- a/node/network/bridge/src/network.rs +++ b/node/network/bridge/src/network.rs @@ -237,14 +237,14 @@ impl Network for Arc> { Recipient::Peer(peer_id) => Some(peer_id), Recipient::Authority(authority) => authority_discovery - .get_addresses_by_authority_id(authority) - .await - .and_then(|addrs| { - addrs - .into_iter() - .find_map(|addr| peer_id_from_multiaddr(&addr)) - }), - }; + .get_addresses_by_authority_id(authority) + .await + .and_then(|addrs| { + addrs + .into_iter() + .find_map(|addr| peer_id_from_multiaddr(&addr)) + }), + }; let peer_id = match peer_id { None => { diff --git a/node/network/pov-distribution/Cargo.toml b/node/network/pov-distribution/Cargo.toml deleted file mode 100644 index ae405638c4c6..000000000000 --- a/node/network/pov-distribution/Cargo.toml +++ /dev/null @@ -1,25 +0,0 @@ -[package] -name = "polkadot-pov-distribution" -version = "0.1.0" -authors = ["Parity Technologies "] -edition = "2018" - -[dependencies] -futures = "0.3.12" -thiserror = "1.0.23" -tracing = "0.1.25" - -polkadot-primitives = { path = "../../../primitives" } -polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" } -polkadot-node-subsystem-util = { path = "../../subsystem-util" } -polkadot-node-network-protocol = { path = "../../network/protocol" } - -[dev-dependencies] -assert_matches = "1.4.0" -env_logger = "0.8.1" -log = "0.4.13" - -sp-core = { git = 
"https://github.com/paritytech/substrate", branch = "master" } -sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" } - -polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } diff --git a/node/network/pov-distribution/src/error.rs b/node/network/pov-distribution/src/error.rs deleted file mode 100644 index 754c1e56c523..000000000000 --- a/node/network/pov-distribution/src/error.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! The `Error` and `Result` types used by the subsystem. - -use thiserror::Error; - -#[derive(Debug, Error)] -pub enum Error { - #[error(transparent)] - Subsystem(#[from] polkadot_subsystem::SubsystemError), - #[error(transparent)] - OneshotRecv(#[from] futures::channel::oneshot::Canceled), - #[error(transparent)] - Runtime(#[from] polkadot_subsystem::errors::RuntimeApiError), - #[error(transparent)] - Util(#[from] polkadot_node_subsystem_util::Error), -} - -pub type Result = std::result::Result; diff --git a/node/network/pov-distribution/src/lib.rs b/node/network/pov-distribution/src/lib.rs deleted file mode 100644 index add86b02aee9..000000000000 --- a/node/network/pov-distribution/src/lib.rs +++ /dev/null @@ -1,996 +0,0 @@ -// Copyright 2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! PoV Distribution Subsystem of Polkadot. -//! -//! This is a gossip implementation of code that is responsible for distributing PoVs -//! among validators. 
- -#![deny(unused_crate_dependencies)] -#![warn(missing_docs)] - -use polkadot_primitives::v1::{CandidateDescriptor, CompressedPoV, CoreIndex, CoreState, Hash, Id as ParaId, PoV, ValidatorId}; -use polkadot_subsystem::{ - ActiveLeavesUpdate, OverseerSignal, SubsystemContext, SubsystemResult, SubsystemError, Subsystem, - FromOverseer, SpawnedSubsystem, - jaeger, - messages::{ - PoVDistributionMessage, AllMessages, NetworkBridgeMessage, NetworkBridgeEvent, - }, -}; -use polkadot_node_subsystem_util::{ - validator_discovery, - request_validators_ctx, - request_validator_groups_ctx, - request_availability_cores_ctx, - metrics::{self, prometheus}, -}; -use polkadot_node_network_protocol::{ - peer_set::PeerSet, v1 as protocol_v1, PeerId, OurView, UnifiedReputationChange as Rep, -}; - -use futures::prelude::*; -use futures::channel::oneshot; - -use std::collections::{hash_map::{Entry, HashMap}, HashSet}; -use std::sync::Arc; - -mod error; - -#[cfg(test)] -mod tests; - -const COST_APPARENT_FLOOD: Rep = Rep::CostMajor("Peer appears to be flooding us with PoV requests"); -const COST_UNEXPECTED_POV: Rep = Rep::CostMajor("Peer sent us an unexpected PoV"); -const COST_AWAITED_NOT_IN_VIEW: Rep - = Rep::CostMinor("Peer claims to be awaiting something outside of its view"); - -const BENEFIT_FRESH_POV: Rep = Rep::BenefitMinorFirst("Peer supplied us with an awaited PoV"); -const BENEFIT_LATE_POV: Rep = Rep::BenefitMinor("Peer supplied us with an awaited PoV, \ - but was not the first to do so"); - -const LOG_TARGET: &str = "parachain::pov-distribution"; - -/// The PoV Distribution Subsystem. -pub struct PoVDistribution { - // Prometheus metrics - metrics: Metrics, -} - -impl Subsystem for PoVDistribution - where C: SubsystemContext -{ - fn start(self, ctx: C) -> SpawnedSubsystem { - // Swallow error because failure is fatal to the node and we log with more precision - // within `run`. - let future = self.run(ctx) - .map_err(|e| SubsystemError::with_origin("pov-distribution", e)) - .boxed(); - SpawnedSubsystem { - name: "pov-distribution-subsystem", - future, - } - } -} - -#[derive(Default)] -struct State { - /// A state of things going on on a per-relay-parent basis. - relay_parent_state: HashMap, - - /// Info on peers. - peer_state: HashMap, - - /// Our own view. - our_view: OurView, - - /// Connect to relevant groups of validators at different relay parents. - connection_requests: validator_discovery::ConnectionRequests, - - /// Metrics. - metrics: Metrics, -} - -struct BlockBasedState { - known: HashMap, CompressedPoV)>, - - /// All the PoVs we are or were fetching, coupled with channels expecting the data. - /// - /// This may be an empty list, which indicates that we were once awaiting this PoV but have - /// received it already. - fetching: HashMap>>>, - - n_validators: usize, -} - -#[derive(Default)] -struct PeerState { - /// A set of awaited PoV-hashes for each relay-parent in the peer's view. - awaited: HashMap>, -} - -fn awaiting_message(relay_parent: Hash, awaiting: Vec) - -> protocol_v1::ValidationProtocol -{ - protocol_v1::ValidationProtocol::PoVDistribution( - protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, awaiting) - ) -} - -fn send_pov_message( - relay_parent: Hash, - pov_hash: Hash, - pov: &CompressedPoV, -) -> protocol_v1::ValidationProtocol { - protocol_v1::ValidationProtocol::PoVDistribution( - protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov.clone()) - ) -} - -/// Handles the signal. 
If successful, returns `true` if the subsystem should conclude, -/// `false` otherwise. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_signal( - state: &mut State, - ctx: &mut impl SubsystemContext, - signal: OverseerSignal, -) -> SubsystemResult { - match signal { - OverseerSignal::Conclude => Ok(true), - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { activated, deactivated }) => { - let _timer = state.metrics.time_handle_signal(); - - for activated in activated { - let _span = activated.span.child("pov-dist") - .with_stage(jaeger::Stage::PoVDistribution); - - let relay_parent = activated.hash; - match request_validators_ctx(relay_parent, ctx).await { - Ok(vals_rx) => { - let n_validators = match vals_rx.await? { - Ok(v) => v.len(), - Err(e) => { - tracing::warn!( - target: LOG_TARGET, - err = ?e, - "Error fetching validators from runtime API for active leaf", - ); - - // Not adding bookkeeping here might make us behave funny, but we - // shouldn't take down the node on spurious runtime API errors. - // - // and this is "behave funny" as in be bad at our job, but not in any - // slashable or security-related way. - continue; - } - }; - - state.relay_parent_state.insert(relay_parent, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators, - }); - } - Err(e) => { - // continue here also as above. - tracing::warn!( - target: LOG_TARGET, - err = ?e, - "Error fetching validators from runtime API for active leaf", - ); - } - } - } - - for relay_parent in deactivated { - state.connection_requests.remove_all(&relay_parent); - state.relay_parent_state.remove(&relay_parent); - } - - Ok(false) - } - OverseerSignal::BlockFinalized(..) => Ok(false), - } -} - -/// Notify peers that we are awaiting a given PoV hash. -/// -/// This only notifies peers who have the relay parent in their view. -#[tracing::instrument(level = "trace", skip(peers, ctx), fields(subsystem = LOG_TARGET))] -async fn notify_all_we_are_awaiting( - peers: &mut HashMap, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - pov_hash: Hash, -) { - // We use `awaited` as a proxy for which heads are in the peer's view. - let peers_to_send: Vec<_> = peers.iter() - .filter_map(|(peer, state)| if state.awaited.contains_key(&relay_parent) { - Some(peer.clone()) - } else { - None - }) - .collect(); - - if peers_to_send.is_empty() { - return; - } - - let payload = awaiting_message(relay_parent, vec![pov_hash]); - - tracing::trace!( - target: LOG_TARGET, - peers = ?peers_to_send, - ?relay_parent, - ?pov_hash, - "Sending awaiting message", - ); - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - peers_to_send, - payload, - ))).await; -} - -/// Notify one peer about everything we're awaiting at a given relay-parent. -#[tracing::instrument(level = "trace", skip(ctx, relay_parent_state), fields(subsystem = LOG_TARGET))] -async fn notify_one_we_are_awaiting_many( - peer: &PeerId, - ctx: &mut impl SubsystemContext, - relay_parent_state: &HashMap, - relay_parent: Hash, -) { - let awaiting_hashes = relay_parent_state.get(&relay_parent).into_iter().flat_map(|s| { - // Send the peer everything we are fetching at this relay-parent - s.fetching.iter() - .filter(|(_, senders)| !senders.is_empty()) // that has not been completed already. 
- .map(|(pov_hash, _)| *pov_hash) - }).collect::>(); - - if awaiting_hashes.is_empty() { - return; - } - - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?awaiting_hashes, - "Sending awaiting message", - ); - let payload = awaiting_message(relay_parent, awaiting_hashes); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - vec![peer.clone()], - payload, - ))).await; -} - -/// Distribute a PoV to peers who are awaiting it. -#[tracing::instrument(level = "trace", skip(peers, ctx, metrics, pov), fields(subsystem = LOG_TARGET))] -async fn distribute_to_awaiting( - peers: &mut HashMap, - ctx: &mut impl SubsystemContext, - metrics: &Metrics, - relay_parent: Hash, - pov_hash: Hash, - pov: &CompressedPoV, -) { - // Send to all peers who are awaiting the PoV and have that relay-parent in their view. - // - // Also removes it from their awaiting set. - let peers_to_send: Vec<_> = peers.iter_mut() - .filter_map(|(peer, state)| state.awaited.get_mut(&relay_parent).and_then(|awaited| { - if awaited.remove(&pov_hash) { - Some(peer.clone()) - } else { - None - } - })) - .collect(); - - tracing::trace!( - target: LOG_TARGET, - peers = ?peers_to_send, - ?relay_parent, - ?pov_hash, - "Sending PoV message", - ); - if peers_to_send.is_empty() { return; } - - let payload = send_pov_message(relay_parent, pov_hash, pov); - - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - peers_to_send, - payload, - ))).await; - - metrics.on_pov_distributed(); -} - -/// Connect to relevant validators in case we are not already. -async fn connect_to_relevant_validators( - connection_requests: &mut validator_discovery::ConnectionRequests, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: &CandidateDescriptor, -) { - let para_id = descriptor.para_id; - if let Ok(Some(relevant_validators)) = - determine_relevant_validators(ctx, relay_parent, para_id).await - { - // We only need one connection request per (relay_parent, para_id) - // so here we take this shortcut to avoid calling `connect_to_validators` - // more than once. - if !connection_requests.contains_request(&relay_parent, para_id) { - tracing::debug!( - target: LOG_TARGET, - validators=?relevant_validators, - ?relay_parent, - "connecting to validators" - ); - match validator_discovery::connect_to_validators( - ctx, - relay_parent, - relevant_validators, - PeerSet::Validation, - ).await { - Ok(new_connection_request) => { - connection_requests.put(relay_parent, para_id, new_connection_request); - } - Err(e) => { - tracing::debug!( - target: LOG_TARGET, - "Failed to create a validator connection request {:?}", - e, - ); - } - } - } - } -} - -/// Get the Id of the Core that is assigned to the para being collated on if any -/// and the total number of cores. -async fn determine_core( - ctx: &mut impl SubsystemContext, - para_id: ParaId, - relay_parent: Hash, -) -> error::Result> { - let cores = request_availability_cores_ctx(relay_parent, ctx).await?.await??; - - for (idx, core) in cores.iter().enumerate() { - if let CoreState::Scheduled(occupied) = core { - if occupied.para_id == para_id { - return Ok(Some(((idx as u32).into(), cores.len()))); - } - } - } - - Ok(None) -} - -/// Figure out a group of validators assigned to a given `ParaId`. 
-async fn determine_validators_for_core( - ctx: &mut impl SubsystemContext, - core_index: CoreIndex, - num_cores: usize, - relay_parent: Hash, -) -> error::Result>> { - let groups = request_validator_groups_ctx(relay_parent, ctx).await?.await??; - - let group_index = groups.1.group_for_core(core_index, num_cores); - - let connect_to_validators = match groups.0.get(group_index.0 as usize) { - Some(group) => group.clone(), - None => return Ok(None), - }; - - let validators = request_validators_ctx(relay_parent, ctx).await?.await??; - - let validators = connect_to_validators - .into_iter() - .map(|idx| validators[idx.0 as usize].clone()) - .collect(); - - Ok(Some(validators)) -} - -async fn determine_relevant_validators( - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - para_id: ParaId, -) -> error::Result>> { - // Determine which core the para_id is assigned to. - let (core, num_cores) = match determine_core(ctx, para_id, relay_parent).await? { - Some(core) => core, - None => { - tracing::warn!( - target: LOG_TARGET, - "Looks like no core is assigned to {:?} at {:?}", - para_id, - relay_parent, - ); - - return Ok(None); - } - }; - - determine_validators_for_core(ctx, core, num_cores, relay_parent).await -} - -/// Handles a `FetchPoV` message. -#[tracing::instrument(level = "trace", skip(ctx, state, response_sender), fields(subsystem = LOG_TARGET))] -async fn handle_fetch( - state: &mut State, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: CandidateDescriptor, - response_sender: oneshot::Sender>, -) { - let _timer = state.metrics.time_handle_fetch(); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - Some(s) => s, - None => return, - }; - - if let Some((pov, _)) = relay_parent_state.known.get(&descriptor.pov_hash) { - let _ = response_sender.send(pov.clone()); - return; - } - - { - match relay_parent_state.fetching.entry(descriptor.pov_hash) { - Entry::Occupied(mut e) => { - // we are already awaiting this PoV if there is an entry. - e.get_mut().push(response_sender); - return; - } - Entry::Vacant(e) => { - connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await; - e.insert(vec![response_sender]); - } - } - } - - if relay_parent_state.fetching.len() > 2 * relay_parent_state.n_validators { - tracing::warn!( - target = LOG_TARGET, - relay_parent_state.fetching.len = relay_parent_state.fetching.len(), - "other subsystems have requested PoV distribution to fetch more PoVs than reasonably expected", - ); - return; - } - - // Issue an `Awaiting` message to all peers with this in their view. - notify_all_we_are_awaiting( - &mut state.peer_state, - ctx, - relay_parent, - descriptor.pov_hash - ).await -} - -/// Handles a `DistributePoV` message. -#[tracing::instrument(level = "trace", skip(ctx, state, pov), fields(subsystem = LOG_TARGET))] -async fn handle_distribute( - state: &mut State, - ctx: &mut impl SubsystemContext, - relay_parent: Hash, - descriptor: CandidateDescriptor, - pov: Arc, -) { - let _timer = state.metrics.time_handle_distribute(); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - Some(s) => s, - None => return, - }; - - connect_to_relevant_validators(&mut state.connection_requests, ctx, relay_parent, &descriptor).await; - - if let Some(our_awaited) = relay_parent_state.fetching.get_mut(&descriptor.pov_hash) { - // Drain all the senders, but keep the entry in the map around intentionally. 
- // - // It signals that we were at one point awaiting this, so we will be able to tell - // why peers are sending it to us. - for response_sender in our_awaited.drain(..) { - let _ = response_sender.send(pov.clone()); - } - } - - let encoded_pov = match CompressedPoV::compress(&*pov) { - Ok(pov) => pov, - Err(error) => { - tracing::debug!( - target: LOG_TARGET, - error = ?error, - "Failed to create `CompressedPov`." - ); - return - } - }; - - distribute_to_awaiting( - &mut state.peer_state, - ctx, - &state.metrics, - relay_parent, - descriptor.pov_hash, - &encoded_pov, - ).await; - - relay_parent_state.known.insert(descriptor.pov_hash, (pov, encoded_pov)); -} - -/// Report a reputation change for a peer. -#[tracing::instrument(level = "trace", skip(ctx), fields(subsystem = LOG_TARGET))] -async fn report_peer( - ctx: &mut impl SubsystemContext, - peer: PeerId, - rep: Rep, -) { - ctx.send_message(AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer(peer, rep))).await -} - -/// Handle a notification from a peer that they are awaiting some PoVs. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_awaiting( - state: &mut State, - ctx: &mut impl SubsystemContext, - peer: PeerId, - relay_parent: Hash, - pov_hashes: Vec, -) { - if !state.our_view.contains(&relay_parent) { - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hashes, - "Received awaiting message for unknown block", - ); - report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await; - return; - } - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hashes, - "Received awaiting message", - ); - - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - None => { - tracing::warn!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - "PoV Distribution relay parent state out-of-sync with our view" - ); - return; - } - Some(s) => s, - }; - - let peer_awaiting = match - state.peer_state.get_mut(&peer).and_then(|s| s.awaited.get_mut(&relay_parent)) - { - None => { - report_peer(ctx, peer, COST_AWAITED_NOT_IN_VIEW).await; - return; - } - Some(a) => a, - }; - - let will_be_awaited = peer_awaiting.len() + pov_hashes.len(); - if will_be_awaited <= 2 * relay_parent_state.n_validators { - for pov_hash in pov_hashes { - // For all requested PoV hashes, if we have it, we complete the request immediately. - // Otherwise, we note that the peer is awaiting the PoV. - if let Some((_, ref pov)) = relay_parent_state.known.get(&pov_hash) { - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Sending awaited PoV message", - ); - let payload = send_pov_message(relay_parent, pov_hash, pov); - - ctx.send_message(AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(vec![peer.clone()], payload) - )).await; - } else { - peer_awaiting.insert(pov_hash); - } - } - } else { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - "Too many PoV requests", - ); - report_peer(ctx, peer, COST_APPARENT_FLOOD).await; - } -} - -/// Handle an incoming PoV from our peer. Reports them if unexpected, rewards them if not. -/// -/// Completes any requests awaiting that PoV. 
-#[tracing::instrument(level = "trace", skip(ctx, state, encoded_pov), fields(subsystem = LOG_TARGET))] -async fn handle_incoming_pov( - state: &mut State, - ctx: &mut impl SubsystemContext, - peer: PeerId, - relay_parent: Hash, - pov_hash: Hash, - encoded_pov: CompressedPoV, -) { - let relay_parent_state = match state.relay_parent_state.get_mut(&relay_parent) { - None => { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Unexpected PoV", - ); - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - }, - Some(r) => r, - }; - - let pov = match encoded_pov.decompress() { - Ok(pov) => pov, - Err(error) => { - tracing::debug!( - target: LOG_TARGET, - error = ?error, - "Could not extract PoV", - ); - return; - } - }; - - let pov = { - // Do validity checks and complete all senders awaiting this PoV. - let fetching = match relay_parent_state.fetching.get_mut(&pov_hash) { - None => { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Unexpected PoV", - ); - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - } - Some(f) => f, - }; - - let hash = pov.hash(); - if hash != pov_hash { - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - ?hash, - "Mismatched PoV", - ); - report_peer(ctx, peer, COST_UNEXPECTED_POV).await; - return; - } - - let pov = Arc::new(pov); - - if fetching.is_empty() { - // fetching is empty whenever we were awaiting something and - // it was completed afterwards. - report_peer(ctx, peer.clone(), BENEFIT_LATE_POV).await; - } else { - // fetching is non-empty when the peer just provided us with data we needed. - report_peer(ctx, peer.clone(), BENEFIT_FRESH_POV).await; - } - - for response_sender in fetching.drain(..) { - let _ = response_sender.send(pov.clone()); - } - - pov - }; - - tracing::debug!( - target: LOG_TARGET, - ?peer, - ?relay_parent, - ?pov_hash, - "Received PoV", - ); - // make sure we don't consider this peer as awaiting that PoV anymore. - if let Some(peer_state) = state.peer_state.get_mut(&peer) { - peer_state.awaited.remove(&pov_hash); - } - - // distribute the PoV to all other peers who are awaiting it. - distribute_to_awaiting( - &mut state.peer_state, - ctx, - &state.metrics, - relay_parent, - pov_hash, - &encoded_pov, - ).await; - - relay_parent_state.known.insert(pov_hash, (pov, encoded_pov)); -} - -/// Handles a newly or already connected validator in the context of some relay leaf. -fn handle_validator_connected(state: &mut State, peer_id: PeerId) { - state.peer_state.entry(peer_id).or_default(); -} - -/// Handles a network bridge update. -#[tracing::instrument(level = "trace", skip(ctx, state), fields(subsystem = LOG_TARGET))] -async fn handle_network_update( - state: &mut State, - ctx: &mut impl SubsystemContext, - update: NetworkBridgeEvent, -) { - let _timer = state.metrics.time_handle_network_update(); - - match update { - NetworkBridgeEvent::PeerConnected(peer, role) => { - tracing::trace!( - target: LOG_TARGET, - ?peer, - ?role, - "Peer connected", - ); - handle_validator_connected(state, peer); - } - NetworkBridgeEvent::PeerDisconnected(peer) => { - tracing::trace!( - target: LOG_TARGET, - ?peer, - "Peer disconnected", - ); - state.peer_state.remove(&peer); - } - NetworkBridgeEvent::PeerViewChange(peer_id, view) => { - tracing::trace!( - target: LOG_TARGET, - ?peer_id, - ?view, - "Peer view change", - ); - if let Some(peer_state) = state.peer_state.get_mut(&peer_id) { - // prune anything not in the new view. 
- peer_state.awaited.retain(|relay_parent, _| view.contains(&relay_parent)); - - // introduce things from the new view. - for relay_parent in view.iter() { - if let Entry::Vacant(entry) = peer_state.awaited.entry(*relay_parent) { - entry.insert(HashSet::new()); - - // Notify the peer about everything we're awaiting at the new relay-parent. - notify_one_we_are_awaiting_many( - &peer_id, - ctx, - &state.relay_parent_state, - *relay_parent, - ).await; - } - } - } - - } - NetworkBridgeEvent::PeerMessage(peer, message) => { - match message { - protocol_v1::PoVDistributionMessage::Awaiting(relay_parent, pov_hashes) - => handle_awaiting( - state, - ctx, - peer, - relay_parent, - pov_hashes, - ).await, - protocol_v1::PoVDistributionMessage::SendPoV(relay_parent, pov_hash, pov) - => handle_incoming_pov( - state, - ctx, - peer, - relay_parent, - pov_hash, - pov, - ).await, - } - } - NetworkBridgeEvent::OurViewChange(view) => { - tracing::trace!( - target: LOG_TARGET, - "Own view change", - ); - state.our_view = view; - } - } -} - -impl PoVDistribution { - /// Create a new instance of `PovDistribution`. - pub fn new(metrics: Metrics) -> Self { - Self { metrics } - } - - #[tracing::instrument(skip(self, ctx), fields(subsystem = LOG_TARGET))] - async fn run( - self, - ctx: impl SubsystemContext, - ) -> SubsystemResult<()> { - self.run_with_state(ctx, State::default()).await - } - - async fn run_with_state( - self, - mut ctx: impl SubsystemContext, - mut state: State, - ) -> SubsystemResult<()> { - state.metrics = self.metrics; - - loop { - // `select_biased` is used since receiving connection notifications and - // peer view update messages may be racy and we want connection notifications - // first. - futures::select_biased! { - v = state.connection_requests.next().fuse() => handle_validator_connected(&mut state, v.peer_id), - v = ctx.recv().fuse() => { - match v? { - FromOverseer::Signal(signal) => if handle_signal( - &mut state, - &mut ctx, - signal, - ).await? { - return Ok(()); - } - FromOverseer::Communication { msg } => match msg { - PoVDistributionMessage::FetchPoV(relay_parent, descriptor, response_sender) => - handle_fetch( - &mut state, - &mut ctx, - relay_parent, - descriptor, - response_sender, - ).await, - PoVDistributionMessage::DistributePoV(relay_parent, descriptor, pov) => - handle_distribute( - &mut state, - &mut ctx, - relay_parent, - descriptor, - pov, - ).await, - PoVDistributionMessage::NetworkBridgeUpdateV1(event) => - handle_network_update( - &mut state, - &mut ctx, - event, - ).await, - } - } - } - } - } - } -} - - - -#[derive(Clone)] -struct MetricsInner { - povs_distributed: prometheus::Counter, - handle_signal: prometheus::Histogram, - handle_fetch: prometheus::Histogram, - handle_distribute: prometheus::Histogram, - handle_network_update: prometheus::Histogram, -} - -/// Availability Distribution metrics. -#[derive(Default, Clone)] -pub struct Metrics(Option); - -impl Metrics { - fn on_pov_distributed(&self) { - if let Some(metrics) = &self.0 { - metrics.povs_distributed.inc(); - } - } - - /// Provide a timer for `handle_signal` which observes on drop. - fn time_handle_signal(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_signal.start_timer()) - } - - /// Provide a timer for `handle_fetch` which observes on drop. - fn time_handle_fetch(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_fetch.start_timer()) - } - - /// Provide a timer for `handle_distribute` which observes on drop. 
- fn time_handle_distribute(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_distribute.start_timer()) - } - - /// Provide a timer for `handle_network_update` which observes on drop. - fn time_handle_network_update(&self) -> Option { - self.0.as_ref().map(|metrics| metrics.handle_network_update.start_timer()) - } -} - -impl metrics::Metrics for Metrics { - fn try_register(registry: &prometheus::Registry) -> std::result::Result { - let metrics = MetricsInner { - povs_distributed: prometheus::register( - prometheus::Counter::new( - "parachain_povs_distributed_total", - "Number of PoVs distributed to other peers." - )?, - registry, - )?, - handle_signal: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_signal", - "Time spent within `pov_distribution::handle_signal`", - ) - )?, - registry, - )?, - handle_fetch: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_fetch", - "Time spent within `pov_distribution::handle_fetch`", - ) - )?, - registry, - )?, - handle_distribute: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_distribute", - "Time spent within `pov_distribution::handle_distribute`", - ) - )?, - registry, - )?, - handle_network_update: prometheus::register( - prometheus::Histogram::with_opts( - prometheus::HistogramOpts::new( - "parachain_pov_distribution_handle_network_update", - "Time spent within `pov_distribution::handle_network_update`", - ) - )?, - registry, - )?, - }; - Ok(Metrics(Some(metrics))) - } -} diff --git a/node/network/pov-distribution/src/tests.rs b/node/network/pov-distribution/src/tests.rs deleted file mode 100644 index cf545fc406d8..000000000000 --- a/node/network/pov-distribution/src/tests.rs +++ /dev/null @@ -1,1578 +0,0 @@ -// Copyright 2020-2021 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . 
- -use super::*; - -use std::{time::Duration, sync::Arc}; - -use assert_matches::assert_matches; -use futures::executor; -use tracing::trace; - -use sp_keyring::Sr25519Keyring; - -use polkadot_primitives::v1::{AuthorityDiscoveryId, BlockData, CoreState, GroupRotationInfo, Id as ParaId, ScheduledCore, SessionIndex, SessionInfo, ValidatorIndex}; -use polkadot_subsystem::{messages::{RuntimeApiMessage, RuntimeApiRequest}, jaeger, ActivatedLeaf}; -use polkadot_node_subsystem_test_helpers as test_helpers; -use polkadot_node_subsystem_util::TimeoutExt; -use polkadot_node_network_protocol::{view, our_view}; - -fn make_pov(data: Vec) -> PoV { - PoV { block_data: BlockData(data) } -} - -fn make_peer_state(awaited: Vec<(Hash, Vec)>) - -> PeerState -{ - PeerState { - awaited: awaited.into_iter().map(|(rp, h)| (rp, h.into_iter().collect())).collect() - } -} - -fn validator_pubkeys(val_ids: &[Sr25519Keyring]) -> Vec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -fn validator_authority_id(val_ids: &[Sr25519Keyring]) -> Vec { - val_ids.iter().map(|v| v.public().into()).collect() -} - -type VirtualOverseer = test_helpers::TestSubsystemContextHandle; - -struct TestHarness { - virtual_overseer: VirtualOverseer, -} - -fn test_harness>( - state: State, - test: impl FnOnce(TestHarness) -> T, -) { - let _ = env_logger::builder() - .is_test(true) - .filter( - Some("polkadot_pov_distribution"), - log::LevelFilter::Trace, - ) - .filter( - Some(LOG_TARGET), - log::LevelFilter::Trace, - ) - .try_init(); - - let pool = sp_core::testing::TaskExecutor::new(); - - let (context, virtual_overseer) = test_helpers::make_subsystem_context(pool.clone()); - - let subsystem = super::PoVDistribution::new(Metrics::default()); - - let subsystem = subsystem.run_with_state(context, state); - - let test_fut = test(TestHarness { virtual_overseer }); - - futures::pin_mut!(test_fut); - futures::pin_mut!(subsystem); - - executor::block_on(future::select(test_fut, subsystem)); -} - -const TIMEOUT: Duration = Duration::from_millis(100); - -async fn overseer_send( - overseer: &mut VirtualOverseer, - msg: PoVDistributionMessage, -) { - trace!("Sending message:\n{:?}", &msg); - overseer - .send(FromOverseer::Communication { msg }) - .timeout(TIMEOUT) - .await - .expect(&format!("{:?} is more than enough for sending messages.", TIMEOUT)); -} - -async fn overseer_recv( - overseer: &mut VirtualOverseer, -) -> AllMessages { - let msg = overseer_recv_with_timeout(overseer, TIMEOUT) - .await - .expect(&format!("{:?} is more than enough to receive messages", TIMEOUT)); - - trace!("Received message:\n{:?}", &msg); - - msg -} - -async fn overseer_recv_with_timeout( - overseer: &mut VirtualOverseer, - timeout: Duration, -) -> Option { - trace!("Waiting for message..."); - overseer - .recv() - .timeout(timeout) - .await -} - -async fn overseer_signal( - overseer: &mut VirtualOverseer, - signal: OverseerSignal, -) { - overseer - .send(FromOverseer::Signal(signal)) - .timeout(TIMEOUT) - .await - .expect(&format!("{:?} is more than enough for sending signals.", TIMEOUT)); -} - -#[derive(Clone)] -struct TestState { - chain_ids: Vec, - validators: Vec, - validator_public: Vec, - validator_authority_id: Vec, - validator_peer_id: Vec, - validator_groups: (Vec>, GroupRotationInfo), - relay_parent: Hash, - availability_cores: Vec, - session_index: SessionIndex, -} - -impl Default for TestState { - fn default() -> Self { - let chain_a = ParaId::from(1); - let chain_b = ParaId::from(2); - - let chain_ids = vec![chain_a, chain_b]; - - let validators = 
vec![ - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - ]; - - let validator_public = validator_pubkeys(&validators); - let validator_authority_id = validator_authority_id(&validators); - - let validator_peer_id = std::iter::repeat_with(|| PeerId::random()) - .take(validator_public.len()) - .collect(); - - let validator_groups = vec![vec![2, 0, 4], vec![1], vec![3]] - .into_iter().map(|g| g.into_iter().map(ValidatorIndex).collect()).collect(); - let group_rotation_info = GroupRotationInfo { - session_start_block: 0, - group_rotation_frequency: 100, - now: 1, - }; - let validator_groups = (validator_groups, group_rotation_info); - - let availability_cores = vec![ - CoreState::Scheduled(ScheduledCore { - para_id: chain_ids[0], - collator: None, - }), - CoreState::Scheduled(ScheduledCore { - para_id: chain_ids[1], - collator: None, - }), - ]; - - let relay_parent = Hash::repeat_byte(0x05); - - Self { - chain_ids, - validators, - validator_public, - validator_authority_id, - validator_peer_id, - validator_groups, - relay_parent, - availability_cores, - session_index: 1, - } - } -} - -async fn test_validator_discovery( - virtual_overseer: &mut VirtualOverseer, - expected_relay_parent: Hash, - session_index: SessionIndex, - validator_ids: &[ValidatorId], - discovery_ids: &[AuthorityDiscoveryId], - validator_group: &[ValidatorIndex], -) { - assert_matches!( - overseer_recv(virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::SessionIndexForChild(tx), - )) => { - assert_eq!(relay_parent, expected_relay_parent); - tx.send(Ok(session_index)).unwrap(); - } - ); - - assert_matches!( - overseer_recv(virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::SessionInfo(index, tx), - )) => { - assert_eq!(relay_parent, expected_relay_parent); - assert_eq!(index, session_index); - - let validators = validator_group.iter() - .map(|idx| validator_ids[idx.0 as usize].clone()) - .collect(); - - let discovery_keys = validator_group.iter() - .map(|idx| discovery_ids[idx.0 as usize].clone()) - .collect(); - - tx.send(Ok(Some(SessionInfo { - validators, - discovery_keys, - ..Default::default() - }))).unwrap(); - } - ); -} - -#[test] -fn ask_validators_for_povs() { - let test_state = TestState::default(); - - test_harness(State::default(), |test_harness| async move { - let mut virtual_overseer = test_harness.virtual_overseer; - - let pov_block = PoV { - block_data: BlockData(vec![42, 43, 44]), - }; - - let pov_hash = pov_block.hash(); - - let mut candidate = CandidateDescriptor::default(); - - let current = test_state.relay_parent.clone(); - candidate.para_id = test_state.chain_ids[0]; - candidate.pov_hash = pov_hash; - candidate.relay_parent = test_state.relay_parent; - - overseer_signal( - &mut virtual_overseer, - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { - activated: vec![ActivatedLeaf { - hash: test_state.relay_parent, - number: 1, - span: Arc::new(jaeger::Span::Disabled), - }].into(), - deactivated: [][..].into(), - }), - ).await; - - // first subsystem will try to obtain validators. 
- assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - let (tx, pov_fetch_result) = oneshot::channel(); - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::FetchPoV(test_state.relay_parent.clone(), candidate, tx), - ).await; - - // obtain the availability cores. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.availability_cores.clone())).unwrap(); - } - ); - - // Obtain the validator groups - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::ValidatorGroups(tx) - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.validator_groups.clone())).unwrap(); - } - ); - - // obtain the validators per relay parent - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, current); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - test_validator_discovery( - &mut virtual_overseer, - current, - test_state.session_index, - &test_state.validator_public, - &test_state.validator_authority_id, - &test_state.validator_groups.0[0], - ).await; - - // We now should connect to our validator group. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ConnectToValidators { - validator_ids, - mut connected, - .. 
- } - ) => { - assert_eq!(validator_ids.len(), 3); - assert!(validator_ids.iter().all(|id| test_state.validator_authority_id.contains(id))); - - let result = vec![ - (test_state.validator_authority_id[2].clone(), test_state.validator_peer_id[2].clone()), - (test_state.validator_authority_id[0].clone(), test_state.validator_peer_id[0].clone()), - (test_state.validator_authority_id[4].clone(), test_state.validator_peer_id[4].clone()), - ]; - - result.into_iter().for_each(|r| connected.try_send(r).unwrap()); - } - ); - - for i in vec![2, 0, 4] { - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::NetworkBridgeUpdateV1( - NetworkBridgeEvent::PeerViewChange( - test_state.validator_peer_id[i].clone(), - view![current], - ) - ) - ).await; - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - to_peers, - payload, - )) => { - assert_eq!(to_peers, vec![test_state.validator_peer_id[i].clone()]); - assert_eq!(payload, awaiting_message(current.clone(), vec![pov_hash.clone()])); - } - ); - } - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::NetworkBridgeUpdateV1( - NetworkBridgeEvent::PeerMessage( - test_state.validator_peer_id[2].clone(), - protocol_v1::PoVDistributionMessage::SendPoV( - current, - pov_hash, - CompressedPoV::compress(&pov_block).unwrap(), - ), - ) - ) - ).await; - - assert_eq!(*pov_fetch_result.await.unwrap(), pov_block); - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::ReportPeer(id, benefit)) => { - assert_eq!(benefit, BENEFIT_FRESH_POV); - assert_eq!(id, test_state.validator_peer_id[2].clone()); - } - ); - - // Now let's test that if some peer is ahead of us we would still - // send `Await` on `FetchPoV` message to it. - let next_leaf = Hash::repeat_byte(10); - - // A validator's view changes and now is lets say ahead of us. - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::NetworkBridgeUpdateV1( - NetworkBridgeEvent::PeerViewChange( - test_state.validator_peer_id[2].clone(), - view![next_leaf], - ) - ) - ).await; - - let pov_block = PoV { - block_data: BlockData(vec![45, 46, 47]), - }; - - let pov_hash = pov_block.hash(); - - let candidate = CandidateDescriptor { - para_id: test_state.chain_ids[0], - pov_hash, - relay_parent: next_leaf.clone(), - ..Default::default() - }; - - let (tx, _pov_fetch_result) = oneshot::channel(); - - overseer_signal( - &mut virtual_overseer, - OverseerSignal::ActiveLeaves(ActiveLeavesUpdate { - activated: vec![ActivatedLeaf { - hash: next_leaf, - number: 2, - span: Arc::new(jaeger::Span::Disabled), - }].into(), - deactivated: [current.clone()][..].into(), - }) - ).await; - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::FetchPoV(next_leaf.clone(), candidate, tx), - ).await; - - // Obtain the availability cores. 
- assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.availability_cores.clone())).unwrap(); - } - ); - - // Obtain the validator groups - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::ValidatorGroups(tx) - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.validator_groups.clone())).unwrap(); - } - ); - - // obtain the validators per relay parent - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, next_leaf); - tx.send(Ok(test_state.validator_public.clone())).unwrap(); - } - ); - - // obtain the validator_id to authority_id mapping - test_validator_discovery( - &mut virtual_overseer, - next_leaf, - test_state.session_index, - &test_state.validator_public, - &test_state.validator_authority_id, - &test_state.validator_groups.0[0], - ).await; - - // We now should connect to our validator group. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ConnectToValidators { - validator_ids, - mut connected, - .. - } - ) => { - assert_eq!(validator_ids.len(), 3); - assert!(validator_ids.iter().all(|id| test_state.validator_authority_id.contains(id))); - - let result = vec![ - (test_state.validator_authority_id[2].clone(), test_state.validator_peer_id[2].clone()), - (test_state.validator_authority_id[0].clone(), test_state.validator_peer_id[0].clone()), - (test_state.validator_authority_id[4].clone(), test_state.validator_peer_id[4].clone()), - ]; - - result.into_iter().for_each(|r| connected.try_send(r).unwrap()); - } - ); - - // We already know that the leaf in question in the peer's view so we request - // a chunk from them right away. - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::NetworkBridge(NetworkBridgeMessage::SendValidationMessage( - to_peers, - payload, - )) => { - assert_eq!(to_peers, vec![test_state.validator_peer_id[2].clone()]); - assert_eq!(payload, awaiting_message(next_leaf.clone(), vec![pov_hash.clone()])); - } - ); - }); -} - -#[test] -fn distributes_to_those_awaiting_and_completes_local() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - let peer_c = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - b.fetching.insert(pov_hash, vec![pov_send]); - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A has hash_a in its view and is awaiting the PoV. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![pov_hash])]), - ); - - // peer B has hash_a in its view but is not awaiting. - s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - // peer C doesn't have hash_a in its view but is awaiting the PoV under hash_b. 
- s.insert( - peer_c.clone(), - make_peer_state(vec![(hash_b, vec![pov_hash])]), - ); - - s - }, - our_view: our_view![hash_a, hash_b], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let mut descriptor = CandidateDescriptor::default(); - descriptor.pov_hash = pov_hash; - - test_harness(state, |test_harness| async move { - let mut virtual_overseer = test_harness.virtual_overseer; - - overseer_send( - &mut virtual_overseer, - PoVDistributionMessage::DistributePoV( - hash_a, - descriptor, - Arc::new(pov.clone()) - ) - ).await; - - // Let's assume runtime call failed and we're already connected to the peers. - assert_matches!( - virtual_overseer.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Err("nope".to_string().into())).unwrap(); - } - ); - - // our local sender also completed - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - virtual_overseer.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(peers, message) - ) => { - assert_eq!(peers, vec![peer_a.clone()]); - assert_eq!( - message, - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ); - } - ) - }); -} - - -#[test] -fn we_inform_peers_with_same_view_we_are_awaiting() { - - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - - let (pov_send, _) = oneshot::channel(); - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A has hash_a in its view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - // peer B doesn't have hash_a in its view. 
- s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_b, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - let mut descriptor = CandidateDescriptor::default(); - descriptor.pov_hash = pov_hash; - - let para_id_1 = ParaId::from(1); - let para_id_2 = ParaId::from(2); - - descriptor.para_id = para_id_1; - - let availability_cores = vec![ - CoreState::Scheduled(ScheduledCore { - para_id: para_id_1, - collator: None, - }), - CoreState::Scheduled(ScheduledCore { - para_id: para_id_2, - collator: None, - }), - ]; - - let validators = vec![ - Sr25519Keyring::Alice, - Sr25519Keyring::Bob, - Sr25519Keyring::Charlie, - Sr25519Keyring::Dave, - Sr25519Keyring::Ferdie, - ]; - - let validator_authority_id = validator_authority_id(&validators); - let validators = validator_pubkeys(&validators); - - let validator_peer_id: Vec<_> = std::iter::repeat_with(|| PeerId::random()) - .take(validators.len()) - .collect(); - - let validator_groups = vec![vec![2, 0, 4], vec![1], vec![3]] - .into_iter().map(|g| g.into_iter().map(ValidatorIndex).collect()).collect(); - let group_rotation_info = GroupRotationInfo { - session_start_block: 0, - group_rotation_frequency: 100, - now: 1, - }; - - let validator_groups = (validator_groups, group_rotation_info); - - executor::block_on(async move { - let handle_future = handle_fetch( - &mut state, - &mut ctx, - hash_a, - descriptor, - pov_send, - ); - - let check_future = async move { - //assert_eq!(state.relay_parent_state[&hash_a].fetching[&pov_hash].len(), 1); - assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::AvailabilityCores(tx) - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Ok(availability_cores)).unwrap(); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::ValidatorGroups(tx) - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Ok(validator_groups.clone())).unwrap(); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - relay_parent, - RuntimeApiRequest::Validators(tx), - )) => { - assert_eq!(relay_parent, hash_a); - tx.send(Ok(validators.clone())).unwrap(); - } - ); - - test_validator_discovery( - &mut handle, - hash_a, - 1, - &validators, - &validator_authority_id, - &validator_groups.0[0], - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ConnectToValidators { - validator_ids, - mut connected, - .. 
- } - ) => { - assert_eq!(validator_ids.len(), 3); - assert!(validator_ids.iter().all(|id| validator_authority_id.contains(id))); - - let result = vec![ - (validator_authority_id[2].clone(), validator_peer_id[2].clone()), - (validator_authority_id[0].clone(), validator_peer_id[0].clone()), - (validator_authority_id[4].clone(), validator_peer_id[4].clone()), - ]; - - result.into_iter().for_each(|r| connected.try_send(r).unwrap()); - } - ); - - }; - - futures::join!(handle_future, check_future); - }); -} - -#[test] -fn peer_view_change_leads_to_us_informing() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let (pov_a_send, _) = oneshot::channel(); - - let pov_a = make_pov(vec![1, 2, 3]); - let pov_a_hash = pov_a.hash(); - - let pov_b = make_pov(vec![4, 5, 6]); - let pov_b_hash = pov_b.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov_a is still being fetched, whereas the fetch of pov_b has already - // completed, as implied by the empty vector. - b.fetching.insert(pov_a_hash, vec![pov_a_send]); - b.fetching.insert(pov_b_hash, vec![]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A doesn't yet have hash_a in its view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_b, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerViewChange(peer_a.clone(), view![hash_a, hash_b]), - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(peers, message) - ) => { - assert_eq!(peers, vec![peer_a.clone()]); - assert_eq!( - message, - awaiting_message(hash_a, vec![pov_a_hash]), - ); - } - ) - }); -} - -#[test] -fn peer_complete_fetch_and_is_rewarded() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peers A and B are functionally the same. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request before peer B. 
- handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_b.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, BENEFIT_FRESH_POV); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_b); - assert_eq!(rep, BENEFIT_LATE_POV); - } - ); - }); -} - -#[test] -fn peer_punished_for_sending_bad_pov() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - - let (pov_send, _) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let bad_pov = make_pov(vec![6, 6, 6]); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request: right relay parent, awaited hash, wrong PoV. - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&bad_pov).unwrap()), - ).focus().unwrap(), - ).await; - - // didn't complete our sender. - assert_eq!(state.relay_parent_state[&hash_a].fetching[&pov_hash].len(), 1); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_UNEXPECTED_POV); - } - ); - }); -} - -#[test] -fn peer_punished_for_sending_unexpected_pov() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request: right relay parent, awaited hash, wrong PoV. 
- handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_UNEXPECTED_POV); - } - ); - }); -} - -#[test] -fn peer_punished_for_sending_pov_out_of_our_view() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - // Peer A answers our request: right relay parent, awaited hash, wrong PoV. - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_b, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_UNEXPECTED_POV); - } - ); - }); -} - -#[test] -fn peer_reported_for_awaiting_too_much() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - let n_validators = 10; - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators, - }; - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - let max_plausibly_awaited = n_validators * 2; - - // The peer awaits a plausible (albeit unlikely) amount of PoVs. - for i in 0..max_plausibly_awaited { - let pov_hash = make_pov(vec![i as u8; 32]).hash(); - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_a, vec![pov_hash]), - ).focus().unwrap(), - ).await; - } - - assert_eq!(state.peer_state[&peer_a].awaited[&hash_a].len(), max_plausibly_awaited); - - // The last straw: - let last_pov_hash = make_pov(vec![max_plausibly_awaited as u8; 32]).hash(); - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_a, vec![last_pov_hash]), - ).focus().unwrap(), - ).await; - - // No more bookkeeping for you! 
- assert_eq!(state.peer_state[&peer_a].awaited[&hash_a].len(), max_plausibly_awaited); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_APPARENT_FLOOD); - } - ); - }); -} - -#[test] -fn peer_reported_for_awaiting_outside_their_view() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - s.insert(hash_a, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }); - - s.insert(hash_b, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }); - - s - }, - peer_state: { - let mut s = HashMap::new(); - - // Peer has only hash A in its view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - s - }, - our_view: our_view![hash_a, hash_b], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - let pov_hash = make_pov(vec![1, 2, 3]).hash(); - - // Hash B is in our view but not the peer's - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_b, vec![pov_hash]), - ).focus().unwrap(), - ).await; - - assert!(state.peer_state[&peer_a].awaited.get(&hash_b).is_none()); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_AWAITED_NOT_IN_VIEW); - } - ); - }); -} - -#[test] -fn peer_reported_for_awaiting_outside_our_view() { - let hash_a: Hash = [0; 32].into(); - let hash_b: Hash = [1; 32].into(); - - let peer_a = PeerId::random(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - s.insert(hash_a, BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }); - - s - }, - peer_state: { - let mut s = HashMap::new(); - - // Peer has hashes A and B in their view. - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![]), (hash_b, vec![])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - let pov_hash = make_pov(vec![1, 2, 3]).hash(); - - // Hash B is in peer's view but not ours. - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - awaiting_message(hash_b, vec![pov_hash]), - ).focus().unwrap(), - ).await; - - // Illegal `awaited` is ignored. 
- assert!(state.peer_state[&peer_a].awaited[&hash_b].is_empty()); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, COST_AWAITED_NOT_IN_VIEW); - } - ); - }); -} - -#[test] -fn peer_complete_fetch_leads_to_us_completing_others() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - let peer_b = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![])]), - ); - - // peer B is awaiting peer A's request. - s.insert( - peer_b.clone(), - make_peer_state(vec![(hash_a, vec![pov_hash])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, BENEFIT_FRESH_POV); - } - ); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::SendValidationMessage(peers, message) - ) => { - assert_eq!(peers, vec![peer_b.clone()]); - assert_eq!( - message, - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ); - } - ); - - assert!(!state.peer_state[&peer_b].awaited[&hash_a].contains(&pov_hash)); - }); -} - -#[test] -fn peer_completing_request_no_longer_awaiting() { - let hash_a: Hash = [0; 32].into(); - - let peer_a = PeerId::random(); - - let (pov_send, pov_recv) = oneshot::channel(); - - let pov = make_pov(vec![1, 2, 3]); - let pov_hash = pov.hash(); - - let mut state = State { - relay_parent_state: { - let mut s = HashMap::new(); - let mut b = BlockBasedState { - known: HashMap::new(), - fetching: HashMap::new(), - n_validators: 10, - }; - - // pov is being fetched. - b.fetching.insert(pov_hash, vec![pov_send]); - - s.insert(hash_a, b); - s - }, - peer_state: { - let mut s = HashMap::new(); - - // peer A is registered as awaiting. 
- s.insert( - peer_a.clone(), - make_peer_state(vec![(hash_a, vec![pov_hash])]), - ); - - s - }, - our_view: our_view![hash_a], - metrics: Default::default(), - connection_requests: Default::default(), - }; - - let pool = sp_core::testing::TaskExecutor::new(); - let (mut ctx, mut handle) = polkadot_node_subsystem_test_helpers::make_subsystem_context(pool); - - executor::block_on(async move { - handle_network_update( - &mut state, - &mut ctx, - NetworkBridgeEvent::PeerMessage( - peer_a.clone(), - send_pov_message(hash_a, pov_hash, &CompressedPoV::compress(&pov).unwrap()), - ).focus().unwrap(), - ).await; - - assert_eq!(&*pov_recv.await.unwrap(), &pov); - - assert_matches!( - handle.recv().await, - AllMessages::NetworkBridge( - NetworkBridgeMessage::ReportPeer(peer, rep) - ) => { - assert_eq!(peer, peer_a); - assert_eq!(rep, BENEFIT_FRESH_POV); - } - ); - - // We received the PoV from peer A, so we do not consider it awaited by peer A anymore. - assert!(!state.peer_state[&peer_a].awaited[&hash_a].contains(&pov_hash)); - }); -} diff --git a/node/network/protocol/Cargo.toml b/node/network/protocol/Cargo.toml index f7a0c72b2a08..6f32c346e53b 100644 --- a/node/network/protocol/Cargo.toml +++ b/node/network/protocol/Cargo.toml @@ -13,3 +13,4 @@ parity-scale-codec = { version = "2.0.0", default-features = false, features = [ sc-network = { git = "https://github.com/paritytech/substrate", branch = "master" } strum = { version = "0.20", features = ["derive"] } futures = "0.3.12" +thiserror = "1.0.23" diff --git a/node/network/protocol/src/lib.rs b/node/network/protocol/src/lib.rs index ee5d3398e6d8..211f2dcf37f3 100644 --- a/node/network/protocol/src/lib.rs +++ b/node/network/protocol/src/lib.rs @@ -289,7 +289,7 @@ pub mod v1 { use std::convert::TryFrom; use polkadot_primitives::v1::{ - CandidateIndex, CollatorId, CompressedPoV, Hash, Id as ParaId, SignedAvailabilityBitfield, + CandidateIndex, CollatorId, Hash, Id as ParaId, SignedAvailabilityBitfield, CollatorSignature, }; use polkadot_node_primitives::{ @@ -305,19 +305,6 @@ pub mod v1 { Bitfield(Hash, SignedAvailabilityBitfield), } - /// Network messages used by the PoV distribution subsystem. - #[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)] - pub enum PoVDistributionMessage { - /// Notification that we are awaiting the given PoVs (by hash) against a - /// specific relay-parent hash. - #[codec(index = 0)] - Awaiting(Hash, Vec), - /// Notification of an awaited PoV, in a given relay-parent context. - /// (relay_parent, pov_hash, compressed_pov) - #[codec(index = 1)] - SendPoV(Hash, Hash, CompressedPoV), - } - /// Network messages used by the statement distribution subsystem. 
#[derive(Debug, Clone, Encode, Decode, PartialEq, Eq)] pub enum StatementDistributionMessage { @@ -361,9 +348,6 @@ pub mod v1 { /// Bitfield distribution messages #[codec(index = 1)] BitfieldDistribution(BitfieldDistributionMessage), - /// PoV Distribution messages - #[codec(index = 2)] - PoVDistribution(PoVDistributionMessage), /// Statement distribution messages #[codec(index = 3)] StatementDistribution(StatementDistributionMessage), @@ -373,7 +357,6 @@ pub mod v1 { } impl_try_from!(ValidationProtocol, BitfieldDistribution, BitfieldDistributionMessage); - impl_try_from!(ValidationProtocol, PoVDistribution, PoVDistributionMessage); impl_try_from!(ValidationProtocol, StatementDistribution, StatementDistributionMessage); impl_try_from!(ValidationProtocol, ApprovalDistribution, ApprovalDistributionMessage); diff --git a/node/network/protocol/src/request_response/mod.rs b/node/network/protocol/src/request_response/mod.rs index 71914fb88eae..85eab9fc7a80 100644 --- a/node/network/protocol/src/request_response/mod.rs +++ b/node/network/protocol/src/request_response/mod.rs @@ -60,6 +60,8 @@ pub enum Protocol { ChunkFetching, /// Protocol for fetching collations from collators. CollationFetching, + /// Protocol for fetching seconded PoVs from validators of the same group. + PoVFetching, /// Protocol for fetching available data. AvailableDataFetching, } @@ -107,11 +109,18 @@ impl Protocol { request_timeout: DEFAULT_REQUEST_TIMEOUT_CONNECTED, inbound_queue: Some(tx), }, + Protocol::PoVFetching => RequestResponseConfig { + name: p_name, + max_request_size: 1_000, + max_response_size: MAX_COMPRESSED_POV_SIZE as u64, + request_timeout: DEFAULT_REQUEST_TIMEOUT_CONNECTED, + inbound_queue: Some(tx), + }, Protocol::AvailableDataFetching => RequestResponseConfig { name: p_name, max_request_size: 1_000, // Available data size is dominated by the PoV size. - max_response_size: 30_000_000, + max_response_size: MAX_COMPRESSED_POV_SIZE as u64, request_timeout: DEFAULT_REQUEST_TIMEOUT, inbound_queue: Some(tx), }, @@ -130,6 +139,8 @@ impl Protocol { Protocol::ChunkFetching => 100, // 10 seems reasonable, considering group sizes of max 10 validators. Protocol::CollationFetching => 10, + // 10 seems reasonable, considering group sizes of max 10 validators. + Protocol::PoVFetching => 10, // Validators are constantly self-selecting to request available data which may lead // to constant load and occasional burstiness. Protocol::AvailableDataFetching => 100, @@ -146,6 +157,7 @@ impl Protocol { match self { Protocol::ChunkFetching => "/polkadot/req_chunk/1", Protocol::CollationFetching => "/polkadot/req_collation/1", + Protocol::PoVFetching => "/polkadot/req_pov/1", Protocol::AvailableDataFetching => "/polkadot/req_available_data/1", } } diff --git a/node/network/protocol/src/request_response/request.rs b/node/network/protocol/src/request_response/request.rs index 3db34f673c90..da7ddaaa91e1 100644 --- a/node/network/protocol/src/request_response/request.rs +++ b/node/network/protocol/src/request_response/request.rs @@ -17,6 +17,7 @@ use futures::channel::oneshot; use futures::prelude::Future; +use thiserror::Error; use parity_scale_codec::{Decode, Encode, Error as DecodingError}; use sc_network as network; use sc_network::config as netconfig; @@ -42,6 +43,8 @@ pub enum Requests { ChunkFetching(OutgoingRequest), /// Fetch a collation from a collator which previously announced it. CollationFetching(OutgoingRequest), + /// Fetch a PoV from a validator which previously sent out a seconded statement. 
+ PoVFetching(OutgoingRequest), /// Request full available data from a node. AvailableDataFetching(OutgoingRequest), } @@ -52,6 +55,7 @@ impl Requests { match self { Self::ChunkFetching(_) => Protocol::ChunkFetching, Self::CollationFetching(_) => Protocol::CollationFetching, + Self::PoVFetching(_) => Protocol::PoVFetching, Self::AvailableDataFetching(_) => Protocol::AvailableDataFetching, } } @@ -67,6 +71,7 @@ impl Requests { match self { Self::ChunkFetching(r) => r.encode_request(), Self::CollationFetching(r) => r.encode_request(), + Self::PoVFetching(r) => r.encode_request(), Self::AvailableDataFetching(r) => r.encode_request(), } } @@ -96,16 +101,19 @@ pub struct OutgoingRequest { } /// Any error that can occur when sending a request. -#[derive(Debug)] +#[derive(Debug, Error)] pub enum RequestError { /// Response could not be decoded. - InvalidResponse(DecodingError), + #[error("Response could not be decoded")] + InvalidResponse(#[source] DecodingError), /// Some error in substrate/libp2p happened. - NetworkError(network::RequestFailure), + #[error("Some network error occurred")] + NetworkError(#[source] network::RequestFailure), /// Response got canceled by networking. - Canceled(oneshot::Canceled), + #[error("Response channel got canceled")] + Canceled(#[source] oneshot::Canceled), } /// Responses received for an `OutgoingRequest`. diff --git a/node/network/protocol/src/request_response/v1.rs b/node/network/protocol/src/request_response/v1.rs index f88fc7f56650..c6dde110b62c 100644 --- a/node/network/protocol/src/request_response/v1.rs +++ b/node/network/protocol/src/request_response/v1.rs @@ -114,6 +114,29 @@ impl IsRequest for CollationFetchingRequest { const PROTOCOL: Protocol = Protocol::CollationFetching; } +/// Request a seconded PoV for the given candidate from a backing validator. +#[derive(Debug, Clone, Encode, Decode)] +pub struct PoVFetchingRequest { + /// Candidate we want a PoV for. + pub candidate_hash: CandidateHash, +} + +/// Responses to `PoVFetchingRequest`. +#[derive(Debug, Clone, Encode, Decode)] +pub enum PoVFetchingResponse { + /// Deliver requested PoV. + #[codec(index = 0)] + PoV(CompressedPoV), + /// PoV was not found in store. + #[codec(index = 1)] + NoSuchPoV, +} + +impl IsRequest for PoVFetchingRequest { + type Response = PoVFetchingResponse; + const PROTOCOL: Protocol = Protocol::PoVFetching; +} + /// Request the entire available data for a candidate.
#[derive(Debug, Clone, Encode, Decode)] pub struct AvailableDataFetchingRequest { diff --git a/node/overseer/src/lib.rs b/node/overseer/src/lib.rs index f2769aabe7be..1661b6dd3d39 100644 --- a/node/overseer/src/lib.rs +++ b/node/overseer/src/lib.rs @@ -83,7 +83,7 @@ use polkadot_subsystem::messages::{ CandidateValidationMessage, CandidateBackingMessage, CandidateSelectionMessage, ChainApiMessage, StatementDistributionMessage, AvailabilityDistributionMessage, BitfieldSigningMessage, BitfieldDistributionMessage, - ProvisionerMessage, PoVDistributionMessage, RuntimeApiMessage, + ProvisionerMessage, RuntimeApiMessage, AvailabilityStoreMessage, NetworkBridgeMessage, AllMessages, CollationGenerationMessage, CollatorProtocolMessage, AvailabilityRecoveryMessage, ApprovalDistributionMessage, ApprovalVotingMessage, GossipSupportMessage, @@ -522,7 +522,6 @@ pub struct Overseer { OverseenSubsystem, OverseenSubsystem, OverseenSubsystem, - OverseenSubsystem, OverseenSubsystem, OverseenSubsystem, OverseenSubsystem, @@ -588,7 +587,7 @@ impl MapSubsystem for F where F: Fn(T) -> U { /// subsystems are implemented and the rest can be mocked with the [`DummySubsystem`]. pub struct AllSubsystems< CV = (), CB = (), CS = (), SD = (), AD = (), AR = (), BS = (), BD = (), P = (), - PoVD = (), RA = (), AS = (), NB = (), CA = (), CG = (), CP = (), ApD = (), ApV = (), + RA = (), AS = (), NB = (), CA = (), CG = (), CP = (), ApD = (), ApV = (), GS = (), > { /// A candidate validation subsystem. @@ -609,8 +608,6 @@ pub struct AllSubsystems< pub bitfield_distribution: BD, /// A provisioner subsystem. pub provisioner: P, - /// A PoV distribution subsystem. - pub pov_distribution: PoVD, /// A runtime API subsystem. pub runtime_api: RA, /// An availability store subsystem. @@ -631,8 +628,8 @@ pub struct AllSubsystems< pub gossip_support: GS, } -impl - AllSubsystems +impl + AllSubsystems { /// Create a new instance of [`AllSubsystems`]. 
/// @@ -665,7 +662,6 @@ impl { AllSubsystems { candidate_validation: DummySubsystem, @@ -677,7 +673,6 @@ impl( self, candidate_validation: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation, candidate_backing: self.candidate_backing, @@ -705,7 +700,6 @@ impl( self, candidate_backing: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing, @@ -733,7 +727,6 @@ impl( self, candidate_selection: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -761,7 +754,6 @@ impl( self, statement_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -789,7 +781,6 @@ impl( self, availability_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -817,7 +808,6 @@ impl( self, availability_recovery: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -845,7 +835,6 @@ impl( self, bitfield_signing: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -873,7 +862,6 @@ impl( self, bitfield_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -901,7 +889,6 @@ impl( self, provisioner: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -929,35 +916,6 @@ impl( - self, - pov_distribution: NEW, - ) -> AllSubsystems { - AllSubsystems { - candidate_validation: self.candidate_validation, - candidate_backing: self.candidate_backing, - candidate_selection: self.candidate_selection, - statement_distribution: self.statement_distribution, - availability_distribution: self.availability_distribution, - availability_recovery: self.availability_recovery, - bitfield_signing: self.bitfield_signing, - bitfield_distribution: self.bitfield_distribution, - provisioner: self.provisioner, - pov_distribution, runtime_api: self.runtime_api, availability_store: self.availability_store, network_bridge: self.network_bridge, @@ -974,7 +932,7 @@ impl( self, runtime_api: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -985,7 +943,6 @@ impl( self, availability_store: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1013,7 +970,6 @@ impl( self, network_bridge: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1041,7 +997,6 @@ impl( self, chain_api: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1069,7 +1024,6 @@ impl( self, collation_generation: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { 
AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1097,7 +1051,6 @@ impl( self, collator_protocol: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1125,7 +1078,6 @@ impl( self, approval_distribution: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1153,7 +1105,6 @@ impl( self, approval_voting: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1181,7 +1132,6 @@ impl( self, gossip_support: NEW, - ) -> AllSubsystems { + ) -> AllSubsystems { AllSubsystems { candidate_validation: self.candidate_validation, candidate_backing: self.candidate_backing, @@ -1209,7 +1159,6 @@ impl AllSubsystems<&'_ CV, &'_ CB, &'_ CS, &'_ SD, &'_ AD, &'_ AR, &'_ BS, &'_ BD, &'_ P, &'_ PoVD, &'_ RA, &'_ AS, &'_ NB, &'_ CA, &'_ CG, &'_ CP, &'_ ApD, &'_ ApV, &'_ GS> { + fn as_ref(&self) -> AllSubsystems<&'_ CV, &'_ CB, &'_ CS, &'_ SD, &'_ AD, &'_ AR, &'_ BS, &'_ BD, &'_ P, &'_ RA, &'_ AS, &'_ NB, &'_ CA, &'_ CG, &'_ CP, &'_ ApD, &'_ ApV, &'_ GS> { AllSubsystems { candidate_validation: &self.candidate_validation, candidate_backing: &self.candidate_backing, @@ -1233,7 +1182,6 @@ impl>::Output, >::Output, >::Output, - >::Output, >::Output, >::Output, >::Output, @@ -1278,7 +1225,6 @@ impl, M: MapSubsystem, M: MapSubsystem

, - M: MapSubsystem, M: MapSubsystem, M: MapSubsystem, M: MapSubsystem, @@ -1299,7 +1245,6 @@ impl = AllSubsystems< T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, - T, T, T, T, + T, T, T, >; /// Overseer Prometheus metrics. @@ -1566,9 +1511,9 @@ where /// # /// # }); } /// ``` - pub fn new( + pub fn new( leaves: impl IntoIterator, - all_subsystems: AllSubsystems, + all_subsystems: AllSubsystems, prometheus_registry: Option<&prometheus::Registry>, mut s: S, ) -> SubsystemResult<(Self, OverseerHandler)> @@ -1582,7 +1527,6 @@ where BS: Subsystem> + Send, BD: Subsystem> + Send, P: Subsystem> + Send, - PoVD: Subsystem> + Send, RA: Subsystem> + Send, AS: Subsystem> + Send, NB: Subsystem> + Send, @@ -1697,16 +1641,6 @@ where TaskKind::Regular, )?; - let pov_distribution_subsystem = spawn( - &mut s, - &mut running_subsystems, - metered::UnboundedMeteredSender::<_>::clone(&to_overseer_tx), - all_subsystems.pov_distribution, - &metrics, - &mut seed, - TaskKind::Regular, - )?; - let runtime_api_subsystem = spawn( &mut s, &mut running_subsystems, @@ -1816,7 +1750,6 @@ where bitfield_signing: bitfield_signing_subsystem, bitfield_distribution: bitfield_distribution_subsystem, provisioner: provisioner_subsystem, - pov_distribution: pov_distribution_subsystem, runtime_api: runtime_api_subsystem, availability_store: availability_store_subsystem, network_bridge: network_bridge_subsystem, @@ -1893,7 +1826,6 @@ where let _ = self.subsystems.bitfield_signing.send_signal(OverseerSignal::Conclude).await; let _ = self.subsystems.bitfield_distribution.send_signal(OverseerSignal::Conclude).await; let _ = self.subsystems.provisioner.send_signal(OverseerSignal::Conclude).await; - let _ = self.subsystems.pov_distribution.send_signal(OverseerSignal::Conclude).await; let _ = self.subsystems.runtime_api.send_signal(OverseerSignal::Conclude).await; let _ = self.subsystems.availability_store.send_signal(OverseerSignal::Conclude).await; let _ = self.subsystems.network_bridge.send_signal(OverseerSignal::Conclude).await; @@ -2072,7 +2004,6 @@ where self.subsystems.bitfield_signing.send_signal(signal.clone()).await?; self.subsystems.bitfield_distribution.send_signal(signal.clone()).await?; self.subsystems.provisioner.send_signal(signal.clone()).await?; - self.subsystems.pov_distribution.send_signal(signal.clone()).await?; self.subsystems.runtime_api.send_signal(signal.clone()).await?; self.subsystems.availability_store.send_signal(signal.clone()).await?; self.subsystems.network_bridge.send_signal(signal.clone()).await?; @@ -2118,9 +2049,6 @@ where AllMessages::Provisioner(msg) => { self.subsystems.provisioner.send_message(msg).await?; }, - AllMessages::PoVDistribution(msg) => { - self.subsystems.pov_distribution.send_message(msg).await?; - }, AllMessages::RuntimeApi(msg) => { self.subsystems.runtime_api.send_message(msg).await?; }, @@ -2244,9 +2172,9 @@ fn spawn( let fut = Box::pin(async move { if let Err(e) = future.await { - tracing::error!(subsystem=name, err = ?e, "subsystem exited with error"); + tracing::error!(target: LOG_TARGET, subsystem=name, err = ?e, "subsystem exited with error"); } else { - tracing::debug!(subsystem=name, "subsystem exited without an error"); + tracing::debug!(target: LOG_TARGET, subsystem=name, "subsystem exited without an error"); } let _ = tx.send(()); }); @@ -3006,10 +2934,6 @@ mod tests { ProvisionerMessage::RequestInherentData(Default::default(), sender) } - fn test_pov_distribution_msg() -> PoVDistributionMessage { - PoVDistributionMessage::NetworkBridgeUpdateV1(test_network_bridge_event()) 
- } - fn test_runtime_api_msg() -> RuntimeApiMessage { let (sender, _) = oneshot::channel(); RuntimeApiMessage::Request(Default::default(), RuntimeApiRequest::Validators(sender)) @@ -3061,7 +2985,6 @@ mod tests { bitfield_signing: subsystem.clone(), bitfield_distribution: subsystem.clone(), provisioner: subsystem.clone(), - pov_distribution: subsystem.clone(), runtime_api: subsystem.clone(), availability_store: subsystem.clone(), network_bridge: subsystem.clone(), @@ -3100,7 +3023,6 @@ mod tests { // handler.send_msg(AllMessages::GossipSupport(test_bitfield_signing_msg())).await; handler.send_msg(AllMessages::BitfieldDistribution(test_bitfield_distribution_msg())).await; handler.send_msg(AllMessages::Provisioner(test_provisioner_msg())).await; - handler.send_msg(AllMessages::PoVDistribution(test_pov_distribution_msg())).await; handler.send_msg(AllMessages::RuntimeApi(test_runtime_api_msg())).await; handler.send_msg(AllMessages::AvailabilityStore(test_availability_store_msg())).await; handler.send_msg(AllMessages::NetworkBridge(test_network_bridge_msg())).await; @@ -3113,7 +3035,7 @@ mod tests { select! { res = overseer_fut => { - const NUM_SUBSYSTEMS: usize = 19; + const NUM_SUBSYSTEMS: usize = 18; assert_eq!(stop_signals_received.load(atomic::Ordering::SeqCst), NUM_SUBSYSTEMS); assert_eq!(signals_received.load(atomic::Ordering::SeqCst), NUM_SUBSYSTEMS); diff --git a/node/service/Cargo.toml b/node/service/Cargo.toml index 9f9b7b58c72d..fab0327c2cb4 100644 --- a/node/service/Cargo.toml +++ b/node/service/Cargo.toml @@ -93,7 +93,6 @@ polkadot-node-core-candidate-validation = { path = "../core/candidate-validation polkadot-node-core-chain-api = { path = "../core/chain-api", optional = true } polkadot-node-core-provisioner = { path = "../core/provisioner", optional = true } polkadot-node-core-runtime-api = { path = "../core/runtime-api", optional = true } -polkadot-pov-distribution = { path = "../network/pov-distribution", optional = true } polkadot-statement-distribution = { path = "../network/statement-distribution", optional = true } polkadot-approval-distribution = { path = "../network/approval-distribution", optional = true } polkadot-node-core-approval-voting = { path = "../core/approval-voting", optional = true } @@ -139,7 +138,6 @@ real-overseer = [ "polkadot-node-core-chain-api", "polkadot-node-core-provisioner", "polkadot-node-core-runtime-api", - "polkadot-pov-distribution", "polkadot-statement-distribution", "polkadot-approval-distribution", ] diff --git a/node/service/src/lib.rs b/node/service/src/lib.rs index cefe716298a8..b0be6abd8acc 100644 --- a/node/service/src/lib.rs +++ b/node/service/src/lib.rs @@ -450,7 +450,6 @@ where use polkadot_node_collation_generation::CollationGenerationSubsystem; use polkadot_collator_protocol::{CollatorProtocolSubsystem, ProtocolSide}; use polkadot_network_bridge::NetworkBridge as NetworkBridgeSubsystem; - use polkadot_pov_distribution::PoVDistribution as PoVDistributionSubsystem; use polkadot_node_core_provisioner::ProvisioningSubsystem as ProvisionerSubsystem; use polkadot_node_core_runtime_api::RuntimeApiSubsystem; use polkadot_statement_distribution::StatementDistribution as StatementDistributionSubsystem; @@ -518,9 +517,6 @@ where authority_discovery, request_multiplexer, ), - pov_distribution: PoVDistributionSubsystem::new( - Metrics::register(registry)?, - ), provisioner: ProvisionerSubsystem::new( spawner.clone(), (), diff --git a/node/subsystem/src/messages.rs b/node/subsystem/src/messages.rs index ec463dc01e08..a8fcf9c3e85a 100644 --- 
a/node/subsystem/src/messages.rs +++ b/node/subsystem/src/messages.rs @@ -264,10 +264,29 @@ impl NetworkBridgeMessage { } /// Availability Distribution Message. -#[derive(Debug, derive_more::From)] +#[derive(Debug)] pub enum AvailabilityDistributionMessage { /// Incoming network request for an availability chunk. - ChunkFetchingRequest(IncomingRequest) + ChunkFetchingRequest(IncomingRequest), + /// Incoming network request for a seconded PoV. + PoVFetchingRequest(IncomingRequest), + /// Instruct availability distribution to fetch a remote PoV. + /// + /// NOTE: The result of this fetch is not yet locally validated and could be bogus. + FetchPoV { + /// The relay parent giving the necessary context. + relay_parent: Hash, + /// Validator to fetch the PoV from. + from_validator: ValidatorIndex, + /// Candidate hash to fetch the PoV for. + candidate_hash: CandidateHash, + /// Expected hash of the PoV, a PoV not matching this hash will be rejected. + pov_hash: Hash, + /// Sender for getting back the result of this fetch. + /// + /// The sender will be canceled if the fetching failed for some reason. + tx: oneshot::Sender, + }, } /// Availability Recovery Message. @@ -285,15 +304,6 @@ pub enum AvailabilityRecoveryMessage { AvailableDataFetchingRequest(IncomingRequest), } -impl AvailabilityDistributionMessage { - /// If the current variant contains the relay parent hash, return it. - pub fn relay_parent(&self) -> Option { - match self { - Self::ChunkFetchingRequest(_) => None, - } - } -} - /// Bitfield distribution message. #[derive(Debug, derive_more::From)] pub enum BitfieldDistributionMessage { @@ -568,33 +578,6 @@ impl BoundToRelayParent for ProvisionerMessage { } } -/// Message to the PoV Distribution subsystem. -#[derive(Debug, derive_more::From)] -pub enum PoVDistributionMessage { - /// Fetch a PoV from the network. - /// - /// This `CandidateDescriptor` should correspond to a candidate seconded under the provided - /// relay-parent hash. - FetchPoV(Hash, CandidateDescriptor, oneshot::Sender>), - /// Distribute a PoV for the given relay-parent and CandidateDescriptor. - /// The PoV should correctly hash to the PoV hash mentioned in the CandidateDescriptor - DistributePoV(Hash, CandidateDescriptor, Arc), - /// An update from the network bridge. - #[from] - NetworkBridgeUpdateV1(NetworkBridgeEvent), -} - -impl PoVDistributionMessage { - /// If the current variant contains the relay parent hash, return it. - pub fn relay_parent(&self) -> Option { - match self { - Self::FetchPoV(hash, _, _) => Some(*hash), - Self::DistributePoV(hash, _, _) => Some(*hash), - Self::NetworkBridgeUpdateV1(_) => None, - } - } -} - /// Message to the Collation Generation subsystem. #[derive(Debug)] pub enum CollationGenerationMessage { @@ -717,8 +700,6 @@ pub enum AllMessages { /// Message for the Provisioner subsystem. #[skip] Provisioner(ProvisionerMessage), - /// Message for the PoV Distribution subsystem. - PoVDistribution(PoVDistributionMessage), /// Message for the Runtime API subsystem. 
#[skip] RuntimeApi(RuntimeApiMessage), @@ -741,6 +722,28 @@ pub enum AllMessages { GossipSupport(GossipSupportMessage), } +impl From> for AvailabilityDistributionMessage { + fn from(req: IncomingRequest) -> Self { + Self::PoVFetchingRequest(req) + } +} +impl From> for AvailabilityDistributionMessage { + fn from(req: IncomingRequest) -> Self { + Self::ChunkFetchingRequest(req) + } +} +impl From> for CollatorProtocolMessage { + fn from(req: IncomingRequest) -> Self { + Self::CollationFetchingRequest(req) + } +} + + +impl From> for AllMessages { + fn from(req: IncomingRequest) -> Self { + From::::from(From::from(req)) + } +} impl From> for AllMessages { fn from(req: IncomingRequest) -> Self { From::::from(From::from(req)) @@ -751,11 +754,6 @@ impl From> for AllMessages From::::from(From::from(req)) } } -impl From> for CollatorProtocolMessage { - fn from(req: IncomingRequest) -> Self { - Self::CollationFetchingRequest(req) - } -} impl From> for AllMessages { fn from(req: IncomingRequest) -> Self { From::::from(From::from(req)) diff --git a/roadmap/implementers-guide/src/node/availability/availability-distribution.md b/roadmap/implementers-guide/src/node/availability/availability-distribution.md index db7fdebe8293..da12cf4b33cb 100644 --- a/roadmap/implementers-guide/src/node/availability/availability-distribution.md +++ b/roadmap/implementers-guide/src/node/availability/availability-distribution.md @@ -1,49 +1,79 @@ # Availability Distribution -Distribute availability erasure-coded chunks to validators. - -After a candidate is backed, the availability of the PoV block must be confirmed -by 2/3+ of all validators. Backing nodes will serve chunks for a PoV block from -their [Availability Store](../utility/availability-store.md), all other -validators request their chunks from backing nodes and store those received chunks in -their local availability store. +This subsystem is responsible for distributing availability data to peers. +Availability data are chunks, `PoV`s and `AvailableData` (which is `PoV` + +`PersistedValidationData`). It does so via request/response protocols. + +In particular, this subsystem is responsible for: + +- Responding to network requests for availability data by querying the + [Availability Store](../utility/availability-store.md). +- Requesting chunks from backing validators and storing them in the local + `Availability Store` whenever we find an occupied core on the chain. This + happens after a candidate is backed and is what ensures availability by 2/3+ + of all validators. +- Fetching `PoV`s from validators when requested via the `FetchPoV` message + from backing (the `pov_requester` module). + +The backing subsystem is responsible for making available data available in the +local `Availability Store` upon validation. This subsystem will serve any +network requests by querying that store. ## Protocol -This subsystem has no associated peer set right now, but instead relies on -a request/response protocol, defined by `Protocol::ChunkFetching`. +This subsystem does not handle any peer set messages, but the `pov_requester` +does connect to validators of the same backing group on the validation peer +set, to ensure fast propagation of statements between those validators and to +have connections already established for requesting `PoV`s. Other than that, +this subsystem drives request/response protocols.
Input: - OverseerSignal::ActiveLeaves(`[ActiveLeavesUpdate]`) - AvailabilityDistributionMessage{msg: ChunkFetchingRequest} +- AvailabilityDistributionMessage{msg: PoVFetchingRequest} +- AvailabilityDistributionMessage{msg: FetchPoV} Output: - NetworkBridgeMessage::SendRequests(`[Requests]`, IfDisconnected::TryConnect) - AvailabilityStore::QueryChunk(candidate_hash, index, response_channel) - AvailabilityStore::StoreChunk(candidate_hash, chunk) +- AvailabilityStore::QueryAvailableData(candidate_hash, response_channel) - RuntimeApiRequest::SessionIndexForChild - RuntimeApiRequest::SessionInfo - RuntimeApiRequest::AvailabilityCores ## Functionality -### Requesting +### PoV Requester -This subsystems monitors currently occupied cores for all active leaves. For -each occupied core it will spawn a task fetching the erasure chunk which has the -`ValidatorIndex` of the node. For this an `ChunkFetchingRequest` is -issued, via substrate's generic request/response protocol. +The PoV requester in the `pov_requester` module takes care of staying connected +to the validators of this validator's current backing group on the `Validation` +peer set. It handles `FetchPoV` requests by issuing network requests to +those validators. It checks the hash of the received `PoV`, but does not do any +further validation; that needs to be done by the original `FetchPoV` sender +(the backing subsystem). + +### Chunk Requester + +After a candidate is backed, the availability of the PoV block must be confirmed +by 2/3+ of all validators. The chunk requester is responsible for making that +availability a reality. + +It does that by checking the occupied cores for all active leaves. For each +occupied core it will spawn a task fetching the erasure chunk which has the +`ValidatorIndex` of the node. For this, a `ChunkFetchingRequest` is issued via +Substrate's generic request/response protocol. The spawned task will start trying to fetch the chunk from validators in responsible group of the occupied core, in a random order. For ensuring that we -use already open TCP connections wherever possible, the subsystem maintains a +use already open TCP connections wherever possible, the requester maintains a cache and preserves that random order for the entire session. Note however that, because not all validators in a group have to be actual backers, not all of them are required to have the needed chunk. This in turn -could lead to low throughput, as we have to wait for a fetches to fail, +could lead to low throughput, as we have to wait for fetches to fail, before reaching a validator finally having our chunk. We do rank back validators not delivering our chunk, but as backers could vary from block to block on a perfectly legitimate basis, this is still not ideal. See issues [2509](https://github.com/paritytech/polkadot/issues/2509) and [2512](https://github.com/paritytech/polkadot/issues/2512) @@ -59,6 +89,10 @@ as we would like as many validators as possible to have their chunk. See this ### Serving -On the other side the subsystem will listen for incoming -`ChunkFetchingRequest`s from the network bridge and will respond to -queries, by looking the requested chunk up in the availability store. +On the other side the subsystem will listen for incoming `ChunkFetchingRequest`s +and `PoVFetchingRequest`s from the network bridge and will respond to queries +by looking the requested chunks and `PoV`s up in the availability store; this +happens in the `responder` module.
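+
+As a rough sketch (illustrative only, not the actual `responder` API), answering a
+`PoVFetchingRequest` boils down to querying the availability store for the candidate's
+`AvailableData` and replying with either the compressed `PoV` or `NoSuchPoV`:
+
+```rust
+// Sketch: `available_data` is assumed to have been obtained beforehand via an
+// `AvailabilityStore::QueryAvailableData(candidate_hash, tx)` query.
+fn answer_pov_request(available_data: Option<AvailableData>) -> PoVFetchingResponse {
+  match available_data {
+    // The `PoV` is part of `AvailableData`; compress it before sending it out.
+    Some(data) => match CompressedPoV::compress(&data.pov) {
+      Ok(compressed) => PoVFetchingResponse::PoV(compressed),
+      // Compression failing is treated like missing data in this sketch.
+      Err(_) => PoVFetchingResponse::NoSuchPoV,
+    },
+    // We never stored (or have already pruned) data for this candidate.
+    None => PoVFetchingResponse::NoSuchPoV,
+  }
+}
+```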
+ +We rely on the backing subsystem to make available data available locally in the +`Availability Store` after it has validated it. diff --git a/roadmap/implementers-guide/src/node/backing/candidate-backing.md b/roadmap/implementers-guide/src/node/backing/candidate-backing.md index 016c50967494..c44f5e1f09b1 100644 --- a/roadmap/implementers-guide/src/node/backing/candidate-backing.md +++ b/roadmap/implementers-guide/src/node/backing/candidate-backing.md @@ -18,7 +18,7 @@ Output: - [`RuntimeApiMessage`][RAM] - [`CandidateSelectionMessage`][CSM] - [`ProvisionerMessage`][PM] -- [`PoVDistributionMessage`][PDM] +- [`AvailabilityDistributionMessage`][ADM] - [`StatementDistributionMessage`][SDM] ## Functionality @@ -39,8 +39,11 @@ The subsystem should maintain a set of handles to Candidate Backing Jobs that ar ### On Receiving `CandidateBackingMessage` * If the message is a [`CandidateBackingMessage`][CBM]`::GetBackedCandidates`, get all backable candidates from the statement table and send them back. -* If the message is a [`CandidateBackingMessage`][CBM]`::Second`, sign and dispatch a `Seconded` statement only if we have not seconded any other candidate and have not signed a `Valid` statement for the requested candidate. Signing both a `Seconded` and `Valid` message is a double-voting misbehavior with a heavy penalty, and this could occur if another validator has seconded the same candidate and we've received their message before the internal seconding request. After successfully dispatching the `Seconded` statement we have to distribute the PoV. -* If the message is a [`CandidateBackingMessage`][CBM]`::Statement`, count the statement to the quorum. If the statement in the message is `Seconded` and it contains a candidate that belongs to our assignment, request the corresponding `PoV` from the `PoVDistribution` and launch validation. Issue our own `Valid` or `Invalid` statement as a result. +* If the message is a [`CandidateBackingMessage`][CBM]`::Second`, sign and dispatch a `Seconded` statement only if we have not seconded any other candidate and have not signed a `Valid` statement for the requested candidate. Signing both a `Seconded` and `Valid` message is a double-voting misbehavior with a heavy penalty, and this could occur if another validator has seconded the same candidate and we've received their message before the internal seconding request. +* If the message is a [`CandidateBackingMessage`][CBM]`::Statement`, count the statement towards the quorum. If the statement in the message is `Seconded` and it contains a candidate that belongs to our assignment, request the corresponding `PoV` from the backing node via `AvailabilityDistribution` and launch validation. Issue our own `Valid` or `Invalid` statement as a result. + +If the seconding node did not provide us with the `PoV`, we will retry fetching from other backing validators. + + > big TODO: "contextual execution" > @@ -112,11 +115,8 @@ fn spawn_validation_work(candidate, parachain head, validation function) { ### Fetch Pov Block Create a `(sender, receiver)` pair. -Dispatch a [`PoVDistributionMessage`][PDM]`::FetchPoV(relay_parent, candidate_hash, sender)` and listen on the receiver for a response. - -### Distribute Pov Block +Dispatch an [`AvailabilityDistributionMessage`][ADM]`::FetchPoV{ validator_index, pov_hash, candidate_hash, tx }` and listen on the passed receiver for a response.
Availability distribution will send the request to the validator specified by `validator_index`, which might not be serving it for whatever reasons, therefore we need to retry with other backing validators in that case. -Dispatch a [`PoVDistributionMessage`][PDM]`::DistributePoV(relay_parent, candidate_descriptor, pov)`. ### Validate PoV Block @@ -135,7 +135,7 @@ Dispatch a [`StatementDistributionMessage`][PDM]`::Share(relay_parent, SignedFul [CVM]: ../../types/overseer-protocol.md#validation-request-type [PM]: ../../types/overseer-protocol.md#provisioner-message [CBM]: ../../types/overseer-protocol.md#candidate-backing-message -[PDM]: ../../types/overseer-protocol.md#pov-distribution-message +[ADM]: ../../types/overseer-protocol.md#availability-distribution-message [SDM]: ../../types/overseer-protocol.md#statement-distribution-message [CS]: candidate-selection.md diff --git a/roadmap/implementers-guide/src/node/backing/pov-distribution.md b/roadmap/implementers-guide/src/node/backing/pov-distribution.md deleted file mode 100644 index 3cf6dd995aa5..000000000000 --- a/roadmap/implementers-guide/src/node/backing/pov-distribution.md +++ /dev/null @@ -1,110 +0,0 @@ -# PoV Distribution - -This subsystem is responsible for distributing PoV blocks. For now, unified with [Statement Distribution subsystem](statement-distribution.md). - -## Protocol - -`PeerSet`: `Validation` - -Input: [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message) - - -Output: - -- NetworkBridge::SendMessage(`[PeerId]`, message) -- NetworkBridge::ReportPeer(PeerId, cost_or_benefit) - - -## Functionality - -This network protocol is responsible for distributing [`PoV`s](../../types/availability.md#proof-of-validity) by gossip. Since PoVs are heavy in practice, gossip is far from the most efficient way to distribute them. In the future, this should be replaced by a better network protocol that finds validators who have validated the block and connects to them directly. This protocol is described. - -This protocol is described in terms of "us" and our peers, with the understanding that this is the procedure that any honest node will run. It has the following goals: - - We never have to buffer an unbounded amount of data - - PoVs will flow transitively across a network of honest nodes, stemming from the validators that originally seconded candidates requiring those PoVs. - -As we are gossiping, we need to track which PoVs our peers are waiting for to avoid sending them data that they are not expecting. It is not reasonable to expect our peers to buffer unexpected PoVs, just as we will not buffer unexpected PoVs. So notifying our peers about what is being awaited is key. However it is important that the notifications system is also bounded. - -For this, in order to avoid reaching into the internals of the [Statement Distribution](statement-distribution.md) Subsystem, we can rely on an expected property of candidate backing: that each validator can second up to 2 candidates per chain head. This will typically be only one, because they are only supposed to issue one, but they can equivocate if they are willing to be slashed. So we can set a cap on the number of PoVs each peer is allowed to notify us that they are waiting for at a given relay-parent. This cap will be twice the number of validators at that relay-parent. In practice, this is a very lax upper bound that can be reduced much further if desired. 
- -The view update mechanism of the [Network Bridge](../utility/network-bridge.md) ensures that peers are only allowed to consider a certain set of relay-parents as live. So this bounding mechanism caps the amount of data we need to store per peer at any time at `sum({ 2 * n_validators_at_head(head) * sizeof(hash) for head in view_heads })`. Additionally, peers should only be allowed to notify us of PoV hashes they are waiting for in the context of relay-parents in our own local view, which means that `n_validators_at_head` is implied to be `0` for relay-parents not in our own local view. - -View updates from peers and our own view updates are received from the network bridge. These will lag somewhat behind the `ActiveLeavesUpdate` messages received from the overseer, which will influence the actual data we store. The `OurViewUpdate`s from the [`NetworkBridgeEvent`](../../types/overseer-protocol.md#network-bridge-update) must be considered canonical in terms of our peers' perception of us. - -Lastly, the system needs to be bootstrapped with our own perception of which PoVs we are cognizant of but awaiting data for. This is done by receipt of the [`PoVDistributionMessage`](../../types/overseer-protocol.md#pov-distribution-message)::FetchPoV variant. Proper operation of this subsystem depends on the descriptors passed faithfully representing candidates which have been seconded by other validators. - -## Formal Description - -This protocol can be implemented as a state machine with the following state: - -```rust -struct State { - relay_parent_state: Map, - peer_state: Map, - our_view: View, -} - -struct BlockBasedState { - known: Map, // should be a shared PoV in practice. these things are heavy. - fetching: Map]>, - n_validators: usize, -} - -struct PeerState { - awaited: Map>, -} -``` - -We also use the [`PoVDistributionV1Message`](../../types/network.md#pov-distribution) as our `NetworkMessage`, which are sent and received by the [Network Bridge](../utility/network-bridge.md) - -Here is the logic of the state machine: - -*Overseer Signals* -- On `ActiveLeavesUpdate(relay_parent)`: - - For each relay-parent in the `activated` list: - - Get the number of validators at that relay parent by querying the [Runtime API](../utility/runtime-api.md) for the validators and then counting them. - - Create a blank entry in `relay_parent_state` under `relay_parent` with correct `n_validators` set. - - For each relay-parent in the `deactivated` list: - - Remove the entry for `relay_parent` from `relay_parent_state`. -- On `Conclude`: conclude. - -*PoV Distribution Messages* -- On `FetchPoV(relay_parent, descriptor, response_channel)` - - If there is no entry in `relay_parent_state` under `relay_parent`, ignore. - - If there is a PoV under `descriptor.pov_hash` in the `known` map, send that PoV on the channel and return. - - Otherwise, place the `response_channel` in the `fetching` map under `descriptor.pov_hash`. - - If the `pov_hash` had no previous entry in `fetching` and there are `2 * n_validators` or fewer entries in the `fetching` set, send `NetworkMessage::Awaiting(relay_parent, vec![pov_hash])` to all peers. -- On `DistributePoV(relay_parent, descriptor, PoV)` - - If there is no entry in `relay_parent_state` under `relay_parent`, ignore. - - Complete and remove any channels under `descriptor.pov_hash` in the `fetching` map. 
-  - Send `NetworkMessage::SendPoV(relay_parent, descriptor.pov_hash, PoV)` to all peers who have the `descriptor.pov_hash` in the set under `relay_parent` in the `peer.awaited` map and remove the entry from `peer.awaited`.
-  - Note the PoV under `descriptor.pov_hash` in `known`.
-
-*Network Bridge Updates*
-- On `PeerConnected(peer_id, observed_role)`
-  - Make a fresh entry in the `peer_state` map for the `peer_id`.
-- On `PeerDisconnected(peer_id)`
-  - Remove the entry for `peer_id` from the `peer_state` map.
-- On `PeerMessage(peer_id, bytes)`
-  - If the bytes do not decode to a `NetworkMessage` or the `peer_id` has no entry in the `peer_state` map, report and ignore.
-  - If this is `NetworkMessage::Awaiting(relay_parent, pov_hashes)`:
-    - If there is no entry under `peer_state.awaited` for the `relay_parent`, report and ignore.
-    - If `relay_parent` is not contained within `our_view`, report and ignore.
-    - Otherwise, if the peer's `awaited` map combined with the `pov_hashes` would have more than `2 * relay_parent_state[relay_parent].n_validators` entries, report and ignore. Note that we are leaning on the property of the network bridge that it sets our view based on `activated` heads in `ActiveLeavesUpdate` signals.
-    - For each new `pov_hash` in `pov_hashes`, if there is a `pov` under `pov_hash` in the `known` map, send the peer a `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`.
-    - Otherwise, add the `pov_hash` to the `awaited` map.
-  - If this is `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)`:
-    - If there is no entry under `relay_parent` in `relay_parent_state` or no entry under `pov_hash` in our `fetching` map for that `relay_parent`, report and ignore.
-    - If the blake2-256 hash of the pov doesn't equal `pov_hash`, report and ignore.
-    - Complete and remove any listeners in the `fetching` map under `pov_hash`. However, leave an empty set of listeners in the `fetching` map to denote that this was something we once awaited. This will allow us to recognize peers who have sent us something we were expecting, but just a little late.
-    - Add to `known` map.
-    - Remove the `pov_hash` from the `peer.awaited` map, if any.
-    - Send `NetworkMessage::SendPoV(relay_parent, pov_hash, pov)` to all peers who have the `pov_hash` in the set under `relay_parent` in the `peer.awaited` map and remove the entry from `peer.awaited`.
-- On `PeerViewChange(peer_id, view)`
-  - If the peer is unknown, ignore.
-  - Ensure there is an entry under `relay_parent` for each `relay_parent` in `view` within the `peer.awaited` map, creating blank `awaited` lists as necessary.
-  - Remove all entries under `peer.awaited` that are not within `view`.
-  - For all hashes in `view` that were not within the old view, send the peer all the keys in our `fetching` map under the block-based state for that hash - i.e. notify the peer of everything we are awaiting at that hash.
-- On `OurViewChange(view)`
-  - Update `our_view` to `view`.
-
diff --git a/roadmap/implementers-guide/src/types/overseer-protocol.md b/roadmap/implementers-guide/src/types/overseer-protocol.md
index cedf01d3f881..2eb94c42fdce 100644
--- a/roadmap/implementers-guide/src/types/overseer-protocol.md
+++ b/roadmap/implementers-guide/src/types/overseer-protocol.md
@@ -136,13 +136,27 @@ This is a network protocol that receives messages of type [`AvailabilityDistribu
 ```rust
 enum AvailabilityDistributionMessage {
-	/// Distribute an availability chunk to other validators.
-	DistributeChunk(Hash, ErasureChunk),
-	/// Fetch an erasure chunk from network by candidate hash and chunk index.
-	FetchChunk(Hash, u32),
-	/// Event from the network.
-	/// An update on network state from the network bridge.
-	NetworkBridgeUpdateV1(NetworkBridgeEvent),
+	/// Incoming network request for an availability chunk.
+	ChunkFetchingRequest(IncomingRequest<req_res_v1::ChunkFetchingRequest>),
+	/// Incoming network request for a seconded PoV.
+	PoVFetchingRequest(IncomingRequest<req_res_v1::PoVFetchingRequest>),
+	/// Instruct availability distribution to fetch a remote PoV.
+	///
+	/// NOTE: The result of this fetch is not yet locally validated and could be bogus.
+	FetchPoV {
+		/// The relay parent giving the necessary context.
+		relay_parent: Hash,
+		/// Validator to fetch the PoV from.
+		from_validator: ValidatorIndex,
+		/// Candidate hash to fetch the PoV for.
+		candidate_hash: CandidateHash,
+		/// Expected hash of the PoV; a PoV not matching this hash will be rejected.
+		pov_hash: Hash,
+		/// Sender for getting back the result of this fetch.
+		///
+		/// The sender will be canceled if the fetching failed for some reason.
+		tx: oneshot::Sender<PoV>,
+	},
 }
 ```
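For orientation, here is a hedged sketch of how a requesting subsystem, such as candidate backing, might drive the new `FetchPoV` variant. The `Hash`/`PoV` stand-ins and the `send` closure are assumptions made for a self-contained example, not the node's real primitives or overseer plumbing; the point is only that a dropped sender surfaces as `oneshot::Canceled`, which the requester treats as "this validator did not provide the PoV" and retries with another backing validator, as described above.

```rust
use futures::channel::oneshot;

// Illustrative stand-ins; the real types live in the Polkadot node primitives.
type Hash = [u8; 32];
type CandidateHash = Hash;
type ValidatorIndex = u32;
struct PoV {
    block_data: Vec<u8>,
}

enum AvailabilityDistributionMessage {
    FetchPoV {
        relay_parent: Hash,
        from_validator: ValidatorIndex,
        candidate_hash: CandidateHash,
        pov_hash: Hash,
        tx: oneshot::Sender<PoV>,
    },
}

/// Ask availability distribution for a PoV and wait for the response.
/// Returns `None` when the sender was dropped, i.e. the fetch failed and the
/// caller should retry with a different backing validator.
async fn fetch_pov(
    send: impl FnOnce(AvailabilityDistributionMessage),
    relay_parent: Hash,
    from_validator: ValidatorIndex,
    candidate_hash: CandidateHash,
    pov_hash: Hash,
) -> Option<PoV> {
    let (tx, rx) = oneshot::channel();
    send(AvailabilityDistributionMessage::FetchPoV {
        relay_parent,
        from_validator,
        candidate_hash,
        pov_hash,
        tx,
    });
    // A canceled receiver means the fetch failed for some reason.
    rx.await.ok()
}
```

Note that the requester never has to check the PoV against `pov_hash` itself: per the doc comment on `FetchPoV`, availability distribution rejects a PoV that does not match the expected hash before answering on `tx`.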