From 30c32151f552ff9460352b8bad402a0a8aa64d3e Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 2 Jan 2023 13:48:44 -0500 Subject: [PATCH 01/13] pre-checking: Reject failed PVFs --- node/core/candidate-validation/src/lib.rs | 3 +-- node/core/pvf-checker/src/lib.rs | 11 ++++---- node/core/pvf-checker/src/tests.rs | 25 ++++++++++++------- .../src/node/utility/pvf-prechecker.md | 8 +++--- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs index 70fc24eacade..3ceb658a5b92 100644 --- a/node/core/candidate-validation/src/lib.rs +++ b/node/core/candidate-validation/src/lib.rs @@ -295,8 +295,7 @@ where _ => { // The reasoning why this is "failed" and not invalid is because we assume that // during pre-checking voting the relay-chain will pin the code. In case the code - // actually is not there, we issue failed since this looks more like a bug. This - // leads to us abstaining. + // actually is not there, we issue failed since this looks more like a bug. gum::warn!( target: LOG_TARGET, ?relay_parent, diff --git a/node/core/pvf-checker/src/lib.rs b/node/core/pvf-checker/src/lib.rs index f31f3f728aa6..2a428fe7baf0 100644 --- a/node/core/pvf-checker/src/lib.rs +++ b/node/core/pvf-checker/src/lib.rs @@ -190,16 +190,15 @@ async fn handle_pvf_check( PreCheckOutcome::Valid => Judgement::Valid, PreCheckOutcome::Invalid => Judgement::Invalid, PreCheckOutcome::Failed => { - // Abstain. - // - // Returning here will leave the PVF in the view dangling. Since it is there, no new - // pre-checking request will be sent. + // Always vote against in case of failures. Voting against a PVF when encountering a + // timeout (or an unlikely node-specific issue) can be considered safe, since + // there is no slashing for being on the wrong side on a pre-check vote. 
gum::info!( target: LOG_TARGET, ?validation_code_hash, - "Pre-check failed, abstaining from voting", + "Pre-check failed, voting against", ); - return + Judgement::Invalid }, }; diff --git a/node/core/pvf-checker/src/tests.rs b/node/core/pvf-checker/src/tests.rs index f47e642ae98d..8bf0b62f626e 100644 --- a/node/core/pvf-checker/src/tests.rs +++ b/node/core/pvf-checker/src/tests.rs @@ -157,7 +157,7 @@ impl TestState { self.active_leaves_update(handle, Some(leaf), None, &[]).await } - async fn deactive_leaves( + async fn deactivate_leaves( &mut self, handle: &mut VirtualOverseer, deactivated: impl IntoIterator, @@ -894,8 +894,8 @@ fn unexpected_pvf_check_judgement() { // Catch the pre-check request, but don't reply just yet. let pre_check = test_state.expect_candidate_precheck(&mut handle).await; - // Now deactive the leaf and reply to the precheck request. - test_state.deactive_leaves(&mut handle, &[block_1.block_hash]).await; + // Now deactivate the leaf and reply to the precheck request. + test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await; pre_check.reply(PreCheckOutcome::Invalid); // the subsystem must remain silent. @@ -906,14 +906,17 @@ fn unexpected_pvf_check_judgement() { }); } +// Check that we do not abstain for a nondeterministic failure. Currently, this means the behavior +// is the same as if the pre-check returned `PreCheckOutcome::Invalid`. 
#[test] -fn abstain_for_nondeterministic_pvfcheck_failure() { +fn dont_abstain_for_nondeterministic_pvfcheck_failure() { test_harness(|mut test_state, mut handle| { async move { + let block_1 = FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]); test_state .activate_leaf_with_session( &mut handle, - FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]), + block_1.clone(), StartsNewSession { session_index: 2, validators: vec![OUR_VALIDATOR] }, ) .await; @@ -922,10 +925,14 @@ fn abstain_for_nondeterministic_pvfcheck_failure() { test_state.expect_session_for_child(&mut handle).await; test_state.expect_validators(&mut handle).await; - test_state - .expect_candidate_precheck(&mut handle) - .await - .reply(PreCheckOutcome::Failed); + // Catch the pre-check request, but don't reply just yet. + let pre_check = test_state.expect_candidate_precheck(&mut handle).await; + + // Now deactivate the leaf and reply to the precheck request. + test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await; + pre_check.reply(PreCheckOutcome::Failed); + + // the subsystem must remain silent. test_state.send_conclude(&mut handle).await; } diff --git a/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md b/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md index fd75ce9e3804..7320bf9f07e8 100644 --- a/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md +++ b/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md @@ -16,11 +16,13 @@ To be relevant for the subsystem, a PVF must be returned by the [`pvfs_require_p When a PVF just becomes relevant, the subsystem will send a message to the [Candidate Validation] subsystem asking for the pre-check. 
-Upon receving a message from the candidate-validation subsystem, the pre-checker will note down that the PVF has its judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case, a judgement was received for a PVF that is no longer in view it is ignored. It is possible that the candidate validation was not able to check the PVF. In that case, the PVF pre-checker will abstain and won't submit any check statements. +Upon receving a message from the candidate-validation subsystem, the pre-checker will note down that the PVF has its judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case, a judgement was received for a PVF that is no longer in view it is ignored. -Since a vote only is valid during [one session][overview], the subsystem will have to resign and submit the statements for for the new session. The new session is assumed to be started if at least one of the leaves has a greater session index that was previously observed in any of the leaves. +It is possible that the candidate validation was not able to check the PVF, e.g. if it timed out. In that case, the PVF pre-checker will vote against it. This is considered safe, as there is no slashing for being on the wrong side of a pre-check vote. -The subsystem tracks all the statement that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session. +Since a vote only is valid during [one session][overview], the subsystem will have to resign and submit the statements for the new session. The new session is assumed to be started if at least one of the leaves has a greater session index that was previously observed in any of the leaves. 
+ +The subsystem tracks all the statements that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session. If the node is not in the active validator set, it will still perform all the checks. However, it will only submit the check statements when the node is in the active validator set. From d14c9798d454edb8c771fa17075c554a2cebeb0e Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 2 Jan 2023 14:05:18 -0500 Subject: [PATCH 02/13] paras: immediately reject any PVF that cannot reach a supermajority --- runtime/parachains/src/paras/mod.rs | 32 ++++++++++++++------------- runtime/parachains/src/paras/tests.rs | 17 ++++++++++++-- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/runtime/parachains/src/paras/mod.rs b/runtime/parachains/src/paras/mod.rs index 0fca9a004099..3d22324079c4 100644 --- a/runtime/parachains/src/paras/mod.rs +++ b/runtime/parachains/src/paras/mod.rs @@ -73,13 +73,14 @@ //! //! # PVF Pre-checking //! -//! As was mentioned above, a brand new validation code should go through a process of approval. -//! As part of this process, validators from the active set will take the validation code and -//! check if it is malicious. Once they did that and have their judgement, either accept or reject, -//! they issue a statement in a form of an unsigned extrinsic. This extrinsic is processed by this -//! pallet. Once supermajority is gained for accept, then the process that initiated the check -//! is resumed (as mentioned before this can be either upgrading of validation code or onboarding). -//! If supermajority is gained for reject, then the process is canceled. +//! As was mentioned above, a brand new validation code should go through a process of approval. As +//! part of this process, validators from the active set will take the validation code and check if +//! it is malicious. 
Once they did that and have their judgement, either accept or reject, they +//! issue a statement in a form of an unsigned extrinsic. This extrinsic is processed by this +//! pallet. Once supermajority is gained for accept, then the process that initiated the check is +//! resumed (as mentioned before this can be either upgrading of validation code or onboarding). If +//! getting a supermajority becomes impossible (>1/3 of validators have already voted against), then +//! we reject. //! //! Below is a state diagram that depicts states of a single PVF pre-checking vote. //! @@ -92,8 +93,8 @@ //! │ │ │ //! │ ┌───────┐ //! │ │ │ -//! └─▶│ init │────supermajority ┌──────────┐ -//! │ │ against │ │ +//! └─▶│ init │──── >1/3 against ┌──────────┐ +//! │ │ │ │ │ //! └───────┘ └──────────▶│ rejected │ //! ▲ │ │ │ //! │ │ session └──────────┘ @@ -452,12 +453,13 @@ impl PvfCheckActiveVoteState { /// Returns `None` if the quorum is not reached, or the direction of the decision. fn quorum(&self, n_validators: usize) -> Option { - let q_threshold = primitives::v2::supermajority_threshold(n_validators); - // NOTE: counting the reject votes is deliberately placed first. This is to err on the safe. - if self.votes_reject.count_ones() >= q_threshold { - Some(PvfCheckOutcome::Rejected) - } else if self.votes_accept.count_ones() >= q_threshold { + let accept_threshold = primitives::v2::supermajority_threshold(n_validators); + let reject_threshold = primitives::v2::byzantine_threshold(n_validators); + + if self.votes_accept.count_ones() >= accept_threshold { Some(PvfCheckOutcome::Accepted) + } else if self.votes_reject.count_ones() > reject_threshold { + Some(PvfCheckOutcome::Rejected) } else { None } @@ -1011,7 +1013,7 @@ pub mod pallet { } if let Some(outcome) = active_vote.quorum(validators.len()) { - // The supermajority quorum has been achieved. + // The quorum has been achieved. 
// // Remove the PVF vote from the active map and finalize the PVF checking according // to the outcome. diff --git a/runtime/parachains/src/paras/tests.rs b/runtime/parachains/src/paras/tests.rs index f110e2a0d38a..9e9023837b26 100644 --- a/runtime/parachains/src/paras/tests.rs +++ b/runtime/parachains/src/paras/tests.rs @@ -1252,8 +1252,21 @@ fn pvf_check_upgrade_reject() { Paras::schedule_code_upgrade(a, new_code.clone(), RELAY_PARENT, &Configuration::config()); check_code_is_stored(&new_code); - // Supermajority of validators vote against `new_code`. PVF should be rejected. - IntoIterator::into_iter([0, 1, 2, 3]) + // 1/3 of validators vote against `new_code`. PVF should not be rejected yet. + IntoIterator::into_iter([0]) + .map(|i| PvfCheckStatement { + accept: false, + subject: new_code.hash(), + session_index: EXPECTED_SESSION, + validator_index: i.into(), + }) + .for_each(sign_and_include_pvf_check_statement); + + // Verify that the new code is not yet discarded. + check_code_is_stored(&new_code); + + // >1/3 of validators vote against `new_code`. PVF should be rejected. + IntoIterator::into_iter([1]) .map(|i| PvfCheckStatement { accept: false, subject: new_code.hash(), From 79f16778fc0cb1476575019b7e2ca75fc64279ac Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 2 Jan 2023 14:20:10 -0500 Subject: [PATCH 03/13] Make the `quorum` reject condition a bit more clear semantically --- runtime/parachains/src/paras/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/parachains/src/paras/mod.rs b/runtime/parachains/src/paras/mod.rs index 3d22324079c4..91d105b60f88 100644 --- a/runtime/parachains/src/paras/mod.rs +++ b/runtime/parachains/src/paras/mod.rs @@ -454,7 +454,7 @@ impl PvfCheckActiveVoteState { /// Returns `None` if the quorum is not reached, or the direction of the decision. 
fn quorum(&self, n_validators: usize) -> Option { let accept_threshold = primitives::v2::supermajority_threshold(n_validators); - let reject_threshold = primitives::v2::byzantine_threshold(n_validators); + let reject_threshold = n_validators - accept_threshold; if self.votes_accept.count_ones() >= accept_threshold { Some(PvfCheckOutcome::Accepted) From 978e992cb05fe85dd2e31adf1ae0ad3c2d68279f Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 2 Jan 2023 14:21:27 -0500 Subject: [PATCH 04/13] Add comment --- runtime/parachains/src/paras/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/runtime/parachains/src/paras/mod.rs b/runtime/parachains/src/paras/mod.rs index 91d105b60f88..7017eaeae7d4 100644 --- a/runtime/parachains/src/paras/mod.rs +++ b/runtime/parachains/src/paras/mod.rs @@ -454,6 +454,7 @@ impl PvfCheckActiveVoteState { /// Returns `None` if the quorum is not reached, or the direction of the decision. fn quorum(&self, n_validators: usize) -> Option { let accept_threshold = primitives::v2::supermajority_threshold(n_validators); + // At this threshold, a supermajority is no longer possible, so we reject. let reject_threshold = n_validators - accept_threshold; if self.votes_accept.count_ones() >= accept_threshold { From 3675052282542387521e0d67cc624d161eeee136 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 2 Jan 2023 14:28:47 -0500 Subject: [PATCH 05/13] Update implementer's guide --- roadmap/implementers-guide/src/pvf-prechecking.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/roadmap/implementers-guide/src/pvf-prechecking.md b/roadmap/implementers-guide/src/pvf-prechecking.md index 4dce61d2a83b..fd2ca12330bd 100644 --- a/roadmap/implementers-guide/src/pvf-prechecking.md +++ b/roadmap/implementers-guide/src/pvf-prechecking.md @@ -29,9 +29,9 @@ The problem is solved by having a pre-checking process which is run when a new v Before any of those operations finish, the PVF pre-checking vote is initiated. 
The PVF pre-checking vote is identified by the PVF code hash that is being voted on. If there is already PVF pre-checking process running, then no new PVF pre-checking vote will be started. Instead, the operation just subscribes to the existing vote. -The pre-checking vote can be concluded either by obtaining a supermajority or if it expires. +The pre-checking vote can be concluded either by obtaining a threshold of votes for a decision, or if it expires. The threshold to accept is a supermajority of 2/3 of validators. We reject once a supermajority is no longer possible. -Each validator checks the list of PVFs available for voting. The vote is binary, i.e. accept or reject a given PVF. As soon as the supermajority of votes are collected for one of the sides of the vote, the voting is concluded in that direction and the effects of the voting are enacted. +Each validator checks the list of PVFs available for voting. The vote is binary, i.e. accept or reject a given PVF. As soon as the threshold of votes is reached for one of the sides of the vote, the voting is concluded in that direction and the effects of the voting are enacted. Only validators from the active set can participate in the vote. The set of active validators can change each session. That's why we reset the votes each session. A voting that observed a certain number of sessions will be rejected. From baed8f747bbc66b6501854e0a3a2226ae26de603 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Tue, 3 Jan 2023 07:26:56 -0500 Subject: [PATCH 06/13] Update a link Not related to the rest of the PR, but I randomly noticed and fixed this.
--- node/collation-generation/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node/collation-generation/src/lib.rs b/node/collation-generation/src/lib.rs index 500b500636ba..a09d7d8aa09c 100644 --- a/node/collation-generation/src/lib.rs +++ b/node/collation-generation/src/lib.rs @@ -177,7 +177,7 @@ async fn handle_new_activations( sender: &mpsc::Sender, ) -> crate::error::Result<()> { // follow the procedure from the guide: - // https://w3f.github.io/parachain-implementers-guide/node/collators/collation-generation.html + // https://paritytech.github.io/polkadot/book/node/collators/collation-generation.html let _overall_timer = metrics.time_new_activations(); From 2806d886de89814ee7c0979a9e216ca2bec6a5db Mon Sep 17 00:00:00 2001 From: Marcin S Date: Tue, 3 Jan 2023 09:38:39 -0500 Subject: [PATCH 07/13] Update runtime/parachains/src/paras/tests.rs Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com> --- runtime/parachains/src/paras/tests.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/runtime/parachains/src/paras/tests.rs b/runtime/parachains/src/paras/tests.rs index 9e9023837b26..b020b6badf39 100644 --- a/runtime/parachains/src/paras/tests.rs +++ b/runtime/parachains/src/paras/tests.rs @@ -1253,14 +1253,12 @@ fn pvf_check_upgrade_reject() { check_code_is_stored(&new_code); // 1/3 of validators vote against `new_code`. PVF should not be rejected yet. - IntoIterator::into_iter([0]) - .map(|i| PvfCheckStatement { - accept: false, - subject: new_code.hash(), - session_index: EXPECTED_SESSION, - validator_index: i.into(), - }) - .for_each(sign_and_include_pvf_check_statement); + sign_and_include_pvf_check_statement(PvfCheckStatement { + accept: false, + subject: new_code.hash(), + session_index: EXPECTED_SESSION, + validator_index: 0.into(), + }); // Verify that the new code is not yet discarded. 
check_code_is_stored(&new_code); From f05224d3691d74624f5305218761b05d742b8387 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Tue, 3 Jan 2023 09:43:25 -0500 Subject: [PATCH 08/13] Remove unneeded loop --- runtime/parachains/src/paras/tests.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/runtime/parachains/src/paras/tests.rs b/runtime/parachains/src/paras/tests.rs index b020b6badf39..27ef7b225ef3 100644 --- a/runtime/parachains/src/paras/tests.rs +++ b/runtime/parachains/src/paras/tests.rs @@ -1264,14 +1264,12 @@ fn pvf_check_upgrade_reject() { check_code_is_stored(&new_code); // >1/3 of validators vote against `new_code`. PVF should be rejected. - IntoIterator::into_iter([1]) - .map(|i| PvfCheckStatement { - accept: false, - subject: new_code.hash(), - session_index: EXPECTED_SESSION, - validator_index: i.into(), - }) - .for_each(sign_and_include_pvf_check_statement); + sign_and_include_pvf_check_statement(PvfCheckStatement { + accept: false, + subject: new_code.hash(), + session_index: EXPECTED_SESSION, + validator_index: 1.into(), + }); // Verify that the new code is discarded. check_code_is_not_stored(&new_code); From db1a82263e393c4f68822f3f241c75e5ab8ac2eb Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 5 Jan 2023 06:29:31 -0500 Subject: [PATCH 09/13] Log PVF retries using `info!` --- node/core/candidate-validation/src/lib.rs | 2 +- node/core/pvf/src/host.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs index 2df00ff3c048..2822753dffb6 100644 --- a/node/core/candidate-validation/src/lib.rs +++ b/node/core/candidate-validation/src/lib.rs @@ -633,7 +633,7 @@ trait ValidationBackend { // Wait a brief delay before retrying. futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await; - gum::debug!( + gum::info!( target: LOG_TARGET, ?pvf, "Re-trying failed candidate validation due to AmbiguousWorkerDeath." 
diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 96aed4eae7a8..366f7e2232e6 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -525,7 +525,7 @@ async fn handle_execute_pvf( }, ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => { if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) { - gum::debug!( + gum::info!( target: LOG_TARGET, ?pvf, ?artifact_id, @@ -595,7 +595,7 @@ async fn handle_heads_up( }, ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => { if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) { - gum::debug!( + gum::info!( target: LOG_TARGET, ?active_pvf, ?artifact_id, From 30ec605d1bedcb7ee951e22c95be9c737e80986f Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 5 Jan 2023 07:04:01 -0500 Subject: [PATCH 10/13] Change retry logs to `warn!` and add preparation failure log --- node/core/candidate-validation/src/lib.rs | 2 +- node/core/pvf/src/host.rs | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs index 2822753dffb6..35edaa34524c 100644 --- a/node/core/candidate-validation/src/lib.rs +++ b/node/core/candidate-validation/src/lib.rs @@ -633,7 +633,7 @@ trait ValidationBackend { // Wait a brief delay before retrying. futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await; - gum::info!( + gum::warn!( target: LOG_TARGET, ?pvf, "Re-trying failed candidate validation due to AmbiguousWorkerDeath." 
diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 366f7e2232e6..7e55242faf0d 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -525,7 +525,7 @@ async fn handle_execute_pvf( }, ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => { if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) { - gum::info!( + gum::warn!( target: LOG_TARGET, ?pvf, ?artifact_id, @@ -595,7 +595,7 @@ async fn handle_heads_up( }, ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => { if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) { - gum::info!( + gum::warn!( target: LOG_TARGET, ?active_pvf, ?artifact_id, @@ -723,10 +723,19 @@ async fn handle_prepare_done( *state = match result { Ok(cpu_time_elapsed) => ArtifactState::Prepared { last_time_needed: SystemTime::now(), cpu_time_elapsed }, - Err(error) => ArtifactState::FailedToProcess { - last_time_failed: SystemTime::now(), - num_failures: *num_failures + 1, - error, + Err(error) => { + let last_time_failed = SystemTime::now(); + let num_failures = *num_failures + 1; + + gum::warn!( + target: LOG_TARGET, + ?artifact_id, + time_failed = ?last_time_failed, + %num_failures, + "artifact preparation failed: {}", + error + ); + ArtifactState::FailedToProcess { last_time_failed, num_failures, error } }, }; From c94135b477bf610bcf092fe2b10eba5ce497979f Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 5 Jan 2023 07:20:18 -0500 Subject: [PATCH 11/13] Log PVF execution failure --- node/core/pvf/src/execute/queue.rs | 44 ++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/node/core/pvf/src/execute/queue.rs b/node/core/pvf/src/execute/queue.rs index 72b6e450351b..02957fea1b5d 100644 --- a/node/core/pvf/src/execute/queue.rs +++ b/node/core/pvf/src/execute/queue.rs @@ -224,32 +224,48 @@ fn handle_job_finish( artifact_id: ArtifactId, result_tx: ResultSender, ) { - let 
(idle_worker, result) = match outcome { - Outcome::Ok { result_descriptor, duration: _, idle_worker } => { + let (idle_worker, result, duration) = match outcome { + Outcome::Ok { result_descriptor, duration, idle_worker } => { // TODO: propagate the soft timeout - (Some(idle_worker), Ok(result_descriptor)) + (Some(idle_worker), Ok(result_descriptor), Some(duration)) }, Outcome::InvalidCandidate { err, idle_worker } => ( Some(idle_worker), Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))), + None, ), Outcome::InternalError { err, idle_worker } => - (Some(idle_worker), Err(ValidationError::InternalError(err))), + (Some(idle_worker), Err(ValidationError::InternalError(err)), None), Outcome::HardTimeout => - (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout))), - Outcome::IoErr => - (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))), + (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None), + Outcome::IoErr => ( + None, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)), + None, + ), }; queue.metrics.execute_finished(); - gum::debug!( - target: LOG_TARGET, - validation_code_hash = ?artifact_id.code_hash, - ?worker, - worker_rip = idle_worker.is_none(), - "execute worker concluded", - ); + if let Err(ref err) = result { + gum::warn!( + target: LOG_TARGET, + ?artifact_id, + ?worker, + worker_rip = idle_worker.is_none(), + "execution worker concluded, error occurred: {:?}", + err + ); + } else { + gum::debug!( + target: LOG_TARGET, + ?artifact_id, + ?worker, + worker_rip = idle_worker.is_none(), + ?duration, + "execute worker concluded successfully", + ); + } // First we send the result. It may fail due to the other end of the channel being dropped, // that's legitimate and we don't treat that as an error. 
From 876daa67478fb6440812bf27f87fc61846eae7b6 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 9 Jan 2023 07:45:40 -0500 Subject: [PATCH 12/13] Clarify why we reject failed PVFs --- node/core/pvf-checker/src/lib.rs | 3 +++ .../src/node/utility/pvf-prechecker.md | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/node/core/pvf-checker/src/lib.rs b/node/core/pvf-checker/src/lib.rs index 2a428fe7baf0..118bbddccf76 100644 --- a/node/core/pvf-checker/src/lib.rs +++ b/node/core/pvf-checker/src/lib.rs @@ -193,6 +193,9 @@ async fn handle_pvf_check( // Always vote against in case of failures. Voting against a PVF when encountering a // timeout (or an unlikely node-specific issue) can be considered safe, since // there is no slashing for being on the wrong side on a pre-check vote. + // + // Also, by being more strict here, we can safely be more lenient during preparation and + // avoid the risk of getting slashed there. gum::info!( target: LOG_TARGET, ?validation_code_hash, diff --git a/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md b/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md index 7320bf9f07e8..90193ec00e18 100644 --- a/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md +++ b/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md @@ -18,14 +18,23 @@ When a PVF just becomes relevant, the subsystem will send a message to the [Cand Upon receving a message from the candidate-validation subsystem, the pre-checker will note down that the PVF has its judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case, a judgement was received for a PVF that is no longer in view it is ignored. -It is possible that the candidate validation was not able to check the PVF, e.g. if it timed out. In that case, the PVF pre-checker will vote against it. 
This is considered safe, as there is no slashing for being on the wrong side of a pre-check vote. - Since a vote only is valid during [one session][overview], the subsystem will have to resign and submit the statements for the new session. The new session is assumed to be started if at least one of the leaves has a greater session index that was previously observed in any of the leaves. The subsystem tracks all the statements that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session. If the node is not in the active validator set, it will still perform all the checks. However, it will only submit the check statements when the node is in the active validator set. +### Rejecting failed PVFs + +It is possible that the candidate validation was not able to check the PVF, e.g. if it timed out. In that case, the PVF pre-checker will vote against it. This is considered safe, as there is no slashing for being on the wrong side of a pre-check vote. + +Rejecting instead of abstaining is better in several ways: + +1. Conclusion is reached faster - we have actual votes, instead of relying on a timeout. +1. Being strict in pre-checking makes it safer to be more lenient in preparation errors afterwards. Hence we have more leeway in avoiding raising dubious disputes, without making things less secure. + +Also, if we only abstain, an attacker can specially craft a PVF wasm blob so that it will fail on e.g. 50% of the validators. In that case a supermajority will never be reached and the vote will repeat multiple times, most likely with the same result (since all votes are cleared on a session change). This is avoided by rejecting failed PVFs, and by only requiring more than 1/3 of validators to reject a PVF to reach a decision.
+ [overview]: ../../pvf-prechecking.md [Runtime API]: runtime-api.md [PVF pre-checking runtime API]: ../../runtime-api/pvf-prechecking.md From 1d5753b1ed666843806d4c8ed54312f395afffd8 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 11 Jan 2023 13:32:09 -0500 Subject: [PATCH 13/13] Fix PVF reject runtime benchmarks --- runtime/parachains/src/paras/benchmarking/pvf_check.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/runtime/parachains/src/paras/benchmarking/pvf_check.rs b/runtime/parachains/src/paras/benchmarking/pvf_check.rs index 7c7d1ceecbeb..a16c016531ee 100644 --- a/runtime/parachains/src/paras/benchmarking/pvf_check.rs +++ b/runtime/parachains/src/paras/benchmarking/pvf_check.rs @@ -170,7 +170,8 @@ where } /// Generates a list of votes combined with signatures for the active validator set. The number of -/// votes is equal to the minimum number of votes required to reach the supermajority. +/// votes is equal to the minimum number of votes required to reach the threshold for either accept +/// or reject. fn generate_statements( vote_outcome: VoteOutcome, ) -> impl Iterator @@ -179,7 +180,11 @@ where { let validators = ParasShared::::active_validator_keys(); - let required_votes = primitives::supermajority_threshold(validators.len()); + let accept_threshold = primitives::supermajority_threshold(validators.len()); + let required_votes = match vote_outcome { + VoteOutcome::Accept => accept_threshold, + VoteOutcome::Reject => validators.len() - accept_threshold, + }; (0..required_votes).map(move |validator_index| { let stmt = PvfCheckStatement { accept: vote_outcome == VoteOutcome::Accept,