From a779892f8ee2a072ac9676d7949ac4539f630ecd Mon Sep 17 00:00:00 2001 From: Marcin S Date: Thu, 12 Jan 2023 04:24:42 -0500 Subject: [PATCH] pre-checking: Reject failed PVFs (#6492) * pre-checking: Reject failed PVFs * paras: immediately reject any PVF that cannot reach a supermajority * Make the `quorum` reject condition a bit more clear semantically * Add comment * Update implementer's guide * Update a link Not related to the rest of the PR, but I randomly noticed and fixed this. * Update runtime/parachains/src/paras/tests.rs Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com> * Remove unneeded loop * Log PVF retries using `info!` * Change retry logs to `warn!` and add preparation failure log * Log PVF execution failure * Clarify why we reject failed PVFs * Fix PVF reject runtime benchmarks Co-authored-by: s0me0ne-unkn0wn <48632512+s0me0ne-unkn0wn@users.noreply.github.com> --- node/collation-generation/src/lib.rs | 2 +- node/core/candidate-validation/src/lib.rs | 5 +-- node/core/pvf-checker/src/lib.rs | 12 ++--- node/core/pvf-checker/src/tests.rs | 25 +++++++---- node/core/pvf/src/execute/queue.rs | 44 +++++++++++++------ node/core/pvf/src/host.rs | 22 +++++----- .../src/node/utility/pvf-prechecker.md | 17 +++++-- .../implementers-guide/src/pvf-prechecking.md | 4 +- .../src/paras/benchmarking/pvf_check.rs | 9 +++- runtime/parachains/src/paras/mod.rs | 33 +++++++------- runtime/parachains/src/paras/tests.rs | 27 ++++++++---- 11 files changed, 126 insertions(+), 74 deletions(-) diff --git a/node/collation-generation/src/lib.rs b/node/collation-generation/src/lib.rs index ab325f12d251..b3d728f70a48 100644 --- a/node/collation-generation/src/lib.rs +++ b/node/collation-generation/src/lib.rs @@ -177,7 +177,7 @@ async fn handle_new_activations( sender: &mpsc::Sender, ) -> crate::error::Result<()> { // follow the procedure from the guide: - // https://w3f.github.io/parachain-implementers-guide/node/collators/collation-generation.html + // https://paritytech.github.io/polkadot/book/node/collators/collation-generation.html let _overall_timer = metrics.time_new_activations(); diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs index dac5e0a4fe53..58d91946830f 100644 --- a/node/core/candidate-validation/src/lib.rs +++ b/node/core/candidate-validation/src/lib.rs @@ -295,8 +295,7 @@ where _ => { // The reasoning why this is "failed" and not invalid is because we assume that // during pre-checking voting the relay-chain will pin the code. In case the code - // actually is not there, we issue failed since this looks more like a bug. This - // leads to us abstaining. + // actually is not there, we issue failed since this looks more like a bug. gum::warn!( target: LOG_TARGET, ?relay_parent, @@ -635,7 +634,7 @@ trait ValidationBackend { // Wait a brief delay before retrying. futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await; - gum::debug!( + gum::warn!( target: LOG_TARGET, ?pvf, "Re-trying failed candidate validation due to AmbiguousWorkerDeath." diff --git a/node/core/pvf-checker/src/lib.rs b/node/core/pvf-checker/src/lib.rs index 9bfe7ae092b0..7e961dcf4b88 100644 --- a/node/core/pvf-checker/src/lib.rs +++ b/node/core/pvf-checker/src/lib.rs @@ -190,16 +190,18 @@ async fn handle_pvf_check( PreCheckOutcome::Valid => Judgement::Valid, PreCheckOutcome::Invalid => Judgement::Invalid, PreCheckOutcome::Failed => { - // Abstain. + // Always vote against in case of failures. 
Voting against a PVF when encountering a
+			// timeout (or an unlikely node-specific issue) can be considered safe, since
+			// there is no slashing for being on the wrong side of a pre-check vote.
 			//
-			// Returning here will leave the PVF in the view dangling. Since it is there, no new
-			// pre-checking request will be sent.
+			// Also, by being more strict here, we can safely be more lenient during preparation and
+			// avoid the risk of getting slashed there.
 			gum::info!(
 				target: LOG_TARGET,
 				?validation_code_hash,
-				"Pre-check failed, abstaining from voting",
+				"Pre-check failed, voting against",
 			);
-			return
+			Judgement::Invalid
 		},
 	};
diff --git a/node/core/pvf-checker/src/tests.rs b/node/core/pvf-checker/src/tests.rs
index 6ebd636f6fb0..e1b9b8a5b0db 100644
--- a/node/core/pvf-checker/src/tests.rs
+++ b/node/core/pvf-checker/src/tests.rs
@@ -157,7 +157,7 @@ impl TestState {
 		self.active_leaves_update(handle, Some(leaf), None, &[]).await
 	}
 
-	async fn deactive_leaves(
+	async fn deactivate_leaves(
 		&mut self,
 		handle: &mut VirtualOverseer,
 		deactivated: impl IntoIterator<Item = &Hash>,
@@ -894,8 +894,8 @@ fn unexpected_pvf_check_judgement() {
 		// Catch the pre-check request, but don't reply just yet.
 		let pre_check = test_state.expect_candidate_precheck(&mut handle).await;
 
-		// Now deactive the leaf and reply to the precheck request.
-		test_state.deactive_leaves(&mut handle, &[block_1.block_hash]).await;
+		// Now deactivate the leaf and reply to the precheck request.
+		test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
 		pre_check.reply(PreCheckOutcome::Invalid);
 
 		// the subsystem must remain silent.
@@ -906,14 +906,17 @@
 	});
 }
 
+// Check that we do not abstain for a nondeterministic failure. Currently, this means the behavior
+// is the same as if the pre-check returned `PreCheckOutcome::Invalid`.
 #[test]
-fn abstain_for_nondeterministic_pvfcheck_failure() {
+fn dont_abstain_for_nondeterministic_pvfcheck_failure() {
 	test_harness(|mut test_state, mut handle| {
 		async move {
+			let block_1 = FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]);
 			test_state
 				.activate_leaf_with_session(
 					&mut handle,
-					FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]),
+					block_1.clone(),
 					StartsNewSession { session_index: 2, validators: vec![OUR_VALIDATOR] },
 				)
 				.await;
@@ -922,10 +925,14 @@
 			test_state.expect_session_for_child(&mut handle).await;
 			test_state.expect_validators(&mut handle).await;
 
-			test_state
-				.expect_candidate_precheck(&mut handle)
-				.await
-				.reply(PreCheckOutcome::Failed);
+			// Catch the pre-check request, but don't reply just yet.
+			let pre_check = test_state.expect_candidate_precheck(&mut handle).await;
+
+			// Now deactivate the leaf and reply to the precheck request.
+			test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
+			pre_check.reply(PreCheckOutcome::Failed);
+
+			// the subsystem must remain silent.
test_state.send_conclude(&mut handle).await;
 		}
diff --git a/node/core/pvf/src/execute/queue.rs b/node/core/pvf/src/execute/queue.rs
index f2f1b4e0cfff..c20099b0e798 100644
--- a/node/core/pvf/src/execute/queue.rs
+++ b/node/core/pvf/src/execute/queue.rs
@@ -223,32 +223,48 @@ fn handle_job_finish(
 	artifact_id: ArtifactId,
 	result_tx: ResultSender,
 ) {
-	let (idle_worker, result) = match outcome {
-		Outcome::Ok { result_descriptor, duration: _, idle_worker } => {
+	let (idle_worker, result, duration) = match outcome {
+		Outcome::Ok { result_descriptor, duration, idle_worker } => {
 			// TODO: propagate the soft timeout
-			(Some(idle_worker), Ok(result_descriptor))
+			(Some(idle_worker), Ok(result_descriptor), Some(duration))
 		},
 		Outcome::InvalidCandidate { err, idle_worker } => (
 			Some(idle_worker),
 			Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
+			None,
 		),
 		Outcome::InternalError { err, idle_worker } =>
-			(Some(idle_worker), Err(ValidationError::InternalError(err))),
+			(Some(idle_worker), Err(ValidationError::InternalError(err)), None),
 		Outcome::HardTimeout =>
-			(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout))),
-		Outcome::IoErr =>
-			(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))),
+			(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
+		Outcome::IoErr => (
+			None,
+			Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
+			None,
+		),
 	};
 
 	queue.metrics.execute_finished();
-	gum::debug!(
-		target: LOG_TARGET,
-		validation_code_hash = ?artifact_id.code_hash,
-		?worker,
-		worker_rip = idle_worker.is_none(),
-		"execute worker concluded",
-	);
+	if let Err(ref err) = result {
+		gum::warn!(
+			target: LOG_TARGET,
+			?artifact_id,
+			?worker,
+			worker_rip = idle_worker.is_none(),
+			"execute worker concluded with an error: {:?}",
+			err
+		);
+	} else {
+		gum::debug!(
+			target: LOG_TARGET,
+			?artifact_id,
+			?worker,
+			worker_rip = idle_worker.is_none(),
+			?duration,
+			"execute worker concluded successfully",
+		);
+	}
 
 	// First we send the result. It may fail due to the other end of the channel being dropped;
 	// that's legitimate and we don't treat that as an error.
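The crux of the pvf-checker change above is the new outcome-to-judgement mapping: a `Failed` pre-check outcome now yields `Judgement::Invalid` instead of an abstention. Below is a minimal, self-contained sketch of that mapping; the enums are simplified stand-ins for the subsystem types of the same names, not the actual subsystem API.

```rust
// Simplified stand-ins for the subsystem's `PreCheckOutcome` and `Judgement`.
#[derive(Debug, PartialEq)]
enum PreCheckOutcome {
	Valid,
	Invalid,
	// Nondeterministic failure, e.g. a preparation timeout.
	Failed,
}

#[derive(Debug, PartialEq)]
enum Judgement {
	Valid,
	Invalid,
}

// Before this PR, `Failed` produced no judgement at all (an abstention).
// Now every outcome maps to a vote; `Failed` votes against, which is safe
// because there is no slashing for the wrong side of a pre-check vote.
fn to_judgement(outcome: PreCheckOutcome) -> Judgement {
	match outcome {
		PreCheckOutcome::Valid => Judgement::Valid,
		PreCheckOutcome::Invalid | PreCheckOutcome::Failed => Judgement::Invalid,
	}
}

fn main() {
	assert_eq!(to_judgement(PreCheckOutcome::Failed), Judgement::Invalid);
	assert_eq!(to_judgement(PreCheckOutcome::Valid), Judgement::Valid);
}
```

Since every outcome now produces a vote, the on-chain tally can always reach a quorum instead of waiting for the vote to time out.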
diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs
index 6b51b8cc1351..3514aff1d896 100644
--- a/node/core/pvf/src/host.rs
+++ b/node/core/pvf/src/host.rs
@@ -525,7 +525,7 @@ async fn handle_execute_pvf(
 		},
 		ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
 			if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
-				gum::debug!(
+				gum::warn!(
 					target: LOG_TARGET,
 					?pvf,
 					?artifact_id,
@@ -599,7 +599,7 @@ async fn handle_heads_up(
 		},
 		ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
 			if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
-				gum::debug!(
+				gum::warn!(
 					target: LOG_TARGET,
 					?active_pvf,
 					?artifact_id,
@@ -728,18 +728,18 @@ async fn handle_prepare_done(
 			Ok(cpu_time_elapsed) =>
 				ArtifactState::Prepared { last_time_needed: SystemTime::now(), cpu_time_elapsed },
 			Err(error) => {
-				gum::debug!(
+				let last_time_failed = SystemTime::now();
+				let num_failures = *num_failures + 1;
+
+				gum::warn!(
 					target: LOG_TARGET,
-					artifact_id = ?artifact_id,
-					num_failures = ?num_failures,
-					"Failed to process artifact: {}",
+					?artifact_id,
+					time_failed = ?last_time_failed,
+					%num_failures,
+					"artifact preparation failed: {}",
 					error
 				);
-				ArtifactState::FailedToProcess {
-					last_time_failed: SystemTime::now(),
-					num_failures: *num_failures + 1,
-					error,
-				}
+				ArtifactState::FailedToProcess { last_time_failed, num_failures, error }
 			},
 		};
diff --git a/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md b/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md
index fd75ce9e3804..90193ec00e18 100644
--- a/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md
+++ b/roadmap/implementers-guide/src/node/utility/pvf-prechecker.md
@@ -16,14 +16,25 @@ To be relevant for the subsystem, a PVF must be returned by the [`pvfs_require_p
 When a PVF just becomes relevant, the subsystem will send a message to the [Candidate Validation] subsystem asking for the pre-check.
 
-Upon receving a message from the candidate-validation subsystem, the pre-checker will note down that the PVF has its judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case, a judgement was received for a PVF that is no longer in view it is ignored. It is possible that the candidate validation was not able to check the PVF. In that case, the PVF pre-checker will abstain and won't submit any check statements.
+Upon receiving a message from the candidate-validation subsystem, the pre-checker will note down the PVF's judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case a judgement was received for a PVF that is no longer in view, it is ignored.
 
-Since a vote only is valid during [one session][overview], the subsystem will have to resign and submit the statements for for the new session. The new session is assumed to be started if at least one of the leaves has a greater session index that was previously observed in any of the leaves.
+Since a vote is only valid during [one session][overview], the subsystem will have to re-sign and submit the statements for the new session. The new session is assumed to have started if at least one of the leaves has a greater session index than was previously observed in any of the leaves.
-The subsystem tracks all the statement that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session.
+The subsystem tracks all the statements that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session.
 
 If the node is not in the active validator set, it will still perform all the checks. However, it will only submit the check statements when the node is in the active validator set.
 
+### Rejecting failed PVFs
+
+It is possible that the candidate validation was not able to check the PVF, e.g. if it timed out. In that case, the PVF pre-checker will vote against it. This is considered safe, as there is no slashing for being on the wrong side of a pre-check vote.
+
+Rejecting instead of abstaining is better in several ways:
+
+1. A conclusion is reached faster - we have actual votes instead of relying on a timeout.
+1. Being strict in pre-checking makes it safer to be more lenient with preparation errors afterwards. Hence we have more leeway to avoid raising dubious disputes, without making things less secure.
+
+Also, if we only abstain, an attacker can specially craft a PVF wasm blob so that it will fail on e.g. 50% of the validators. In that case a supermajority will never be reached and the vote will repeat multiple times, most likely with the same result (since all votes are cleared on a session change). This is avoided by rejecting failed PVFs, and by requiring only a vote of more than 1/3 of validators against a PVF to reach a decision.
+
 [overview]: ../../pvf-prechecking.md
 [Runtime API]: runtime-api.md
 [PVF pre-checking runtime API]: ../../runtime-api/pvf-prechecking.md
diff --git a/roadmap/implementers-guide/src/pvf-prechecking.md b/roadmap/implementers-guide/src/pvf-prechecking.md
index 4dce61d2a83b..fd2ca12330bd 100644
--- a/roadmap/implementers-guide/src/pvf-prechecking.md
+++ b/roadmap/implementers-guide/src/pvf-prechecking.md
@@ -29,9 +29,9 @@ The problem is solved by having a pre-checking process which is run when a new v
 
 Before any of those operations finish, the PVF pre-checking vote is initiated. The PVF pre-checking vote is identified by the PVF code hash that is being voted on. If there is already PVF pre-checking process running, then no new PVF pre-checking vote will be started. Instead, the operation just subscribes to the existing vote.
 
-The pre-checking vote can be concluded either by obtaining a supermajority or if it expires.
+The pre-checking vote can be concluded either by reaching the vote threshold for one of the sides, or by expiring. The threshold to accept is a supermajority of 2/3 of validators. We reject once a supermajority is no longer possible.
 
-Each validator checks the list of PVFs available for voting. The vote is binary, i.e. accept or reject a given PVF. As soon as the supermajority of votes are collected for one of the sides of the vote, the voting is concluded in that direction and the effects of the voting are enacted.
+Each validator checks the list of PVFs available for voting. The vote is binary, i.e. accept or reject a given PVF. As soon as the threshold of votes is reached for one of the sides of the vote, the voting is concluded in that direction and the effects of the voting are enacted.
 
 Only validators from the active set can participate in the vote. 
The set of active validators can change each session. That's why we reset the votes each session. A voting that observed a certain number of sessions will be rejected.
diff --git a/runtime/parachains/src/paras/benchmarking/pvf_check.rs b/runtime/parachains/src/paras/benchmarking/pvf_check.rs
index 7c7d1ceecbeb..a16c016531ee 100644
--- a/runtime/parachains/src/paras/benchmarking/pvf_check.rs
+++ b/runtime/parachains/src/paras/benchmarking/pvf_check.rs
@@ -170,7 +170,8 @@ where
 }
 
 /// Generates a list of votes combined with signatures for the active validator set. The number of
-/// votes is equal to the minimum number of votes required to reach the supermajority.
+/// votes is equal to the minimum number of votes required to reach the threshold for either accept
+/// or reject.
 fn generate_statements<T>(
 	vote_outcome: VoteOutcome,
 ) -> impl Iterator
@@ -179,7 +180,11 @@ where
 {
 	let validators = ParasShared::<T>::active_validator_keys();
 
-	let required_votes = primitives::supermajority_threshold(validators.len());
+	let accept_threshold = primitives::supermajority_threshold(validators.len());
+	let required_votes = match vote_outcome {
+		VoteOutcome::Accept => accept_threshold,
+		VoteOutcome::Reject => validators.len() - accept_threshold,
+	};
 	(0..required_votes).map(move |validator_index| {
 		let stmt = PvfCheckStatement {
 			accept: vote_outcome == VoteOutcome::Accept,
diff --git a/runtime/parachains/src/paras/mod.rs b/runtime/parachains/src/paras/mod.rs
index ca442e09df44..f372a88713e6 100644
--- a/runtime/parachains/src/paras/mod.rs
+++ b/runtime/parachains/src/paras/mod.rs
@@ -73,13 +73,14 @@
 //!
 //! # PVF Pre-checking
 //!
-//! As was mentioned above, a brand new validation code should go through a process of approval.
-//! As part of this process, validators from the active set will take the validation code and
-//! check if it is malicious. Once they did that and have their judgement, either accept or reject,
-//! they issue a statement in a form of an unsigned extrinsic. This extrinsic is processed by this
-//! pallet. Once supermajority is gained for accept, then the process that initiated the check
-//! is resumed (as mentioned before this can be either upgrading of validation code or onboarding).
-//! If supermajority is gained for reject, then the process is canceled.
+//! As was mentioned above, a brand new validation code should go through a process of approval. As
+//! part of this process, validators from the active set will take the validation code and check if
+//! it is malicious. Once they have done that and formed their judgement, either accept or reject,
+//! they issue a statement in the form of an unsigned extrinsic. This extrinsic is processed by
+//! this pallet. Once a supermajority is gained for accept, the process that initiated the check is
+//! resumed (as mentioned before, this can be either upgrading of validation code or onboarding).
+//! If getting a supermajority becomes impossible (>1/3 of validators have already voted against),
+//! then we reject.
 //!
 //! Below is a state diagram that depicts states of a single PVF pre-checking vote.
 //!
@@ -92,8 +93,8 @@
 //! │ │ │
 //! ┌───────┐
 //! │ │
-//! └─▶│ init │────supermajority ┌──────────┐
-//! │ │ against │ │
+//! └─▶│ init │──── >1/3 against ┌──────────┐
+//! │ │ │ │ │
 //! └───────┘ └──────────▶│ rejected │
 //! ▲ │ │ │
 //! │ │ session └──────────┘
@@ -452,12 +453,14 @@ impl PvfCheckActiveVoteState {
 	/// Returns `None` if the quorum is not reached, or the direction of the decision.
	fn quorum(&self, n_validators: usize) -> Option<PvfCheckOutcome> {
-		let q_threshold = primitives::supermajority_threshold(n_validators);
-		// NOTE: counting the reject votes is deliberately placed first. This is to err on the safe.
-		if self.votes_reject.count_ones() >= q_threshold {
-			Some(PvfCheckOutcome::Rejected)
-		} else if self.votes_accept.count_ones() >= q_threshold {
+		let accept_threshold = primitives::supermajority_threshold(n_validators);
+		// Above this threshold, a supermajority is no longer possible, so we reject.
+		let reject_threshold = n_validators - accept_threshold;
+
+		if self.votes_accept.count_ones() >= accept_threshold {
 			Some(PvfCheckOutcome::Accepted)
+		} else if self.votes_reject.count_ones() > reject_threshold {
+			Some(PvfCheckOutcome::Rejected)
 		} else {
 			None
 		}
@@ -1011,7 +1014,7 @@ pub mod pallet {
 		}
 
 		if let Some(outcome) = active_vote.quorum(validators.len()) {
-			// The supermajority quorum has been achieved.
+			// The quorum has been achieved.
 			//
 			// Remove the PVF vote from the active map and finalize the PVF checking according
 			// to the outcome.
diff --git a/runtime/parachains/src/paras/tests.rs b/runtime/parachains/src/paras/tests.rs
index 728eb90429a6..df639b495fb9 100644
--- a/runtime/parachains/src/paras/tests.rs
+++ b/runtime/parachains/src/paras/tests.rs
@@ -1249,15 +1249,24 @@ fn pvf_check_upgrade_reject() {
 		Paras::schedule_code_upgrade(a, new_code.clone(), RELAY_PARENT, &Configuration::config());
 		check_code_is_stored(&new_code);
 
-		// Supermajority of validators vote against `new_code`. PVF should be rejected.
-		IntoIterator::into_iter([0, 1, 2, 3])
-			.map(|i| PvfCheckStatement {
-				accept: false,
-				subject: new_code.hash(),
-				session_index: EXPECTED_SESSION,
-				validator_index: i.into(),
-			})
-			.for_each(sign_and_include_pvf_check_statement);
+		// 1/3 of validators vote against `new_code`. PVF should not be rejected yet.
+		sign_and_include_pvf_check_statement(PvfCheckStatement {
+			accept: false,
+			subject: new_code.hash(),
+			session_index: EXPECTED_SESSION,
+			validator_index: 0.into(),
+		});
+
+		// Verify that the new code is not yet discarded.
+		check_code_is_stored(&new_code);
+
+		// >1/3 of validators vote against `new_code`. PVF should be rejected.
+		sign_and_include_pvf_check_statement(PvfCheckStatement {
+			accept: false,
+			subject: new_code.hash(),
+			session_index: EXPECTED_SESSION,
+			validator_index: 1.into(),
+		});
 
 		// Verify that the new code is discarded.
 		check_code_is_not_stored(&new_code);
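The threshold arithmetic behind `quorum` above can be checked with a standalone sketch. It assumes `supermajority_threshold` follows the usual `n - (n - 1) / 3` definition from the primitives crate, and uses plain vote counts in place of the pallet's bitfields:

```rust
#[derive(Debug, PartialEq)]
enum PvfCheckOutcome {
	Accepted,
	Rejected,
}

// Assumed definition, mirroring `primitives::supermajority_threshold`:
// the smallest vote count that is strictly greater than two thirds of `n`.
fn supermajority_threshold(n: usize) -> usize {
	n - n.saturating_sub(1) / 3
}

fn quorum(votes_accept: usize, votes_reject: usize, n_validators: usize) -> Option<PvfCheckOutcome> {
	let accept_threshold = supermajority_threshold(n_validators);
	// Above this threshold, an accepting supermajority is no longer possible.
	let reject_threshold = n_validators - accept_threshold;

	if votes_accept >= accept_threshold {
		Some(PvfCheckOutcome::Accepted)
	} else if votes_reject > reject_threshold {
		Some(PvfCheckOutcome::Rejected)
	} else {
		None
	}
}

fn main() {
	// With 4 validators: accepting needs 3 votes, rejecting needs more than 1.
	assert_eq!(quorum(0, 1, 4), None);
	assert_eq!(quorum(0, 2, 4), Some(PvfCheckOutcome::Rejected));
	assert_eq!(quorum(3, 0, 4), Some(PvfCheckOutcome::Accepted));
}
```

With 4 validators, for instance, a single vote against still leaves an accepting supermajority possible, while a second vote against concludes the vote as rejected: the same shape as the updated `pvf_check_upgrade_reject` test above.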