Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

pre-checking: Reject failed PVFs #6492

Merged
merged 15 commits into from
Jan 12, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion node/collation-generation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ async fn handle_new_activations<Context>(
sender: &mpsc::Sender<overseer::CollationGenerationOutgoingMessages>,
) -> crate::error::Result<()> {
// follow the procedure from the guide:
// https://w3f.github.io/parachain-implementers-guide/node/collators/collation-generation.html
// https://paritytech.github.io/polkadot/book/node/collators/collation-generation.html

let _overall_timer = metrics.time_new_activations();

Expand Down
5 changes: 2 additions & 3 deletions node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,7 @@ where
_ => {
// The reasoning why this is "failed" and not invalid is because we assume that
// during pre-checking voting the relay-chain will pin the code. In case the code
// actually is not there, we issue failed since this looks more like a bug. This
// leads to us abstaining.
// actually is not there, we issue failed since this looks more like a bug.
gum::warn!(
target: LOG_TARGET,
?relay_parent,
Expand Down Expand Up @@ -634,7 +633,7 @@ trait ValidationBackend {
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;

gum::debug!(
gum::warn!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
Expand Down
11 changes: 5 additions & 6 deletions node/core/pvf-checker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,16 +190,15 @@ async fn handle_pvf_check(
PreCheckOutcome::Valid => Judgement::Valid,
PreCheckOutcome::Invalid => Judgement::Invalid,
PreCheckOutcome::Failed => {
// Abstain.
//
// Returning here will leave the PVF in the view dangling. Since it is there, no new
// pre-checking request will be sent.
// Always vote against in case of failures. Voting against a PVF when encountering a
// timeout (or an unlikely node-specific issue) can be considered safe, since
// there is no slashing for being on the wrong side on a pre-check vote.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exactly. Also by being more strict here, we can safely be more lenient during preparation and avoid risk of getting slashed there.

gum::info!(
target: LOG_TARGET,
?validation_code_hash,
"Pre-check failed, abstaining from voting",
"Pre-check failed, voting against",
);
return
Judgement::Invalid
},
};

Expand Down
25 changes: 16 additions & 9 deletions node/core/pvf-checker/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ impl TestState {
self.active_leaves_update(handle, Some(leaf), None, &[]).await
}

async fn deactive_leaves(
async fn deactivate_leaves(
&mut self,
handle: &mut VirtualOverseer,
deactivated: impl IntoIterator<Item = &Hash>,
Expand Down Expand Up @@ -894,8 +894,8 @@ fn unexpected_pvf_check_judgement() {
// Catch the pre-check request, but don't reply just yet.
let pre_check = test_state.expect_candidate_precheck(&mut handle).await;

// Now deactive the leaf and reply to the precheck request.
test_state.deactive_leaves(&mut handle, &[block_1.block_hash]).await;
// Now deactivate the leaf and reply to the precheck request.
test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
pre_check.reply(PreCheckOutcome::Invalid);

// the subsystem must remain silent.
Expand All @@ -906,14 +906,17 @@ fn unexpected_pvf_check_judgement() {
});
}

// Check that we do not abstain for a nondeterministic failure. Currently, this means the behavior
// is the same as if the pre-check returned `PreCheckOutcome::Invalid`.
#[test]
fn abstain_for_nondeterministic_pvfcheck_failure() {
fn dont_abstain_for_nondeterministic_pvfcheck_failure() {
test_harness(|mut test_state, mut handle| {
async move {
let block_1 = FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]);
test_state
.activate_leaf_with_session(
&mut handle,
FakeLeaf::new(dummy_hash(), 1, vec![dummy_validation_code_hash(1)]),
block_1.clone(),
StartsNewSession { session_index: 2, validators: vec![OUR_VALIDATOR] },
)
.await;
Expand All @@ -922,10 +925,14 @@ fn abstain_for_nondeterministic_pvfcheck_failure() {
test_state.expect_session_for_child(&mut handle).await;
test_state.expect_validators(&mut handle).await;

test_state
.expect_candidate_precheck(&mut handle)
.await
.reply(PreCheckOutcome::Failed);
// Catch the pre-check request, but don't reply just yet.
let pre_check = test_state.expect_candidate_precheck(&mut handle).await;

// Now deactivate the leaf and reply to the precheck request.
test_state.deactivate_leaves(&mut handle, &[block_1.block_hash]).await;
pre_check.reply(PreCheckOutcome::Failed);

// the subsystem must remain silent.

test_state.send_conclude(&mut handle).await;
}
Expand Down
44 changes: 30 additions & 14 deletions node/core/pvf/src/execute/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,32 +224,48 @@ fn handle_job_finish(
artifact_id: ArtifactId,
result_tx: ResultSender,
) {
let (idle_worker, result) = match outcome {
Outcome::Ok { result_descriptor, duration: _, idle_worker } => {
let (idle_worker, result, duration) = match outcome {
Outcome::Ok { result_descriptor, duration, idle_worker } => {
// TODO: propagate the soft timeout

(Some(idle_worker), Ok(result_descriptor))
(Some(idle_worker), Ok(result_descriptor), Some(duration))
},
Outcome::InvalidCandidate { err, idle_worker } => (
Some(idle_worker),
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))),
None,
),
Outcome::InternalError { err, idle_worker } =>
(Some(idle_worker), Err(ValidationError::InternalError(err))),
(Some(idle_worker), Err(ValidationError::InternalError(err)), None),
Outcome::HardTimeout =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout))),
Outcome::IoErr =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath))),
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
Outcome::IoErr => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
None,
),
};

queue.metrics.execute_finished();
gum::debug!(
target: LOG_TARGET,
validation_code_hash = ?artifact_id.code_hash,
?worker,
worker_rip = idle_worker.is_none(),
"execute worker concluded",
);
if let Err(ref err) = result {
gum::warn!(
target: LOG_TARGET,
?artifact_id,
?worker,
worker_rip = idle_worker.is_none(),
"execution worker concluded, error occurred: {:?}",
err
);
} else {
gum::debug!(
target: LOG_TARGET,
?artifact_id,
?worker,
worker_rip = idle_worker.is_none(),
?duration,
"execute worker concluded successfully",
);
}

// First we send the result. It may fail due to the other end of the channel being dropped,
// that's legitimate and we don't treat that as an error.
Expand Down
21 changes: 15 additions & 6 deletions node/core/pvf/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ async fn handle_execute_pvf(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
gum::warn!(
target: LOG_TARGET,
?pvf,
?artifact_id,
Expand Down Expand Up @@ -595,7 +595,7 @@ async fn handle_heads_up(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
gum::warn!(
target: LOG_TARGET,
?active_pvf,
?artifact_id,
Expand Down Expand Up @@ -723,10 +723,19 @@ async fn handle_prepare_done(
*state = match result {
Ok(cpu_time_elapsed) =>
ArtifactState::Prepared { last_time_needed: SystemTime::now(), cpu_time_elapsed },
Err(error) => ArtifactState::FailedToProcess {
last_time_failed: SystemTime::now(),
num_failures: *num_failures + 1,
error,
Err(error) => {
let last_time_failed = SystemTime::now();
let num_failures = *num_failures + 1;

gum::warn!(
target: LOG_TARGET,
?artifact_id,
time_failed = ?last_time_failed,
%num_failures,
"artifact preparation failed: {}",
error
);
ArtifactState::FailedToProcess { last_time_failed, num_failures, error }
},
};

Expand Down
8 changes: 5 additions & 3 deletions roadmap/implementers-guide/src/node/utility/pvf-prechecker.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ To be relevant for the subsystem, a PVF must be returned by the [`pvfs_require_p

When a PVF just becomes relevant, the subsystem will send a message to the [Candidate Validation] subsystem asking for the pre-check.

Upon receving a message from the candidate-validation subsystem, the pre-checker will note down that the PVF has its judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case, a judgement was received for a PVF that is no longer in view it is ignored. It is possible that the candidate validation was not able to check the PVF. In that case, the PVF pre-checker will abstain and won't submit any check statements.
Upon receving a message from the candidate-validation subsystem, the pre-checker will note down that the PVF has its judgement and will also sign and submit a [`PvfCheckStatement`][PvfCheckStatement] via the [`submit_pvf_check_statement` runtime API][PVF pre-checking runtime API]. In case, a judgement was received for a PVF that is no longer in view it is ignored.

Since a vote only is valid during [one session][overview], the subsystem will have to resign and submit the statements for for the new session. The new session is assumed to be started if at least one of the leaves has a greater session index that was previously observed in any of the leaves.
It is possible that the candidate validation was not able to check the PVF, e.g. if it timed out. In that case, the PVF pre-checker will vote against it. This is considered safe, as there is no slashing for being on the wrong side of a pre-check vote.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to have a rationale on why we must reject instead of abstaining.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is better in two ways:

  1. Conclusion is reached faster - we have actual votes, instead of relying on a timeout.
  2. Being strict in pre-checking makes it safer to be more lenient in preparation errors afterwards. Hence we have more leeway in avoiding raising dubious disputes, without making things less secure.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know why, sr-labs explains it, but I don't see it in a guide :)


The subsystem tracks all the statement that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session.
Since a vote only is valid during [one session][overview], the subsystem will have to resign and submit the statements for the new session. The new session is assumed to be started if at least one of the leaves has a greater session index that was previously observed in any of the leaves.

The subsystem tracks all the statements that it submitted within a session. If for some reason a PVF became irrelevant and then becomes relevant again, the subsystem will not submit a new statement for that PVF within the same session.

If the node is not in the active validator set, it will still perform all the checks. However, it will only submit the check statements when the node is in the active validator set.

Expand Down
4 changes: 2 additions & 2 deletions roadmap/implementers-guide/src/pvf-prechecking.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ The problem is solved by having a pre-checking process which is run when a new v
Before any of those operations finish, the PVF pre-checking vote is initiated. The PVF pre-checking vote is identified by the PVF code hash that is being voted on. If there is already PVF pre-checking process running, then no
new PVF pre-checking vote will be started. Instead, the operation just subscribes to the existing vote.

The pre-checking vote can be concluded either by obtaining a supermajority or if it expires.
The pre-checking vote can be concluded either by obtaining a threshold of votes for a decision, or if it expires. The threshold to accept is a supermajority of 2/3 of validators. We reject once a supermajority is no longer possible.

Each validator checks the list of PVFs available for voting. The vote is binary, i.e. accept or reject a given PVF. As soon as the supermajority of votes are collected for one of the sides of the vote, the voting is concluded in that direction and the effects of the voting are enacted.
Each validator checks the list of PVFs available for voting. The vote is binary, i.e. accept or reject a given PVF. As soon as the threshold of votes are collected for one of the sides of the vote, the voting is concluded in that direction and the effects of the voting are enacted.

Only validators from the active set can participate in the vote. The set of active validators can change each session. That's why we reset the votes each session. A voting that observed a certain number of sessions will be rejected.

Expand Down
33 changes: 18 additions & 15 deletions runtime/parachains/src/paras/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,14 @@
//!
//! # PVF Pre-checking
//!
//! As was mentioned above, a brand new validation code should go through a process of approval.
//! As part of this process, validators from the active set will take the validation code and
//! check if it is malicious. Once they did that and have their judgement, either accept or reject,
//! they issue a statement in a form of an unsigned extrinsic. This extrinsic is processed by this
//! pallet. Once supermajority is gained for accept, then the process that initiated the check
//! is resumed (as mentioned before this can be either upgrading of validation code or onboarding).
//! If supermajority is gained for reject, then the process is canceled.
//! As was mentioned above, a brand new validation code should go through a process of approval. As
//! part of this process, validators from the active set will take the validation code and check if
//! it is malicious. Once they did that and have their judgement, either accept or reject, they
//! issue a statement in a form of an unsigned extrinsic. This extrinsic is processed by this
//! pallet. Once supermajority is gained for accept, then the process that initiated the check is
//! resumed (as mentioned before this can be either upgrading of validation code or onboarding). If
//! getting a supermajority becomes impossible (>1/3 of validators have already voted against), then
//! we reject.
//!
//! Below is a state diagram that depicts states of a single PVF pre-checking vote.
//!
Expand All @@ -92,8 +93,8 @@
//! │ │ │
//! │ ┌───────┐
//! │ │ │
//! └─▶│ init │────supermajority ┌──────────┐
//! │ │ against │ │
//! └─▶│ init │──── >1/3 against ┌──────────┐
//! │ │ │ │
//! └───────┘ └──────────▶│ rejected │
//! ▲ │ │ │
//! │ │ session └──────────┘
Expand Down Expand Up @@ -452,12 +453,14 @@ impl<BlockNumber> PvfCheckActiveVoteState<BlockNumber> {

/// Returns `None` if the quorum is not reached, or the direction of the decision.
fn quorum(&self, n_validators: usize) -> Option<PvfCheckOutcome> {
let q_threshold = primitives::v2::supermajority_threshold(n_validators);
// NOTE: counting the reject votes is deliberately placed first. This is to err on the safe.
if self.votes_reject.count_ones() >= q_threshold {
Some(PvfCheckOutcome::Rejected)
} else if self.votes_accept.count_ones() >= q_threshold {
let accept_threshold = primitives::v2::supermajority_threshold(n_validators);
// At this threshold, a supermajority is no longer possible, so we reject.
let reject_threshold = n_validators - accept_threshold;

if self.votes_accept.count_ones() >= accept_threshold {
Some(PvfCheckOutcome::Accepted)
} else if self.votes_reject.count_ones() > reject_threshold {
Some(PvfCheckOutcome::Rejected)
} else {
None
}
Expand Down Expand Up @@ -1011,7 +1014,7 @@ pub mod pallet {
}

if let Some(outcome) = active_vote.quorum(validators.len()) {
// The supermajority quorum has been achieved.
// The quorum has been achieved.
//
// Remove the PVF vote from the active map and finalize the PVF checking according
// to the outcome.
Expand Down
27 changes: 18 additions & 9 deletions runtime/parachains/src/paras/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1252,15 +1252,24 @@ fn pvf_check_upgrade_reject() {
Paras::schedule_code_upgrade(a, new_code.clone(), RELAY_PARENT, &Configuration::config());
check_code_is_stored(&new_code);

// Supermajority of validators vote against `new_code`. PVF should be rejected.
IntoIterator::into_iter([0, 1, 2, 3])
.map(|i| PvfCheckStatement {
accept: false,
subject: new_code.hash(),
session_index: EXPECTED_SESSION,
validator_index: i.into(),
})
.for_each(sign_and_include_pvf_check_statement);
// 1/3 of validators vote against `new_code`. PVF should not be rejected yet.
sign_and_include_pvf_check_statement(PvfCheckStatement {
accept: false,
subject: new_code.hash(),
session_index: EXPECTED_SESSION,
validator_index: 0.into(),
});

// Verify that the new code is not yet discarded.
check_code_is_stored(&new_code);

// >1/3 of validators vote against `new_code`. PVF should be rejected.
sign_and_include_pvf_check_statement(PvfCheckStatement {
accept: false,
subject: new_code.hash(),
session_index: EXPECTED_SESSION,
validator_index: 1.into(),
});

// Verify that the new code is discarded.
check_code_is_not_stored(&new_code);
Expand Down