Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ValidationError #2406

Merged
merged 12 commits into from
Nov 21, 2023
60 changes: 35 additions & 25 deletions polkadot/node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
#![warn(missing_docs)]

use polkadot_node_core_pvf::{
InternalValidationError, InvalidCandidate as WasmInvalidCandidate, PrepareError,
PrepareJobKind, PvfPrepData, ValidationError, ValidationHost,
InternalValidationError, InvalidCandidate as WasmInvalidCandidate, PossiblyInvalidError,
PrepareError, PrepareJobKind, PvfPrepData, ValidationError, ValidationHost,
};
use polkadot_node_primitives::{
BlockData, InvalidCandidate, PoV, ValidationResult, POV_BOMB_LIMIT, VALIDATION_CODE_BOMB_LIMIT,
Expand Down Expand Up @@ -642,7 +642,7 @@ async fn validate_candidate_exhaustive(
}

match result {
Err(ValidationError::InternalError(e)) => {
Err(ValidationError::Internal(e)) => {
gum::warn!(
target: LOG_TARGET,
?para_id,
Expand All @@ -651,34 +651,29 @@ async fn validate_candidate_exhaustive(
);
Err(ValidationFailed(e.to_string()))
},
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)) =>
Err(ValidationError::Invalid(WasmInvalidCandidate::HardTimeout)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedInvalid(e))) =>
Err(ValidationError::Invalid(WasmInvalidCandidate::WorkerReportedInvalid(e))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) =>
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
"ambiguous worker death".to_string(),
))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError(err))) =>
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),

Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(err))) =>
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
"ambiguous job death: {err}"
)))),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) => {
// In principle if preparation of the `WASM` fails, the current candidate can not be the
// reason for that. So we can't say whether it is invalid or not. In addition, with
// pre-checking enabled only valid runtimes should ever get enacted, so we can be
// reasonably sure that this is some local problem on the current node. However, as this
// particular error *seems* to indicate a deterministic error, we raise a warning.
Err(ValidationError::Preparation(e)) => {
gum::warn!(
target: LOG_TARGET,
?para_id,
?e,
"Deterministic error occurred during preparation (should have been ruled out by pre-checking phase)",
);
Err(ValidationFailed(e))
Err(ValidationFailed(e.to_string()))
},
Ok(res) =>
if res.head_data.hash() != candidate_receipt.descriptor.para_head {
Expand Down Expand Up @@ -762,16 +757,31 @@ trait ValidationBackend {
}

match validation_result {
Err(ValidationError::InvalidCandidate(
WasmInvalidCandidate::AmbiguousWorkerDeath |
WasmInvalidCandidate::AmbiguousJobDeath(_),
)) if num_death_retries_left > 0 => num_death_retries_left -= 1,
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError(_)))
if num_job_error_retries_left > 0 =>
num_job_error_retries_left -= 1,
Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 =>
num_internal_retries_left -= 1,
_ => break,
Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::AmbiguousWorkerDeath |
PossiblyInvalidError::AmbiguousJobDeath(_),
)) =>
if num_death_retries_left > 0 {
num_death_retries_left -= 1;
} else {
break;
},

Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) =>
if num_job_error_retries_left > 0 {
num_job_error_retries_left -= 1;
} else {
break;
},

Err(ValidationError::Internal(_)) =>
if num_internal_retries_left > 0 {
num_internal_retries_left -= 1;
} else {
break;
},

Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break,
}

// If we got a possibly transient error, retry once after a brief delay, on the
Expand Down
40 changes: 20 additions & 20 deletions polkadot/node/core/candidate-validation/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ fn candidate_validation_bad_return_is_invalid() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(
ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout),
)),
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
))),
validation_data,
validation_code,
candidate_receipt,
Expand Down Expand Up @@ -561,7 +561,7 @@ fn candidate_validation_one_ambiguous_error_is_valid() {

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
Ok(validation_result),
]),
validation_data.clone(),
Expand Down Expand Up @@ -602,8 +602,8 @@ fn candidate_validation_multiple_ambiguous_errors_is_invalid() {

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result_list(vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
]),
validation_data,
validation_code,
Expand All @@ -626,7 +626,7 @@ fn candidate_validation_retry_internal_errors() {
vec![
Err(InternalValidationError::HostCommunication("foo".into()).into()),
// Throw an AJD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(
"baz".into(),
))),
// Throw another internal error.
Expand All @@ -644,7 +644,7 @@ fn candidate_validation_dont_retry_internal_errors() {
vec![
Err(InternalValidationError::HostCommunication("foo".into()).into()),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
// Throw another internal error.
Err(InternalValidationError::HostCommunication("bar".into()).into()),
],
Expand All @@ -659,11 +659,11 @@ fn candidate_validation_retry_panic_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Approval,
vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("bar".into()))),
],
);

Expand All @@ -676,11 +676,11 @@ fn candidate_validation_dont_retry_panic_errors() {
let v = candidate_validation_retry_on_error_helper(
PvfExecKind::Backing,
vec![
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("foo".into()))),
// Throw an AWD error, we should still retry again.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
// Throw another panic error.
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError("bar".into()))),
],
);

Expand Down Expand Up @@ -758,9 +758,9 @@ fn candidate_validation_timeout_is_internal_error() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(
ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout),
)),
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
))),
validation_data,
validation_code,
candidate_receipt,
Expand Down Expand Up @@ -852,9 +852,9 @@ fn candidate_validation_code_mismatch_is_invalid() {
let (_ctx, _ctx_handle) = test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone());

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(
ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout),
)),
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
))),
validation_data,
validation_code,
candidate_receipt,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@

use criterion::{criterion_group, criterion_main, BatchSize, Criterion, SamplingMode};
use polkadot_node_core_pvf::{
start, testing, Config, Metrics, PrepareError, PrepareJobKind, PrepareStats, PvfPrepData,
ValidationHost,
start, testing, Config, Metrics, PrepareError, PrepareJobKind, PvfPrepData, ValidationHost,
};
use polkadot_primitives::ExecutorParams;
use rococo_runtime::WASM_BINARY;
Expand Down
19 changes: 8 additions & 11 deletions polkadot/node/core/pvf/common/src/executor_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,22 +140,20 @@ pub unsafe fn create_runtime_from_artifact_bytes(
executor_params: &ExecutorParams,
) -> Result<WasmtimeRuntime, WasmError> {
let mut config = DEFAULT_CONFIG.clone();
config.semantics =
params_to_wasmtime_semantics(executor_params).map_err(|err| WasmError::Other(err))?;
config.semantics = params_to_wasmtime_semantics(executor_params);

sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
compiled_artifact_blob,
config,
)
}

pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Result<Semantics, String> {
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
let mut sem = DEFAULT_CONFIG.semantics.clone();
let mut stack_limit = if let Some(stack_limit) = sem.deterministic_stack_limit.clone() {
stack_limit
} else {
return Err("No default stack limit set".to_owned())
};
let mut stack_limit = sem
.deterministic_stack_limit
.expect("There is a comment to not change the default stack limit; it should always be available; qed")
.clone();

for p in par.iter() {
match p {
Expand All @@ -172,7 +170,7 @@ pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Result<Semantics, S
}
}
sem.deterministic_stack_limit = Some(stack_limit);
Ok(sem)
sem
}

/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
Expand All @@ -191,8 +189,7 @@ pub fn prepare(
blob: RuntimeBlob,
executor_params: &ExecutorParams,
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
let semantics = params_to_wasmtime_semantics(executor_params)
.map_err(|e| sc_executor_common::error::WasmError::Other(e))?;
let semantics = params_to_wasmtime_semantics(executor_params);
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
}

Expand Down
37 changes: 25 additions & 12 deletions polkadot/node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,31 +19,44 @@ use polkadot_node_core_pvf_common::error::{InternalValidationError, PrepareError
/// A error raised during validation of the candidate.
#[derive(Debug, Clone)]
pub enum ValidationError {
/// The error was raised because the candidate is invalid.
/// Deterministic preparation issue. In practice, most of the problems should be caught by
/// prechecking, so this may be a sign of internal conditions.
///
/// Whenever we are unsure if the error was due to the candidate or not, we must vote invalid.
InvalidCandidate(InvalidCandidate),
/// Some internal error occurred.
InternalError(InternalValidationError),
/// In principle if preparation of the `WASM` fails, the current candidate cannot be the
/// reason for that. So we can't say whether it is invalid or not. In addition, with
/// pre-checking enabled only valid runtimes should ever get enacted, so we can be
/// reasonably sure that this is some local problem on the current node. However, as this
/// particular error *seems* to indicate a deterministic error, we raise a warning.
Preparation(PrepareError),
/// The error was raised because the candidate is invalid. Should vote against.
Invalid(InvalidCandidate),
/// Possibly transient issue that may resolve after retries. Should vote against when retries
/// fail.
PossiblyInvalid(PossiblyInvalidError),
/// Preparation or execution issue caused by an internal condition. Should not vote against.
Internal(InternalValidationError),
}

/// A description of an error raised during executing a PVF and can be attributed to the combination
/// of the candidate [`polkadot_parachain_primitives::primitives::ValidationParams`] and the PVF.
#[derive(Debug, Clone)]
pub enum InvalidCandidate {
/// PVF preparation ended up with a deterministic error.
PrepareError(String),
/// The candidate is reported to be invalid by the execution worker. The string contains the
/// error message.
WorkerReportedInvalid(String),
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
}

/// Possibly transient issue that may resolve after retries.
#[derive(Debug, Clone)]
pub enum PossiblyInvalidError {
/// The worker process (not the job) has died during validation of a candidate.
///
/// It's unlikely that this is caused by malicious code since workers spawn separate job
/// processes, and those job processes are sandboxed. But, it is possible. We retry in this
/// case, and if the error persists, we assume it's caused by the candidate and vote against.
AmbiguousWorkerDeath,
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
/// The job process (not the worker) has died for one of the following reasons:
///
/// (a) A seccomp violation occurred, most likely due to an attempt by malicious code to
Expand All @@ -68,7 +81,7 @@ pub enum InvalidCandidate {

impl From<InternalValidationError> for ValidationError {
fn from(error: InternalValidationError) -> Self {
Self::InternalError(error)
Self::Internal(error)
}
}

Expand All @@ -77,9 +90,9 @@ impl From<PrepareError> for ValidationError {
// Here we need to classify the errors into two errors: deterministic and non-deterministic.
// See [`PrepareError::is_deterministic`].
if error.is_deterministic() {
Self::InvalidCandidate(InvalidCandidate::PrepareError(error.to_string()))
Self::Preparation(error)
} else {
Self::InternalError(InternalValidationError::NonDeterministicPrepareError(error))
Self::Internal(InternalValidationError::NonDeterministicPrepareError(error))
}
}
}
14 changes: 7 additions & 7 deletions polkadot/node/core/pvf/src/execute/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::{
host::ResultSender,
metrics::Metrics,
worker_intf::{IdleWorker, WorkerHandle},
InvalidCandidate, ValidationError, LOG_TARGET,
InvalidCandidate, PossiblyInvalidError, ValidationError, LOG_TARGET,
};
use futures::{
channel::mpsc,
Expand Down Expand Up @@ -342,27 +342,27 @@ fn handle_job_finish(
},
Outcome::InvalidCandidate { err, idle_worker } => (
Some(idle_worker),
Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedInvalid(err))),
Err(ValidationError::Invalid(InvalidCandidate::WorkerReportedInvalid(err))),
None,
),
Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None),
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
// Either the worker or the job timed out. Kill the worker in either case. Treated as
// definitely-invalid, because if we timed out, there's no time left for a retry.
Outcome::HardTimeout =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None),
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
// "Maybe invalid" errors (will retry).
Outcome::WorkerIntfErr => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)),
None,
),
Outcome::JobDied { err } => (
None,
Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
None,
),
Outcome::JobError { err } =>
(None, Err(ValidationError::InvalidCandidate(InvalidCandidate::JobError(err))), None),
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
};

queue.metrics.execute_finished();
Expand Down
Loading
Loading