paritytech · mrcnski · Dec 6, 2022 · Dec 2, 2022 · Dec 2, 2022 · Dec 2, 2022
diff --git a/node/core/pvf/src/execute/worker.rs b/node/core/pvf/src/execute/worker.rs
@@ -74,6 +74,8 @@ pub enum Outcome {
 
 /// Given the idle token of a worker and parameters of work, communicates with the worker and
 /// returns the outcome.
+///
+/// NOTE: Returning the `HardTimeout` or `IoErr` errors will trigger the child process being killed.
 pub async fn start_work(
 	worker: IdleWorker,
 	artifact: ArtifactPathId,
@@ -148,7 +150,7 @@ pub async fn start_work(
 				target: LOG_TARGET,
 				worker_pid = %pid,
 				validation_code_hash = ?artifact.id.code_hash,
-				"execution worker exceeded alloted time for execution",
+				"execution worker exceeded allotted time for execution",
 			);
 			// TODO: This case is not really a hard timeout as the timeout here in the host is
 			// lenient. Should fix this as part of

diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs
@@ -390,7 +390,8 @@ async fn run(
 			from_prepare_queue = from_prepare_queue_rx.next() => {
 				let from_queue = break_if_fatal!(from_prepare_queue.ok_or(Fatal));
 
-				// Note that preparation always succeeds.
+				// Note that unless the worker dies, the preparation outcome is always reported as
+				// concluded.
 				//
 				// That's because the error conditions are written into the artifact and will be
 				// reported at the time of the execution. It potentially, but not necessarily, can

diff --git a/node/core/pvf/src/prepare/pool.rs b/node/core/pvf/src/prepare/pool.rs
@@ -299,7 +299,8 @@ fn handle_mux(
 					let data = match spawned.get_mut(worker) {
 						None => {
 							// Perhaps the worker was killed meanwhile and the result is no longer
-							// relevant.
+							// relevant. We already send `Rip` when purging if we detect that the
+							// worker is dead.
 							return Ok(())
 						},
 						Some(data) => data,
@@ -321,6 +322,7 @@ fn handle_mux(
 
 					Ok(())
 				},
+				// Sending `rip: true` triggers killing the worker.
 				Outcome::DidNotMakeIt => {
 					if attempt_retire(metrics, spawned, worker) {
 						reply(
@@ -335,6 +337,7 @@ fn handle_mux(
 
 					Ok(())
 				},
+				// Sending `rip: true` triggers killing the worker.
 				Outcome::TimedOut => {
 					if attempt_retire(metrics, spawned, worker) {
 						reply(

diff --git a/node/core/pvf/src/prepare/worker.rs b/node/core/pvf/src/prepare/worker.rs
@@ -78,6 +78,9 @@ enum Selected {
 
 /// Given the idle token of a worker and parameters of work, communicates with the worker and
 /// returns the outcome.
+///
+/// NOTE: Returning the `TimedOut` or `DidNotMakeIt` errors will trigger the child process being
+/// killed.
 pub async fn start_work(
 	worker: IdleWorker,
 	code: Arc<Vec<u8>>,
@@ -149,6 +152,7 @@ pub async fn start_work(
 			},
 		};
 
+		// NOTE: A `TimedOut` or `DidNotMakeIt` error triggers the child process being killed.
 		match selected {
 			// Timed out on the child. This should already be logged by the child.
 			Selected::Done(Err(PrepareError::TimedOut)) => Outcome::TimedOut,
@@ -162,6 +166,9 @@ pub async fn start_work(
 }
 
 /// Handles the case where we successfully received response bytes on the host from the child.
+///
+/// NOTE: Here we know the artifact exists, but is still located in a temporary file which will be
+/// cleared by `with_tmp_file`.
 async fn handle_response_bytes(
 	response_bytes: Vec<u8>,
 	pid: u32,
@@ -201,9 +208,6 @@ async fn handle_response_bytes(
 		);
 
 		// Return a timeout error.
-		//
-		// NOTE: The artifact exists, but is located in a temporary file which
-		// will be cleared by `with_tmp_file`.
 		return Selected::Deadline
 	}
 

diff --git a/node/core/pvf/src/worker_common.rs b/node/core/pvf/src/worker_common.rs
@@ -233,7 +233,11 @@ pub async fn cpu_time_monitor_loop(
 				timeout.as_millis(),
 			);
 
-			// Send back a TimedOut error on timeout.
+			// Send back a `TimedOut` error.
+			//
+			// NOTE: This will cause the worker, whether preparation or execution, to be killed by
+			// the host. We do not kill the process here because it would interfere with the proper
+			// handling of this error.
 			let encoded_result = match job_kind {
 				JobKind::Prepare => {
 					let result: Result<(), PrepareError> = Err(PrepareError::TimedOut);
@@ -244,8 +248,8 @@ pub async fn cpu_time_monitor_loop(
 					result.encode()
 				},
 			};
-			// If we error there is nothing else we can do here, and we are killing the process,
-			// anyway. The receiving side will just have to time out.
+			// If we error here there is nothing we can do apart from log it. The receiving side
+			// will just have to time out.
 			if let Err(err) = framed_send(&mut stream, encoded_result.as_slice()).await {
 				gum::warn!(
 					target: LOG_TARGET,
@@ -254,9 +258,6 @@ pub async fn cpu_time_monitor_loop(
 					err
 				);
 			}
-
-			// Kill the process.
-			std::process::exit(1);
 		}
 
 		// Sleep for the remaining CPU time, plus a bit to account for overhead. Note that the sleep