diff --git a/.gitlab/pipeline/test.yml b/.gitlab/pipeline/test.yml index 79ef9070dba9..9c3fa7701c8f 100644 --- a/.gitlab/pipeline/test.yml +++ b/.gitlab/pipeline/test.yml @@ -522,4 +522,4 @@ test-syscalls: - if [[ "$CI_JOB_STATUS" == "failed" ]]; then printf "The x86_64 syscalls used by the worker binaries have changed. Please review if this is expected and update polkadot/scripts/list-syscalls/*-worker-syscalls as needed.\n"; fi - allow_failure: true # TODO: remove this once we have an idea how often the syscall lists will change + allow_failure: false # this rarely triggers in practice diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index fc6cd8fc7256..16a7c290b528 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -18,7 +18,6 @@ use crate::{ artifacts::ArtifactPathId, - security, worker_intf::{ clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -33,7 +32,7 @@ use polkadot_node_core_pvf_common::{ execute::{Handshake, WorkerResponse}, worker_dir, SecurityStatus, }; -use polkadot_parachain_primitives::primitives::{ValidationCodeHash, ValidationResult}; +use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::ExecutorParams; use std::{path::Path, time::Duration}; use tokio::{io, net::UnixStream}; @@ -132,10 +131,7 @@ pub async fn start_work( artifact.path.display(), ); - let artifact_path = artifact.path.clone(); with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move { - let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await; - if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await { gum::warn!( target: LOG_TARGET, @@ -160,10 +156,7 @@ pub async fn start_work( handle_response( response, pid, - &artifact.id.code_hash, - &artifact_path, execution_timeout, - audit_log_file ) .await, Err(error) => { @@ -217,10 +210,7 @@ pub async fn start_work( async fn handle_response( response: WorkerResponse, worker_pid: u32, - validation_code_hash: &ValidationCodeHash, - artifact_path: &Path, execution_timeout: Duration, - audit_log_file: Option, ) -> WorkerResponse { if let WorkerResponse::Ok { duration, .. } = response { if duration > execution_timeout { @@ -238,25 +228,6 @@ async fn handle_response( } } - if let WorkerResponse::JobDied { err: _, job_pid } = response { - // The job died. Check if it was due to a seccomp violation. - // - // NOTE: Log, but don't change the outcome. Not all validators may have - // auditing enabled, so we don't want attackers to abuse a non-deterministic - // outcome. - for syscall in security::check_seccomp_violations_for_job(audit_log_file, job_pid).await { - gum::error!( - target: LOG_TARGET, - %worker_pid, - %job_pid, - %syscall, - ?validation_code_hash, - ?artifact_path, - "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" - ); - } - } - response } diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index f978005068c8..a393e9baa9e5 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -19,7 +19,6 @@ use crate::{ artifacts::ArtifactId, metrics::Metrics, - security, worker_intf::{ clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -134,7 +133,6 @@ pub async fn start_work( pid, |tmp_artifact_file, mut stream, worker_dir| async move { let preparation_timeout = pvf.prep_timeout(); - let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await; if let Err(err) = send_request(&mut stream, &pvf).await { gum::warn!( @@ -170,7 +168,6 @@ pub async fn start_work( &pvf, &cache_path, preparation_timeout, - audit_log_file, ) .await, Ok(Err(err)) => { @@ -211,34 +208,13 @@ async fn handle_response( pvf: &PvfPrepData, cache_path: &Path, preparation_timeout: Duration, - audit_log_file: Option, ) -> Outcome { let PrepareWorkerSuccess { checksum, stats: PrepareStats { cpu_time_elapsed, memory_stats } } = match result.clone() { Ok(result) => result, // Timed out on the child. This should already be logged by the child. Err(PrepareError::TimedOut) => return Outcome::TimedOut, - Err(PrepareError::JobDied { err, job_pid }) => { - // The job died. Check if it was due to a seccomp violation. - // - // NOTE: Log, but don't change the outcome. Not all validators may have - // auditing enabled, so we don't want attackers to abuse a non-deterministic - // outcome. - for syscall in - security::check_seccomp_violations_for_job(audit_log_file, job_pid).await - { - gum::error!( - target: LOG_TARGET, - %worker_pid, - %job_pid, - %syscall, - ?pvf, - "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" - ); - } - - return Outcome::JobDied { err, job_pid } - }, + Err(PrepareError::JobDied { err, job_pid }) => return Outcome::JobDied { err, job_pid }, Err(PrepareError::OutOfMemory) => return Outcome::OutOfMemory, Err(err) => return Outcome::Concluded { worker, result: Err(err) }, }; diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs index ef8714f58c81..2fd3b53e96b4 100644 --- a/polkadot/node/core/pvf/src/security.rs +++ b/polkadot/node/core/pvf/src/security.rs @@ -17,10 +17,6 @@ use crate::{Config, SecurityStatus, LOG_TARGET}; use futures::join; use std::{fmt, path::Path}; -use tokio::{ - fs::{File, OpenOptions}, - io::{AsyncReadExt, AsyncSeekExt, SeekFrom}, -}; const SECURE_MODE_ANNOUNCEMENT: &'static str = "In the next release this will be a hard error by default. @@ -35,7 +31,6 @@ const SECURE_MODE_ANNOUNCEMENT: &'static str = pub async fn check_security_status(config: &Config) -> SecurityStatus { let Config { prepare_worker_program_path, cache_path, .. } = config; - // TODO: add check that syslog is available and that seccomp violations are logged? let (landlock, seccomp, change_root) = join!( check_landlock(prepare_worker_program_path), check_seccomp(prepare_worker_program_path), @@ -303,147 +298,3 @@ async fn check_seccomp( } } } - -const AUDIT_LOG_PATH: &'static str = "/var/log/audit/audit.log"; -const SYSLOG_PATH: &'static str = "/var/log/syslog"; - -/// System audit log. -pub struct AuditLogFile { - file: File, - path: &'static str, -} - -impl AuditLogFile { - /// Looks for an audit log file on the system and opens it, seeking to the end to skip any - /// events from before this was called. - /// - /// A bit of a verbose name, but it should clue future refactorers not to move calls closer to - /// where the `AuditLogFile` is used. - pub async fn try_open_and_seek_to_end() -> Option { - let mut path = AUDIT_LOG_PATH; - let mut file = match OpenOptions::new().read(true).open(AUDIT_LOG_PATH).await { - Ok(file) => Ok(file), - Err(_) => { - path = SYSLOG_PATH; - OpenOptions::new().read(true).open(SYSLOG_PATH).await - }, - } - .ok()?; - - let _pos = file.seek(SeekFrom::End(0)).await; - - Some(Self { file, path }) - } - - async fn read_new_since_open(mut self) -> String { - let mut buf = String::new(); - let _len = self.file.read_to_string(&mut buf).await; - buf - } -} - -/// Check if a seccomp violation occurred for the given job process. As the syslog may be in a -/// different location, or seccomp auditing may be disabled, this function provides a best-effort -/// attempt only. -/// -/// The `audit_log_file` must have been obtained before the job started. It only allows reading -/// entries that were written since it was obtained, so that we do not consider events from previous -/// processes with the same pid. This can still be racy, but it's unlikely and fine for a -/// best-effort attempt. -pub async fn check_seccomp_violations_for_job( - audit_log_file: Option, - job_pid: i32, -) -> Vec { - let audit_event_pid_field = format!("pid={job_pid}"); - - let audit_log_file = match audit_log_file { - Some(file) => { - gum::trace!( - target: LOG_TARGET, - %job_pid, - audit_log_path = ?file.path, - "checking audit log for seccomp violations", - ); - file - }, - None => { - gum::warn!( - target: LOG_TARGET, - %job_pid, - "could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs" - ); - return vec![] - }, - }; - let events = audit_log_file.read_new_since_open().await; - - let mut violations = vec![]; - for event in events.lines() { - if let Some(syscall) = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field) { - violations.push(syscall); - } - } - - violations -} - -fn parse_audit_log_for_seccomp_event(event: &str, audit_event_pid_field: &str) -> Option { - const SECCOMP_AUDIT_EVENT_TYPE: &'static str = "type=1326"; - - // Do a series of simple .contains instead of a regex, because I'm not sure if the fields are - // guaranteed to always be in the same order. - if !event.contains(SECCOMP_AUDIT_EVENT_TYPE) || !event.contains(&audit_event_pid_field) { - return None - } - - // Get the syscall. Let's avoid a dependency on regex just for this. - for field in event.split(" ") { - if let Some(syscall) = field.strip_prefix("syscall=") { - return syscall.parse::().ok() - } - } - - None -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_audit_log_for_seccomp_event() { - let audit_event_pid_field = "pid=2559058"; - - assert_eq!( - parse_audit_log_for_seccomp_event( - r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559058 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, - audit_event_pid_field - ), - Some(53) - ); - // pid is wrong - assert_eq!( - parse_audit_log_for_seccomp_event( - r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, - audit_event_pid_field - ), - None - ); - // type is wrong - assert_eq!( - parse_audit_log_for_seccomp_event( - r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, - audit_event_pid_field - ), - None - ); - // no syscall field - assert_eq!( - parse_audit_log_for_seccomp_event( - r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e compat=0 ip=0x7f7542c80d5e code=0x80000000"#, - audit_event_pid_field - ), - None - ); - } -}