Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PVF: remove audit log access #2461

Merged
merged 3 commits into from
Nov 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitlab/pipeline/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -522,4 +522,4 @@ test-syscalls:
- if [[ "$CI_JOB_STATUS" == "failed" ]]; then
printf "The x86_64 syscalls used by the worker binaries have changed. Please review if this is expected and update polkadot/scripts/list-syscalls/*-worker-syscalls as needed.\n";
fi
allow_failure: true # TODO: remove this once we have an idea how often the syscall lists will change
allow_failure: false # this rarely triggers in practice
31 changes: 1 addition & 30 deletions polkadot/node/core/pvf/src/execute/worker_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

use crate::{
artifacts::ArtifactPathId,
security,
worker_intf::{
clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker,
SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
Expand All @@ -33,7 +32,7 @@ use polkadot_node_core_pvf_common::{
execute::{Handshake, WorkerResponse},
worker_dir, SecurityStatus,
};
use polkadot_parachain_primitives::primitives::{ValidationCodeHash, ValidationResult};
use polkadot_parachain_primitives::primitives::ValidationResult;
use polkadot_primitives::ExecutorParams;
use std::{path::Path, time::Duration};
use tokio::{io, net::UnixStream};
Expand Down Expand Up @@ -132,10 +131,7 @@ pub async fn start_work(
artifact.path.display(),
);

let artifact_path = artifact.path.clone();
with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move {
let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await;

if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await {
gum::warn!(
target: LOG_TARGET,
Expand All @@ -160,10 +156,7 @@ pub async fn start_work(
handle_response(
response,
pid,
&artifact.id.code_hash,
&artifact_path,
execution_timeout,
audit_log_file
)
.await,
Err(error) => {
Expand Down Expand Up @@ -217,10 +210,7 @@ pub async fn start_work(
async fn handle_response(
response: WorkerResponse,
worker_pid: u32,
validation_code_hash: &ValidationCodeHash,
artifact_path: &Path,
execution_timeout: Duration,
audit_log_file: Option<security::AuditLogFile>,
) -> WorkerResponse {
if let WorkerResponse::Ok { duration, .. } = response {
if duration > execution_timeout {
Expand All @@ -238,25 +228,6 @@ async fn handle_response(
}
}

if let WorkerResponse::JobDied { err: _, job_pid } = response {
// The job died. Check if it was due to a seccomp violation.
//
// NOTE: Log, but don't change the outcome. Not all validators may have
// auditing enabled, so we don't want attackers to abuse a non-deterministic
// outcome.
for syscall in security::check_seccomp_violations_for_job(audit_log_file, job_pid).await {
gum::error!(
target: LOG_TARGET,
%worker_pid,
%job_pid,
%syscall,
?validation_code_hash,
?artifact_path,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}
}

response
}

Expand Down
26 changes: 1 addition & 25 deletions polkadot/node/core/pvf/src/prepare/worker_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
use crate::{
artifacts::ArtifactId,
metrics::Metrics,
security,
worker_intf::{
clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker,
SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
Expand Down Expand Up @@ -134,7 +133,6 @@ pub async fn start_work(
pid,
|tmp_artifact_file, mut stream, worker_dir| async move {
let preparation_timeout = pvf.prep_timeout();
let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await;

if let Err(err) = send_request(&mut stream, &pvf).await {
gum::warn!(
Expand Down Expand Up @@ -170,7 +168,6 @@ pub async fn start_work(
&pvf,
&cache_path,
preparation_timeout,
audit_log_file,
)
.await,
Ok(Err(err)) => {
Expand Down Expand Up @@ -211,34 +208,13 @@ async fn handle_response(
pvf: &PvfPrepData,
cache_path: &Path,
preparation_timeout: Duration,
audit_log_file: Option<security::AuditLogFile>,
) -> Outcome {
let PrepareWorkerSuccess { checksum, stats: PrepareStats { cpu_time_elapsed, memory_stats } } =
match result.clone() {
Ok(result) => result,
// Timed out on the child. This should already be logged by the child.
Err(PrepareError::TimedOut) => return Outcome::TimedOut,
Err(PrepareError::JobDied { err, job_pid }) => {
// The job died. Check if it was due to a seccomp violation.
//
// NOTE: Log, but don't change the outcome. Not all validators may have
// auditing enabled, so we don't want attackers to abuse a non-deterministic
// outcome.
for syscall in
security::check_seccomp_violations_for_job(audit_log_file, job_pid).await
{
gum::error!(
target: LOG_TARGET,
%worker_pid,
%job_pid,
%syscall,
?pvf,
"A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!"
);
}

return Outcome::JobDied { err, job_pid }
},
Err(PrepareError::JobDied { err, job_pid }) => return Outcome::JobDied { err, job_pid },
Err(PrepareError::OutOfMemory) => return Outcome::OutOfMemory,
Err(err) => return Outcome::Concluded { worker, result: Err(err) },
};
Expand Down
149 changes: 0 additions & 149 deletions polkadot/node/core/pvf/src/security.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@
use crate::{Config, SecurityStatus, LOG_TARGET};
use futures::join;
use std::{fmt, path::Path};
use tokio::{
fs::{File, OpenOptions},
io::{AsyncReadExt, AsyncSeekExt, SeekFrom},
};

const SECURE_MODE_ANNOUNCEMENT: &'static str =
"In the next release this will be a hard error by default.
Expand All @@ -35,7 +31,6 @@ const SECURE_MODE_ANNOUNCEMENT: &'static str =
pub async fn check_security_status(config: &Config) -> SecurityStatus {
let Config { prepare_worker_program_path, cache_path, .. } = config;

// TODO: add check that syslog is available and that seccomp violations are logged?
let (landlock, seccomp, change_root) = join!(
check_landlock(prepare_worker_program_path),
check_seccomp(prepare_worker_program_path),
Expand Down Expand Up @@ -303,147 +298,3 @@ async fn check_seccomp(
}
}
}

const AUDIT_LOG_PATH: &'static str = "/var/log/audit/audit.log";
const SYSLOG_PATH: &'static str = "/var/log/syslog";

/// System audit log.
pub struct AuditLogFile {
file: File,
path: &'static str,
}

impl AuditLogFile {
/// Looks for an audit log file on the system and opens it, seeking to the end to skip any
/// events from before this was called.
///
/// A bit of a verbose name, but it should clue future refactorers not to move calls closer to
/// where the `AuditLogFile` is used.
pub async fn try_open_and_seek_to_end() -> Option<Self> {
let mut path = AUDIT_LOG_PATH;
let mut file = match OpenOptions::new().read(true).open(AUDIT_LOG_PATH).await {
Ok(file) => Ok(file),
Err(_) => {
path = SYSLOG_PATH;
OpenOptions::new().read(true).open(SYSLOG_PATH).await
},
}
.ok()?;

let _pos = file.seek(SeekFrom::End(0)).await;

Some(Self { file, path })
}

async fn read_new_since_open(mut self) -> String {
let mut buf = String::new();
let _len = self.file.read_to_string(&mut buf).await;
buf
}
}

/// Check if a seccomp violation occurred for the given job process. As the syslog may be in a
/// different location, or seccomp auditing may be disabled, this function provides a best-effort
/// attempt only.
///
/// The `audit_log_file` must have been obtained before the job started. It only allows reading
/// entries that were written since it was obtained, so that we do not consider events from previous
/// processes with the same pid. This can still be racy, but it's unlikely and fine for a
/// best-effort attempt.
pub async fn check_seccomp_violations_for_job(
audit_log_file: Option<AuditLogFile>,
job_pid: i32,
) -> Vec<u32> {
let audit_event_pid_field = format!("pid={job_pid}");

let audit_log_file = match audit_log_file {
Some(file) => {
gum::trace!(
target: LOG_TARGET,
%job_pid,
audit_log_path = ?file.path,
"checking audit log for seccomp violations",
);
file
},
None => {
gum::warn!(
target: LOG_TARGET,
%job_pid,
"could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs"
);
return vec![]
},
};
let events = audit_log_file.read_new_since_open().await;

let mut violations = vec![];
for event in events.lines() {
if let Some(syscall) = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field) {
violations.push(syscall);
}
}

violations
}

fn parse_audit_log_for_seccomp_event(event: &str, audit_event_pid_field: &str) -> Option<u32> {
const SECCOMP_AUDIT_EVENT_TYPE: &'static str = "type=1326";

// Do a series of simple .contains instead of a regex, because I'm not sure if the fields are
// guaranteed to always be in the same order.
if !event.contains(SECCOMP_AUDIT_EVENT_TYPE) || !event.contains(&audit_event_pid_field) {
return None
}

// Get the syscall. Let's avoid a dependency on regex just for this.
for field in event.split(" ") {
if let Some(syscall) = field.strip_prefix("syscall=") {
return syscall.parse::<u32>().ok()
}
}

None
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_parse_audit_log_for_seccomp_event() {
let audit_event_pid_field = "pid=2559058";

assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559058 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
Some(53)
);
// pid is wrong
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
None
);
// type is wrong
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
None
);
// no syscall field
assert_eq!(
parse_audit_log_for_seccomp_event(
r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e compat=0 ip=0x7f7542c80d5e code=0x80000000"#,
audit_event_pid_field
),
None
);
}
}