Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tracking/limiting memory allocator #1192

Merged
merged 62 commits into from
Nov 3, 2023
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
47ecf58
Import changes from archived repo
s0me0ne-unkn0wn Aug 27, 2023
58330de
Fix dependencies
s0me0ne-unkn0wn Aug 30, 2023
aac1da6
Merge branch 'master' into s0me0ne/tracking-allocator
pepoviola Aug 30, 2023
99d4f5e
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 1, 2023
18ef342
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 2, 2023
facd26a
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 12, 2023
3d2146a
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 14, 2023
6ee5ff3
Format manifests
s0me0ne-unkn0wn Sep 14, 2023
f85c753
Make peak allocation value observable
s0me0ne-unkn0wn Sep 15, 2023
c9c0bf8
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 15, 2023
edfa2e1
Enforce memory limits
s0me0ne-unkn0wn Sep 16, 2023
ac5dab4
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 16, 2023
994273a
Fix the node allocator declaration
s0me0ne-unkn0wn Sep 16, 2023
72ded39
`cargo fmt`
s0me0ne-unkn0wn Sep 16, 2023
dfa0daf
Implement abort handler
s0me0ne-unkn0wn Sep 21, 2023
e740b5c
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 21, 2023
48cde3e
Address some discussions
s0me0ne-unkn0wn Sep 21, 2023
acf149e
Remove feature gate
s0me0ne-unkn0wn Sep 21, 2023
6cfeceb
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 21, 2023
c7fa464
Update crate description
s0me0ne-unkn0wn Sep 22, 2023
edeb901
Implement failure callback
s0me0ne-unkn0wn Sep 22, 2023
251ac90
Merge remote-tracking branch 'origin/s0me0ne/tracking-allocator' into…
s0me0ne-unkn0wn Sep 22, 2023
4195806
Address discussions
s0me0ne-unkn0wn Sep 22, 2023
09e5c07
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 22, 2023
02ad536
Fix tests
s0me0ne-unkn0wn Sep 22, 2023
e1a2b63
Fix typo
s0me0ne-unkn0wn Sep 25, 2023
1918d2a
Address discussions
s0me0ne-unkn0wn Sep 26, 2023
b06cb6d
Merge remote-tracking branch 'origin/s0me0ne/tracking-allocator' into…
s0me0ne-unkn0wn Sep 26, 2023
39cf3b7
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 26, 2023
007054a
Address discussions
s0me0ne-unkn0wn Sep 26, 2023
c81e8aa
Try to fix test pipeline
s0me0ne-unkn0wn Sep 26, 2023
5920260
Fix PVF preparation benchmark
s0me0ne-unkn0wn Sep 26, 2023
a37c688
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 26, 2023
96d71c9
Minor fixes
s0me0ne-unkn0wn Sep 27, 2023
30e5236
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 27, 2023
73799fb
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 28, 2023
93b9d61
Eliminate redundant branch
s0me0ne-unkn0wn Sep 28, 2023
8485741
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 28, 2023
7a3875a
Revert "Eliminate redundant branch"
s0me0ne-unkn0wn Sep 28, 2023
83e6b99
Remove redundant todo comment
s0me0ne-unkn0wn Sep 28, 2023
6c6afc8
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Sep 29, 2023
9de0856
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Oct 4, 2023
d6470c5
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Oct 5, 2023
d2d8a60
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Oct 11, 2023
d74aa20
Remove stale file
s0me0ne-unkn0wn Oct 11, 2023
fb40a0d
Merge branch 's0me0ne/tracking-allocator' of github.com:paritytech/po…
s0me0ne-unkn0wn Oct 20, 2023
9c1d920
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Oct 20, 2023
c6be92b
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Oct 23, 2023
8c58a68
Abstract `Spinlock`
s0me0ne-unkn0wn Oct 23, 2023
52578ee
Merge branch 'master' into s0me0ne/tracking-allocator
s0me0ne-unkn0wn Oct 23, 2023
778513c
Update comment
s0me0ne-unkn0wn Oct 29, 2023
9c3ea9b
Add safety comment
s0me0ne-unkn0wn Oct 29, 2023
54686a1
Fix comment formatting
s0me0ne-unkn0wn Oct 29, 2023
07eb6a9
Fix typo
s0me0ne-unkn0wn Oct 29, 2023
e3ca3b6
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Oct 29, 2023
8be6359
Remove unneeded trait bound
s0me0ne-unkn0wn Oct 29, 2023
8b0a722
Remove stale file
s0me0ne-unkn0wn Oct 29, 2023
331ed01
Rename tracking fn
s0me0ne-unkn0wn Nov 1, 2023
7d69a9b
Merge remote-tracking branch 'origin/master' into s0me0ne/tracking-al…
s0me0ne-unkn0wn Nov 1, 2023
10e7443
Fix clippy and tests
s0me0ne-unkn0wn Nov 1, 2023
c3b66f2
Merge branch 'master' into s0me0ne/tracking-allocator
s0me0ne-unkn0wn Nov 1, 2023
e94ddf9
Merge branch 'master' into s0me0ne/tracking-allocator
s0me0ne-unkn0wn Nov 3, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion polkadot/node/core/pvf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ substrate-build-script-utils = { path = "../../../../substrate/utils/build-scrip
assert_matches = "1.4.0"
hex-literal = "0.4.1"
polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] }
# For the puppet worker, depend on ourselves with the test-utils feature.
polkadot-node-core-pvf = { path = ".", features = ["test-utils"] }

adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" }
Expand Down
26 changes: 25 additions & 1 deletion polkadot/node/core/pvf/common/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,45 @@ use std::fmt;
pub type PrepareResult = Result<PrepareStats, PrepareError>;

/// An error that occurred during the prepare part of the PVF pipeline.
// Codec indexes are intended to stabilize pre-encoded payloads (see `OOM_PAYLOAD` below)
#[derive(Debug, Clone, Encode, Decode)]
pub enum PrepareError {
/// During the prevalidation stage of preparation an issue was found with the PVF.
#[codec(index = 0)]
s0me0ne-unkn0wn marked this conversation as resolved.
Show resolved Hide resolved
Prevalidation(String),
/// Compilation failed for the given PVF.
#[codec(index = 1)]
Preparation(String),
/// Instantiation of the WASM module instance failed.
#[codec(index = 2)]
RuntimeConstruction(String),
/// An unexpected panic has occurred in the preparation worker.
#[codec(index = 3)]
Panic(String),
/// Failed to prepare the PVF due to the time limit.
#[codec(index = 4)]
TimedOut,
/// An IO error occurred. This state is reported by either the validation host or by the
/// worker.
#[codec(index = 5)]
IoErr(String),
/// The temporary file for the artifact could not be created at the given cache path. This
/// state is reported by the validation host (not by the worker).
#[codec(index = 6)]
CreateTmpFileErr(String),
/// The response from the worker is received, but the file cannot be renamed (moved) to the
/// final destination location. This state is reported by the validation host (not by the
/// worker).
#[codec(index = 7)]
RenameTmpFileErr(String),
/// Memory limit reached
#[codec(index = 8)]
OutOfMemory,
}

/// Pre-encoded length-prefixed `PrepareResult::Err(PrepareError::OutOfMemory)`
pub const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08";

impl PrepareError {
/// Returns whether this is a deterministic error, i.e. one that should trigger reliably. Those
/// errors depend on the PVF itself and the sc-executor/wasmtime logic.
Expand All @@ -57,7 +72,7 @@ impl PrepareError {
pub fn is_deterministic(&self) -> bool {
use PrepareError::*;
match self {
Prevalidation(_) | Preparation(_) | Panic(_) => true,
Prevalidation(_) | Preparation(_) | Panic(_) | OutOfMemory => true,
TimedOut | IoErr(_) | CreateTmpFileErr(_) | RenameTmpFileErr(_) => false,
// Can occur due to issues with the PVF, but also due to local errors.
RuntimeConstruction(_) => false,
Expand All @@ -77,6 +92,7 @@ impl fmt::Display for PrepareError {
IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err),
CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err),
RenameTmpFileErr(err) => write!(f, "prepare: error renaming tmp file: {}", err),
OutOfMemory => write!(f, "prepare: out of memory"),
}
}
}
Expand Down Expand Up @@ -112,3 +128,11 @@ impl fmt::Display for InternalValidationError {
}
}
}

#[test]
fn pre_encoded_payloads() {
let oom_enc = PrepareResult::Err(PrepareError::OutOfMemory).encode();
let mut oom_payload = oom_enc.len().to_le_bytes().to_vec();
oom_payload.extend(oom_enc);
assert_eq!(oom_payload, OOM_PAYLOAD);
}
4 changes: 3 additions & 1 deletion polkadot/node/core/pvf/common/src/prepare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ pub struct PrepareStats {
/// supported by the OS, `ru_maxrss`.
#[derive(Clone, Debug, Default, Encode, Decode)]
pub struct MemoryStats {
/// Memory stats from `tikv_jemalloc_ctl`.
/// Memory stats from `tikv_jemalloc_ctl`, polling-based and not very precise.
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
pub memory_tracker_stats: Option<MemoryAllocationStats>,
/// `ru_maxrss` from `getrusage`. `None` if an error occurred.
#[cfg(target_os = "linux")]
pub max_rss: Option<i64>,
/// Peak allocation in bytes measured by tracking allocator
pub peak_tracked_alloc: u64,
}

/// Statistics of collected memory metrics.
Expand Down
14 changes: 13 additions & 1 deletion polkadot/node/core/pvf/prepare-worker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@ futures = "0.3.21"
gum = { package = "tracing-gum", path = "../../../gum" }
libc = "0.2.139"
rayon = "1.5.1"
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tokio = { version = "1.24.2", features = ["fs", "process"] }
tracking-allocator = { path = "../../../tracking-allocator" }
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tikv-jemallocator = { version = "0.5.0", optional = true }

parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] }

Expand All @@ -28,11 +30,21 @@ sp-maybe-compressed-blob = { path = "../../../../../substrate/primitives/maybe-c
sp-tracing = { path = "../../../../../substrate/primitives/tracing" }

[target.'cfg(target_os = "linux")'.dependencies]
tikv-jemallocator = "0.5.0"
tikv-jemalloc-ctl = "0.5.0"

[features]
builder = []
jemalloc-allocator = [
"dep:tikv-jemalloc-ctl",
"dep:tikv-jemallocator",
"polkadot-node-core-pvf-common/jemalloc-allocator",
]

[dev-dependencies]
criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support"] }
staging-kusama-runtime = { path = "../../../../runtime/kusama" }

[[bench]]
name = "prepare_kusama_runtime"
harness = false
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
use polkadot_node_core_pvf_common::{prepare::PrepareJobKind, pvf::PvfPrepData};
use polkadot_node_core_pvf_prepare_worker::{prepare, prevalidate};
use polkadot_primitives::ExecutorParams;
use std::time::Duration;

fn do_prepare_runtime(pvf: PvfPrepData) {
let blob = match prevalidate(&pvf.code()) {
Err(err) => panic!("{:?}", err),
Ok(b) => b,
};

match prepare(blob, &pvf.executor_params()) {
Ok(_) => (),
Err(err) => panic!("{:?}", err),
}
}

fn prepare_kusama_runtime(c: &mut Criterion) {
let blob = staging_kusama_runtime::WASM_BINARY.unwrap();
let pvf = match sp_maybe_compressed_blob::decompress(&blob, 64 * 1024 * 1024) {
Ok(code) => PvfPrepData::from_code(
code.into_owned(),
ExecutorParams::default(),
Duration::from_secs(360),
PrepareJobKind::Compilation,
),
Err(e) => {
panic!("Cannot decompress blob: {:?}", e);
},
};

let mut group = c.benchmark_group("kusama");
group.sampling_mode(SamplingMode::Flat);
group.sample_size(20);
group.measurement_time(Duration::from_secs(240));
group.bench_function("prepare Kusama runtime", |b| {
// `PvfPrepData` is designed to be cheap to clone, so cloning shouldn't affect the
// benchmark accuracy
b.iter(|| do_prepare_runtime(pvf.clone()))
});
group.finish();
}

criterion_group!(preparation, prepare_kusama_runtime);
criterion_main!(preparation);
88 changes: 86 additions & 2 deletions polkadot/node/core/pvf/prepare-worker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread
use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop};
use parity_scale_codec::{Decode, Encode};
use polkadot_node_core_pvf_common::{
error::{PrepareError, PrepareResult},
error::{PrepareError, PrepareResult, OOM_PAYLOAD},
executor_intf::Executor,
framed_recv, framed_send,
prepare::{MemoryStats, PrepareJobKind, PrepareStats},
Expand All @@ -47,11 +47,22 @@ use polkadot_node_core_pvf_common::{
};
use polkadot_primitives::ExecutorParams;
use std::{
os::fd::{AsRawFd, RawFd},
path::PathBuf,
sync::{mpsc::channel, Arc},
time::Duration,
};
use tokio::{io, net::UnixStream};
use tracking_allocator::TrackingAllocator;

#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
#[global_allocator]
static ALLOC: TrackingAllocator<tikv_jemallocator::Jemalloc> =
TrackingAllocator(tikv_jemallocator::Jemalloc);

#[cfg(not(any(target_os = "linux", feature = "jemalloc-allocator")))]
#[global_allocator]
static ALLOC: TrackingAllocator<std::alloc::System> = TrackingAllocator(std::alloc::System);

/// Contains the bytes for a successfully compiled artifact.
pub struct CompiledArtifact(Vec<u8>);
Expand Down Expand Up @@ -91,6 +102,44 @@ async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Re
framed_send(stream, &result.encode()).await
}

fn start_memory_tracking(fd: RawFd, limit: Option<isize>) {
unsafe {
// SAFETY: Inside the failure handler, the allocator is locked and no allocations or
// deallocations are possible. For Linux, that always holds for the code below, so it's
// safe. For MacOS, that technically holds at the time of writing, but there are no future
// guarantees.
// The arguments of unsafe `libc` calls are valid, the payload validity is covered with
// a test.
ALLOC.start_tracking(
limit,
Some(Box::new(move || {
#[cfg(target_os = "linux")]
{
// Syscalls never allocate or deallocate, so this is safe.
libc::syscall(libc::SYS_write, fd, OOM_PAYLOAD.as_ptr(), OOM_PAYLOAD.len());
eskimor marked this conversation as resolved.
Show resolved Hide resolved
libc::syscall(libc::SYS_close, fd);
libc::syscall(libc::SYS_exit, 1);
}
#[cfg(not(target_os = "linux"))]
{
// Syscalls are not available on MacOS, so we have to use `libc` wrappers.
// Technicaly, there may be allocations inside, although they shouldn't be
// there. In that case, we'll see deadlocks on MacOS after the OOM condition
// triggered. As we consider running a validator on MacOS unsafe, and this
// code is only run by a validator, it's a lesser evil.
libc::write(fd, OOM_PAYLOAD.as_ptr().cast(), OOM_PAYLOAD.len());
libc::close(fd);
std::process::exit(1);
}
})),
);
}
}

fn end_memory_tracking() -> isize {
ALLOC.end_tracking()
}

/// The entrypoint that the spawned prepare worker should start with.
///
/// # Parameters
Expand Down Expand Up @@ -168,13 +217,29 @@ pub fn worker_entrypoint(
Arc::clone(&condvar),
WaitOutcome::TimedOut,
)?;

start_memory_tracking(
stream.as_raw_fd(),
executor_params.prechecking_max_memory().map(|v| {
v.try_into().unwrap_or_else(|_| {
gum::warn!(
LOG_TARGET,
%worker_pid,
"Illegal pre-checking max memory value {} discarded",
v,
);
0
})
}),
);

// Spawn another thread for preparation.
let prepare_thread = thread::spawn_worker_thread(
"prepare thread",
move || {
// Try to enable landlock.
#[cfg(target_os = "linux")]
let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread()
let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread()
.map(LandlockStatus::from_ruleset_status)
.map_err(|e| e.to_string());
#[cfg(not(target_os = "linux"))]
Expand Down Expand Up @@ -208,6 +273,17 @@ pub fn worker_entrypoint(

let outcome = thread::wait_for_threads(condvar);

let peak_alloc = {
let peak = end_memory_tracking();
gum::debug!(
target: LOG_TARGET,
%worker_pid,
"prepare job peak allocation is {} bytes",
peak,
);
peak
};

let result = match outcome {
WaitOutcome::Finished => {
let _ = cpu_time_monitor_tx.send(());
Expand Down Expand Up @@ -240,6 +316,14 @@ pub fn worker_entrypoint(
memory_tracker_stats,
#[cfg(target_os = "linux")]
max_rss: extract_max_rss_stat(max_rss, worker_pid),
// Negative peak allocation values are legit; they are narrow
// corner cases and shouldn't affect overall statistics
// significantly
peak_tracked_alloc: if peak_alloc > 0 {
peak_alloc as u64
} else {
0u64
},
};

// Log if landlock threw an error.
Expand Down
Loading