diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md index b1476db27548..1ff5b129e1e4 100644 --- a/polkadot/node/subsystem-bench/README.md +++ b/polkadot/node/subsystem-bench/README.md @@ -117,23 +117,24 @@ used to run a suite of tests defined in a `yaml` file like in this [example](exa ``` Options: - --network The type of network to be emulated [default: ideal] [possible values: - ideal, healthy, degraded] - --n-cores Number of cores to fetch availability for [default: 100] - --n-validators Number of validators to fetch chunks from [default: 500] - --min-pov-size The minimum pov size in KiB [default: 5120] - --max-pov-size The maximum pov size bytes [default: 5120] - -n, --num-blocks The number of blocks the test is going to run [default: 1] - -p, --peer-bandwidth The bandwidth of simulated remote peers in KiB - -b, --bandwidth The bandwidth of our simulated node in KiB - --peer-error Simulated conection error ratio [0-100] - --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] - --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] - --profile Enable CPU Profiling with Pyroscope - --pyroscope-url Pyroscope Server URL [default: http://localhost:4040] - --pyroscope-sample-rate Pyroscope Sample Rate [default: 113] - -h, --help Print help - -V, --version Print version + --network The type of network to be emulated [default: ideal] [possible + values: ideal, healthy, degraded] + --n-cores Number of cores to fetch availability for [default: 100] + --n-validators Number of validators to fetch chunks from [default: 500] + --min-pov-size The minimum pov size in KiB [default: 5120] + --max-pov-size The maximum pov size bytes [default: 5120] +-n, --num-blocks The number of blocks the test is going to run [default: 1] +-p, --peer-bandwidth The bandwidth of simulated remote peers in KiB +-b, --bandwidth The bandwidth of our simulated node in KiB + --peer-error Simulated conection error ratio [0-100] + --peer-min-latency Minimum remote peer latency in milliseconds [0-5000] + --peer-max-latency Maximum remote peer latency in milliseconds [0-5000] + --profile Enable CPU Profiling with Pyroscope + --pyroscope-url Pyroscope Server URL [default: http://localhost:4040] + --pyroscope-sample-rate Pyroscope Sample Rate [default: 113] + --cache-misses Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind + must be in the PATH +-h, --help Print help ``` These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file. @@ -221,6 +222,48 @@ view the test progress in real time by accessing [this link](http://localhost:30 Now run `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml` and view the metrics in real time and spot differences between different `n_validators` values. + +### Profiling cache misses + +Cache misses are profiled using Cachegrind, part of Valgrind. Cachegrind runs slowly, and its cache simulation is basic +and unlikely to reflect the behavior of a modern machine. However, it still represents the general situation with cache +usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in +a remote virtual installation. + +To profile cache misses use the `--cache-misses` flag. Cache simulation of current runs tuned for Intel Ice Lake CPU. +Since the execution will be very slow, it's recommended not to run it together with other profiling and not to take +benchmark results into account. A report is saved in a file `cachegrind_report.txt`. + +Example run results: +``` +$ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read +$ cat cachegrind_report.txt +I refs: 64,622,081,485 +I1 misses: 3,018,168 +LLi misses: 437,654 +I1 miss rate: 0.00% +LLi miss rate: 0.00% + +D refs: 12,161,833,115 (9,868,356,364 rd + 2,293,476,751 wr) +D1 misses: 167,940,701 ( 71,060,073 rd + 96,880,628 wr) +LLd misses: 33,550,018 ( 16,685,853 rd + 16,864,165 wr) +D1 miss rate: 1.4% ( 0.7% + 4.2% ) +LLd miss rate: 0.3% ( 0.2% + 0.7% ) + +LL refs: 170,958,869 ( 74,078,241 rd + 96,880,628 wr) +LL misses: 33,987,672 ( 17,123,507 rd + 16,864,165 wr) +LL miss rate: 0.0% ( 0.0% + 0.7% ) +``` + +The results show that 1.4% of the L1 data cache missed, but the last level cache only missed 0.3% of the time. +Instruction data of the L1 has 0.00%. + +Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.`. +This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.`. For more information see the +[cachegrind manual](https://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/Nice/RuleRefinement/bin/valgrind-3.2.0/docs/html/cg-manual.html). + +For finer profiling of cache misses, better use `perf` on a bare-metal machine. + ## Create new test objectives This tool is intended to make it easy to write new test objectives that focus individual subsystems, diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs index 29b62b27855a..8669ee4e8b1d 100644 --- a/polkadot/node/subsystem-bench/src/subsystem-bench.rs +++ b/polkadot/node/subsystem-bench/src/subsystem-bench.rs @@ -16,6 +16,7 @@ //! A tool for running subsystem benchmark tests designed for development and //! CI regression testing. + use clap::Parser; use color_eyre::eyre; use pyroscope::PyroscopeAgent; @@ -27,6 +28,7 @@ use std::{path::Path, time::Duration}; pub(crate) mod availability; pub(crate) mod cli; pub(crate) mod core; +mod valgrind; use availability::{prepare_test, NetworkEmulation, TestState}; use cli::TestObjective; @@ -90,12 +92,21 @@ struct BenchCli { /// Pyroscope Sample Rate pub pyroscope_sample_rate: u32, + #[clap(long, default_value_t = false)] + /// Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind must be in the PATH + pub cache_misses: bool, + #[command(subcommand)] pub objective: cli::TestObjective, } impl BenchCli { fn launch(self) -> eyre::Result<()> { + let is_valgrind_running = valgrind::is_valgrind_running(); + if !is_valgrind_running && self.cache_misses { + return valgrind::relaunch_in_valgrind_mode() + } + let agent_running = if self.profile { let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench") .backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate))) @@ -185,7 +196,7 @@ impl BenchCli { let mut state = TestState::new(&test_config); let (mut env, _protocol_config) = prepare_test(test_config, &mut state); - // test_config.write_to_disk(); + env.runtime() .block_on(availability::benchmark_availability_read(&mut env, state)); diff --git a/polkadot/node/subsystem-bench/src/valgrind.rs b/polkadot/node/subsystem-bench/src/valgrind.rs new file mode 100644 index 000000000000..3d0c488355b9 --- /dev/null +++ b/polkadot/node/subsystem-bench/src/valgrind.rs @@ -0,0 +1,49 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use color_eyre::eyre; + +/// Show if the app is running under Valgrind +pub(crate) fn is_valgrind_running() -> bool { + match std::env::var("LD_PRELOAD") { + Ok(v) => v.contains("valgrind"), + Err(_) => false, + } +} + +/// Stop execution and relaunch the app under valgrind +/// Cache configuration used to emulate Intel Ice Lake (size, associativity, line size): +/// L1 instruction: 32,768 B, 8-way, 64 B lines +/// L1 data: 49,152 B, 12-way, 64 B lines +/// Last-level: 2,097,152 B, 16-way, 64 B lines +pub(crate) fn relaunch_in_valgrind_mode() -> eyre::Result<()> { + use std::os::unix::process::CommandExt; + let err = std::process::Command::new("valgrind") + .arg("--tool=cachegrind") + .arg("--cache-sim=yes") + .arg("--log-file=cachegrind_report.txt") + .arg("--I1=32768,8,64") + .arg("--D1=49152,12,64") + .arg("--LL=2097152,16,64") + .arg("--verbose") + .args(std::env::args()) + .exec(); + + Err(eyre::eyre!( + "Сannot run Valgrind, check that it is installed and available in the PATH\n{}", + err + )) +}