paritytech · AndreiEres · Jan 16, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/polkadot/node/subsystem-bench/Cargo.toml b/polkadot/node/subsystem-bench/Cargo.toml
@@ -59,5 +59,8 @@ orchestra = { version = "0.3.3", default-features = false, features = ["futures_
 pyroscope = "0.5.7"
 pyroscope_pprofrs = "0.2.7"
 
+[target.'cfg(target_os = "linux")'.dependencies]
+crabgrind = "0.1.10"
+
 [features]
 default = []
diff --git a/polkadot/node/subsystem-bench/README.md b/polkadot/node/subsystem-bench/README.md
@@ -117,23 +117,24 @@ used to run a suite of tests defined in a `yaml` file like in this [example](exa
 
 ```
 Options:
-      --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible values:
-                                                       ideal, healthy, degraded]
-      --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
-      --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
-      --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
-      --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
-  -n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
-  -p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
-  -b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
-      --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
-      --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
-      --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
-      --profile                                        Enable CPU Profiling with Pyroscope
-      --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
-      --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
-  -h, --help                                           Print help
-  -V, --version                                        Print version
+    --network <NETWORK>                              The type of network to be emulated [default: ideal] [possible
+                                                     values: ideal, healthy, degraded]
+    --n-cores <N_CORES>                              Number of cores to fetch availability for [default: 100]
+    --n-validators <N_VALIDATORS>                    Number of validators to fetch chunks from [default: 500]
+    --min-pov-size <MIN_POV_SIZE>                    The minimum pov size in KiB [default: 5120]
+    --max-pov-size <MAX_POV_SIZE>                    The maximum pov size bytes [default: 5120]
+-n, --num-blocks <NUM_BLOCKS>                        The number of blocks the test is going to run [default: 1]
+-p, --peer-bandwidth <PEER_BANDWIDTH>                The bandwidth of simulated remote peers in KiB
+-b, --bandwidth <BANDWIDTH>                          The bandwidth of our simulated node in KiB
+    --peer-error <PEER_ERROR>                        Simulated conection error ratio [0-100]
+    --peer-min-latency <PEER_MIN_LATENCY>            Minimum remote peer latency in milliseconds [0-5000]
+    --peer-max-latency <PEER_MAX_LATENCY>            Maximum remote peer latency in milliseconds [0-5000]
+    --profile                                        Enable CPU Profiling with Pyroscope
+    --pyroscope-url <PYROSCOPE_URL>                  Pyroscope Server URL [default: http://localhost:4040]
+    --pyroscope-sample-rate <PYROSCOPE_SAMPLE_RATE>  Pyroscope Sample Rate [default: 113]
+    --cache-misses                                   Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind
+                                                     must be in the PATH
+-h, --help                                           Print help
 ```
 
 These apply to all test objectives, except `test-sequence` which relies on the values being specified in a file.
@@ -221,6 +222,83 @@ view the test progress in real time by accessing [this link](http://localhost:30
 Now run
 `target/testnet/subsystem-bench test-sequence --path polkadot/node/subsystem-bench/examples/availability_read.yaml`
 and view the metrics in real time and spot differences between different `n_validators` values.
+
+### Profiling cache misses
+
+Cache misses are profiled using Cachegrind, part of Valgrind. Cachegrind runs slowly, and its cache simulation is basic
+and unlikely to reflect the behavior of a modern machine. However, it still represents the general situation with cache
+usage, and more importantly it doesn't require a bare-metal machine to run on, which means it could be run in CI or in
+a remote virtual installation.
+
+To profile cache misses use the `--cache-misses` flag. Since the execution will be very slow, it's recommended not to
+run it together with other profiling and not to take benchmark results into account.
+
+Example run results:
+```
+$ target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
+==201761== Callgrind, a call-graph generating cache profiler
+==201761== Copyright (C) 2002-2017, and GNU GPL'd, by Josef Weidendorfer et al.
+==201761== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info
+==201761== Command: target/testnet/subsystem-bench --n-cores 10 --cache-misses data-availability-read
+==201761==
+--201761-- warning: L3 cache found, using its data for the LL simulation.
+--201761-- warning: specified LL cache: line_size 64  assoc 20  total_size 57,671,680
+--201761-- warning: simulated LL cache: line_size 64  assoc 28  total_size 58,720,256
+==201761== For interactive control, run 'callgrind_control -h'.
+[2024-01-10T08:00:32Z INFO  subsystem_bench::core::display] n_validators = 500, n_cores = 10, pov_size = 5120 - 5120, error = 0, latency = N
+one
+[2024-01-10T08:00:32Z INFO  subsystem-bench::availability] Generating template candidate index=0 pov_size=5242880
+[2024-01-10T08:01:21Z INFO  subsystem-bench::availability] Created test environment.
+[2024-01-10T08:01:21Z INFO  subsystem-bench::availability] Pre-generating 10 candidates.
+[2024-01-10T08:07:18Z INFO  subsystem-bench::core] Initializing emulation for a 500 peer network.
+[2024-01-10T08:07:18Z INFO  subsystem-bench::core] connectivity 100%, error 0%
+[2024-01-10T08:07:19Z INFO  subsystem-bench::core] Network created, connected validator count 500
+[2024-01-10T08:07:19Z INFO  subsystem-bench::availability] Current block 1/1
+[2024-01-10T08:07:19Z INFO  substrate_prometheus_endpoint] 〽️ Prometheus exporter started at 127.0.0.1:9999
+[2024-01-10T08:07:20Z INFO  subsystem_bench::availability] 10 recoveries pending
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] All work for block completed in 1462465ms
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] All blocks processed in 1462533ms
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] Throughput: 51200 KiB/block
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability] Block time: 1462538 ms
+[2024-01-10T08:31:42Z INFO  subsystem_bench::availability]
+
+    Total received from network: 200 MiB
+    Total sent to network: 762 KiB
+    Total subsystem CPU usage 2916.57s
+    CPU usage per block 2916.57s
+    Total test environment CPU usage 6.17s
+    CPU usage per block 6.17s
+
+==201761==
+==201761== Events    : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
+==201761== Collected : 95411656238 10227094854 8380466979 9032639 161537270 173821878 42409 6413757 10014923
+==201761==
+==201761== I   refs:      95,411,656,238
+==201761== I1  misses:         9,032,639
+==201761== LLi misses:            42,409
+==201761== I1  miss rate:           0.01%
+==201761== LLi miss rate:           0.00%
+==201761==
+==201761== D   refs:      18,607,561,833  (10,227,094,854 rd + 8,380,466,979 wr)
+==201761== D1  misses:       335,359,148  (   161,537,270 rd +   173,821,878 wr)
+==201761== LLd misses:        16,428,680  (     6,413,757 rd +    10,014,923 wr)
+==201761== D1  miss rate:            1.8% (           1.6%   +           2.1%  )
+==201761== LLd miss rate:            0.1% (           0.1%   +           0.1%  )
+==201761==
+==201761== LL refs:          344,391,787  (   170,569,909 rd +   173,821,878 wr)
+==201761== LL misses:         16,471,089  (     6,456,166 rd +    10,014,923 wr)
+==201761== LL miss rate:             0.0% (           0.0%   +           0.1%  )
+```
+
+The results show that 1.8% of L1 data cache accesses missed, but the last-level cache only missed 0.1% of the time.
+The L1 instruction cache missed 0.01% of the time, and almost nothing was missed at the last level.
+
+Cachegrind writes line-by-line cache profiling information to a file named `cachegrind.out.<pid>`.
+This file is best interpreted with `cg_annotate --auto=yes cachegrind.out.<pid>`. For more information see the
+[cachegrind manual](https://www.cs.cmu.edu/afs/cs.cmu.edu/project/cmt-40/Nice/RuleRefinement/bin/valgrind-3.2.0/docs/html/cg-manual.html).
+
+For finer-grained profiling of cache misses, it is better to use `perf` on a bare-metal machine.
+
 ## Create new test objectives
 
 This tool is intended to make it easy to write new test objectives that focus individual subsystems,

diff --git a/polkadot/node/subsystem-bench/src/subsystem-bench.rs b/polkadot/node/subsystem-bench/src/subsystem-bench.rs
@@ -16,6 +16,7 @@
 
 //! A tool for running subsystem benchmark tests designed for development and
 //! CI regression testing.
+
 use clap::Parser;
 use color_eyre::eyre;
 use pyroscope::PyroscopeAgent;
@@ -90,12 +91,21 @@ struct BenchCli {
 	/// Pyroscope Sample Rate
 	pub pyroscope_sample_rate: u32,
 
+	#[clap(long, default_value_t = false)]
+	/// Enable Cache Misses Profiling with Valgrind. Linux only, Valgrind must be in the PATH
+	pub cache_misses: bool,
+
 	#[command(subcommand)]
 	pub objective: cli::TestObjective,
 }
 
 impl BenchCli {
 	fn launch(self) -> eyre::Result<()> {
+		let is_valgrind = is_valgrind_mode();
+		if !is_valgrind && self.cache_misses {
+			return valgrind_init()
+		}
+
 		let agent_running = if self.profile {
 			let agent = PyroscopeAgent::builder(self.pyroscope_url.as_str(), "subsystem-bench")
 				.backend(pprof_backend(PprofConfig::new().sample_rate(self.pyroscope_sample_rate)))
@@ -185,10 +195,18 @@ impl BenchCli {
 
 		let mut state = TestState::new(&test_config);
 		let (mut env, _protocol_config) = prepare_test(test_config, &mut state);
-		// test_config.write_to_disk();
+
+		if is_valgrind {
+			valgrind_start();
+		}
+
 		env.runtime()
 			.block_on(availability::benchmark_availability_read(&mut env, state));
 
+		if is_valgrind {
+			valgrind_stop();
+		}
+
 		if let Some(agent_running) = agent_running {
 			let agent_ready = agent_running.stop()?;
 			agent_ready.shutdown();
@@ -198,6 +216,52 @@ impl BenchCli {
 	}
 }
 
+/// Tells whether this process is running under Valgrind (any non-native run mode).
+/// Always `false` on non-Linux targets, where the `crabgrind` check is not compiled in.
+fn is_valgrind_mode() -> bool {
+	#[cfg(target_os = "linux")]
+	let under_valgrind = !matches!(crabgrind::run_mode(), crabgrind::RunMode::Native);
+	#[cfg(not(target_os = "linux"))]
+	let under_valgrind = false;
+	under_valgrind
+}
+
+/// Start collecting cache misses data.
+///
+/// Asks Cachegrind to start instrumentation; compiles to a no-op on non-Linux targets.
+fn valgrind_start() {
+	// The client request below is only compiled for Linux targets.
+	#[cfg(target_os = "linux")]
+	crabgrind::cachegrind::start_instrumentation();
+}
+
+/// Stop collecting cache misses data.
+///
+/// Asks Cachegrind to stop instrumentation; compiles to a no-op on non-Linux targets.
+fn valgrind_stop() {
+	// The client request below is only compiled for Linux targets.
+	#[cfg(target_os = "linux")]
+	crabgrind::cachegrind::stop_instrumentation();
+}
+
+/// Re-executes the current command line under Valgrind's Cachegrind tool.
+/// `exec` replaces this process on success, so this only returns (with an error) if the
+/// launch failed, e.g. because `valgrind` is not in the PATH.
+#[cfg(target_os = "linux")]
+fn valgrind_init() -> eyre::Result<()> {
+	use std::os::unix::process::CommandExt;
+	let err = std::process::Command::new("valgrind")
+		.args(["--tool=cachegrind", "--cache-sim=yes", "--instr-at-start=no"])
+		.args(std::env::args_os())
+		.exec();
+	Err(eyre::eyre!("Failed to launch valgrind, is it in the PATH? {}", err))
+}
+
+#[cfg(not(target_os = "linux"))]
+fn valgrind_init() -> eyre::Result<()> {
+	Err(eyre::eyre!("Valgrind can be executed only on linux"))
+}
+
 fn main() -> eyre::Result<()> {
 	color_eyre::install()?;
 	env_logger::builder()