From fa618bc9c2f5b0209d1d0e62de9f44b9f33ba6c7 Mon Sep 17 00:00:00 2001 From: Balaji Arun Date: Thu, 1 Jun 2023 15:59:50 -0700 Subject: [PATCH] [forge] Mainnet Like Network Simulation (#7533) This PR adds the ability to simulate as close to a real network as possible in Forge. It adds a new NetEm config that can support emulating delay, loss, corruption, bandwidth between any two targets within the same test, overcoming existing drawbacks where multiple chaos simulations couldn't be stacked. This provides the MultiRegionNetworkEmulationTest which enables simulating a multi-region network and optionally add chaos between nodes in the same region group. Also, CpuChaosTest enables stressing the CPU to mimic varying CPU cores between validators. --- testsuite/forge-cli/src/main.rs | 21 +- testsuite/forge/src/backend/k8s/chaos.rs | 100 +++++- .../src/backend/k8s/chaos/cpu_stress.yaml | 16 + .../forge/src/backend/k8s/chaos/netem.yaml | 32 ++ testsuite/forge/src/interface/chaos.rs | 45 +++ testsuite/testcases/src/lib.rs | 2 +- testsuite/testcases/src/modifiers.rs | 84 ++++- .../src/multi_region_network_test.rs | 314 ++++++++++++++++++ .../src/multi_region_simulation_test.rs | 186 ----------- 9 files changed, 601 insertions(+), 199 deletions(-) create mode 100644 testsuite/forge/src/backend/k8s/chaos/cpu_stress.yaml create mode 100644 testsuite/forge/src/backend/k8s/chaos/netem.yaml create mode 100644 testsuite/testcases/src/multi_region_network_test.rs delete mode 100644 testsuite/testcases/src/multi_region_simulation_test.rs diff --git a/testsuite/forge-cli/src/main.rs b/testsuite/forge-cli/src/main.rs index 76dd744fd68852..3e5196ea748965 100644 --- a/testsuite/forge-cli/src/main.rs +++ b/testsuite/forge-cli/src/main.rs @@ -21,8 +21,8 @@ use aptos_testcases::{ fullnode_reboot_stress_test::FullNodeRebootStressTest, generate_traffic, load_vs_perf_benchmark::{LoadVsPerfBenchmark, TransactionWorkload, Workloads}, - modifiers::{ExecutionDelayConfig, ExecutionDelayTest}, - multi_region_simulation_test::MultiRegionMultiCloudSimulationTest, + modifiers::{CpuChaosTest, ExecutionDelayConfig, ExecutionDelayTest}, + multi_region_network_test::MultiRegionNetworkEmulationTest, network_bandwidth_test::NetworkBandwidthTest, network_loss_test::NetworkLossTest, network_partition_test::NetworkPartitionTest, @@ -537,9 +537,7 @@ fn single_test_suite(test_name: &str) -> Result> { run_consensus_only_three_region_simulation(config) }, "quorum_store_reconfig_enable_test" => quorum_store_reconfig_enable_test(config), - "multi_region_multi_cloud_simulation_test" => { - multi_region_multi_cloud_simulation_test(config) - }, + "mainnet_like_simulation_test" => mainnet_like_simulation_test(config), "multiregion_benchmark_test" => multiregion_benchmark_test(config), _ => return Err(format_err!("Invalid --suite given: {:?}", test_name)), }; @@ -1583,9 +1581,9 @@ fn quorum_store_reconfig_enable_test(forge_config: ForgeConfig<'static>) -> Forg ) } -fn multi_region_multi_cloud_simulation_test(config: ForgeConfig<'static>) -> ForgeConfig<'static> { +fn mainnet_like_simulation_test(config: ForgeConfig<'static>) -> ForgeConfig<'static> { config - .with_initial_validator_count(NonZeroUsize::new(100).unwrap()) + .with_initial_validator_count(NonZeroUsize::new(20).unwrap()) .with_emit_job( EmitJobRequest::default() .mode(EmitJobMode::MaxLoad { @@ -1593,7 +1591,14 @@ fn multi_region_multi_cloud_simulation_test(config: ForgeConfig<'static>) -> For }) .txn_expiration_time_secs(5 * 60), ) - .with_network_tests(vec![&MultiRegionMultiCloudSimulationTest {}]) + .with_network_tests(vec![&CompositeNetworkTest { + wrapper: &MultiRegionNetworkEmulationTest { + override_config: None, + }, + test: &CpuChaosTest { + override_config: None, + }, + }]) .with_genesis_helm_config_fn(Arc::new(|helm_values| { // no epoch change. helm_values["chain"]["epoch_duration_secs"] = (24 * 3600).into(); diff --git a/testsuite/forge/src/backend/k8s/chaos.rs b/testsuite/forge/src/backend/k8s/chaos.rs index b07b2a3a6a08d1..96723e013402fe 100644 --- a/testsuite/forge/src/backend/k8s/chaos.rs +++ b/testsuite/forge/src/backend/k8s/chaos.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use crate::{ - dump_string_to_file, K8sSwarm, Result, Swarm, SwarmChaos, SwarmNetworkBandwidth, - SwarmNetworkDelay, SwarmNetworkLoss, SwarmNetworkPartition, KUBECTL_BIN, + dump_string_to_file, K8sSwarm, Result, Swarm, SwarmChaos, SwarmCpuStress, SwarmNetEm, + SwarmNetworkBandwidth, SwarmNetworkDelay, SwarmNetworkLoss, SwarmNetworkPartition, KUBECTL_BIN, }; use anyhow::bail; use aptos_logger::info; @@ -32,6 +32,18 @@ macro_rules! NETWORK_LOSS_CHAOS_TEMPLATE { }; } +macro_rules! NETEM_CHAOS_TEMPLATE { + () => { + "chaos/netem.yaml" + }; +} + +macro_rules! CPU_STRESS_CHAOS_TEMPLATE { + () => { + "chaos/cpu_stress.yaml" + }; +} + impl K8sSwarm { /// Injects the SwarmChaos into the specified namespace pub fn inject_swarm_chaos(&self, chaos: &SwarmChaos) -> Result<()> { @@ -166,12 +178,96 @@ impl K8sSwarm { )) } + fn create_netem_template(&self, swarm_netem: &SwarmNetEm) -> Result { + let mut network_chaos_specs = vec![]; + + for group_netem in &swarm_netem.group_netems { + let source_instance_labels = group_netem + .source_nodes + .iter() + .map(|node| { + if let Some(v) = self.validator(*node) { + v.name() + } else { + "invalid-node" + } + }) + .collect::>() + .join(","); + + let target_instance_labels = group_netem + .target_nodes + .iter() + .map(|node| { + if let Some(v) = self.validator(*node) { + v.name() + } else { + "invalid-node" + } + }) + .collect::>() + .join(","); + + network_chaos_specs.push(format!( + include_str!(NETEM_CHAOS_TEMPLATE!()), + name = &group_netem.name, + namespace = self.kube_namespace, + delay_latency_ms = group_netem.delay_latency_ms, + delay_jitter_ms = group_netem.delay_jitter_ms, + delay_correlation_percentage = group_netem.delay_correlation_percentage, + loss_percentage = group_netem.loss_percentage, + loss_correlation_percentage = group_netem.loss_correlation_percentage, + instance_labels = &source_instance_labels, + target_instance_labels = &target_instance_labels, + rate = group_netem.rate_in_mbps, + )); + } + + Ok(network_chaos_specs.join("\n---\n")) + } + + /// Creates the CPU stress template, which can be used to inject CPU stress into a pod. + /// This can be used to simulate nodes with different available CPU resource even though the + /// nodes have identical hardware. For example, a node with 4 cores can be simulated as a node + /// with 2 cores by setting num_workers to 2. + fn create_cpu_stress_template(&self, swarm_cpu_stress: &SwarmCpuStress) -> Result { + let mut cpu_stress_specs = vec![]; + + for group_cpu_stress in &swarm_cpu_stress.group_cpu_stresses { + let instance_labels = group_cpu_stress + .target_nodes + .iter() + .map(|node| { + if let Some(v) = self.validator(*node) { + v.name() + } else { + "invalid-node" + } + }) + .collect::>() + .join(","); + + cpu_stress_specs.push(format!( + include_str!(CPU_STRESS_CHAOS_TEMPLATE!()), + name = &group_cpu_stress.name, + namespace = self.kube_namespace, + num_workers = group_cpu_stress.num_workers, + load_per_worker = group_cpu_stress.load_per_worker, + instance_labels = &instance_labels, + )); + } + + Ok(cpu_stress_specs.join("\n---\n")) + } + fn create_chaos_template(&self, chaos: &SwarmChaos) -> Result { match chaos { SwarmChaos::Delay(c) => self.create_network_delay_template(c), SwarmChaos::Partition(c) => self.create_network_partition_template(c), SwarmChaos::Bandwidth(c) => self.create_network_bandwidth_template(c), SwarmChaos::Loss(c) => self.create_network_loss_template(c), + SwarmChaos::NetEm(c) => self.create_netem_template(c), + SwarmChaos::CpuStress(c) => self.create_cpu_stress_template(c), } } diff --git a/testsuite/forge/src/backend/k8s/chaos/cpu_stress.yaml b/testsuite/forge/src/backend/k8s/chaos/cpu_stress.yaml new file mode 100644 index 00000000000000..64d771ce5a0eee --- /dev/null +++ b/testsuite/forge/src/backend/k8s/chaos/cpu_stress.yaml @@ -0,0 +1,16 @@ +apiVersion: chaos-mesh.org/v1alpha1 +kind: StressChaos +metadata: + namespace: {namespace} + name: {name} +spec: + mode: all + selector: + namespaces: + - {namespace} + expressionSelectors: + - {{ key: app.kubernetes.io/instance, operator: In, values: [{instance_labels}] }} + stressors: + cpu: + workers: {num_workers} + load: {load_per_worker} \ No newline at end of file diff --git a/testsuite/forge/src/backend/k8s/chaos/netem.yaml b/testsuite/forge/src/backend/k8s/chaos/netem.yaml new file mode 100644 index 00000000000000..1957df33f898e5 --- /dev/null +++ b/testsuite/forge/src/backend/k8s/chaos/netem.yaml @@ -0,0 +1,32 @@ +apiVersion: chaos-mesh.org/v1alpha1 +kind: NetworkChaos +metadata: + namespace: {namespace} + name: {name} +spec: + action: netem + mode: all + selector: + namespaces: + - {namespace} + expressionSelectors: + - {{ key: app.kubernetes.io/instance, operator: In, values: [{instance_labels}] }} + delay: + latency: "{delay_latency_ms}ms" + correlation: "{delay_correlation_percentage}" + jitter: "{delay_jitter_ms}ms" + loss: + loss: "{loss_percentage}" + correlation: "{loss_correlation_percentage}" + bandwidth: + rate: "{rate}mbps" + limit: 20971520 # placeholder value. not supported by tc netem + buffer: 10000 # placeholder value. not supported by tc netem + direction: both + target: + selector: + namespaces: + - {namespace} + expressionSelectors: + - {{ key: app.kubernetes.io/instance, operator: In, values: [{target_instance_labels}] }} + mode: all diff --git a/testsuite/forge/src/interface/chaos.rs b/testsuite/forge/src/interface/chaos.rs index 11e3f261600929..cb2375f6431821 100644 --- a/testsuite/forge/src/interface/chaos.rs +++ b/testsuite/forge/src/interface/chaos.rs @@ -10,6 +10,8 @@ pub enum SwarmChaos { Partition(SwarmNetworkPartition), Bandwidth(SwarmNetworkBandwidth), Loss(SwarmNetworkLoss), + NetEm(SwarmNetEm), + CpuStress(SwarmCpuStress), } #[derive(Eq, Hash, PartialEq, Debug, Clone)] @@ -79,3 +81,46 @@ impl Display for SwarmNetworkLoss { ) } } + +#[derive(Eq, Hash, PartialEq, Debug, Clone)] +pub struct SwarmNetEm { + pub group_netems: Vec, +} + +impl Display for SwarmNetEm { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "NetEm nodes {:?}", self.group_netems) + } +} + +#[derive(Eq, Hash, PartialEq, Debug, Clone)] +pub struct GroupNetEm { + pub name: String, + pub source_nodes: Vec, + pub target_nodes: Vec, + pub delay_latency_ms: u64, + pub delay_jitter_ms: u64, + pub delay_correlation_percentage: u64, + pub loss_percentage: u64, + pub loss_correlation_percentage: u64, + pub rate_in_mbps: u64, +} + +#[derive(Eq, Hash, PartialEq, Debug, Clone)] +pub struct SwarmCpuStress { + pub group_cpu_stresses: Vec, +} + +impl Display for SwarmCpuStress { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "CpuStress nodes {:?}", self.group_cpu_stresses) + } +} + +#[derive(Eq, Hash, PartialEq, Debug, Clone)] +pub struct GroupCpuStress { + pub name: String, + pub target_nodes: Vec, + pub num_workers: u64, + pub load_per_worker: u64, +} diff --git a/testsuite/testcases/src/lib.rs b/testsuite/testcases/src/lib.rs index 48cb18022b1502..9c5a6209186ac9 100644 --- a/testsuite/testcases/src/lib.rs +++ b/testsuite/testcases/src/lib.rs @@ -9,7 +9,7 @@ pub mod framework_upgrade; pub mod fullnode_reboot_stress_test; pub mod load_vs_perf_benchmark; pub mod modifiers; -pub mod multi_region_simulation_test; +pub mod multi_region_network_test; pub mod network_bandwidth_test; pub mod network_loss_test; pub mod network_partition_test; diff --git a/testsuite/testcases/src/modifiers.rs b/testsuite/testcases/src/modifiers.rs index 9f979bed693487..33258ca15bdba3 100644 --- a/testsuite/testcases/src/modifiers.rs +++ b/testsuite/testcases/src/modifiers.rs @@ -1,9 +1,12 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use crate::{LoadDestination, NetworkLoadTest}; -use aptos_forge::{NetworkContext, NetworkTest, Swarm, SwarmExt, Test}; +use crate::{multi_region_network_test::chunk_validators, LoadDestination, NetworkLoadTest}; +use aptos_forge::{ + GroupCpuStress, NetworkContext, NetworkTest, Swarm, SwarmChaos, SwarmCpuStress, SwarmExt, Test, +}; use aptos_logger::info; +use aptos_types::PeerId; use rand::Rng; use tokio::runtime::Runtime; @@ -194,3 +197,80 @@ impl Test for NetworkUnreliabilityTest { "NetworkUnreliabilityWrapper" } } + +#[derive(Clone)] +pub struct CpuChaosConfig { + pub num_groups: usize, + pub load_per_worker: u64, +} + +impl Default for CpuChaosConfig { + fn default() -> Self { + Self { + num_groups: 4, + load_per_worker: 100, + } + } +} + +pub struct CpuChaosTest { + pub override_config: Option, +} + +impl Test for CpuChaosTest { + fn name(&self) -> &'static str { + "CpuChaosWrapper" + } +} + +fn create_cpu_stress_template( + all_validators: Vec, + config: &CpuChaosConfig, +) -> SwarmCpuStress { + let validator_chunks = chunk_validators(all_validators, config.num_groups); + + let group_cpu_stresses = validator_chunks + .into_iter() + .enumerate() + .map(|(idx, chunk)| GroupCpuStress { + name: format!("group-{}-cpu-stress", idx), + target_nodes: chunk, + num_workers: (config.num_groups - idx) as u64, + load_per_worker: config.load_per_worker, + }) + .collect(); + SwarmCpuStress { group_cpu_stresses } +} + +impl NetworkLoadTest for CpuChaosTest { + fn setup(&self, ctx: &mut NetworkContext) -> anyhow::Result { + let all_validators = ctx + .swarm() + .validators() + .map(|v| v.peer_id()) + .collect::>(); + + let config = self.override_config.as_ref().cloned().unwrap_or_default(); + + let swarm_cpu_stress = create_cpu_stress_template(all_validators, &config); + ctx.swarm() + .inject_chaos(SwarmChaos::CpuStress(swarm_cpu_stress))?; + + Ok(LoadDestination::FullnodesOtherwiseValidators) + } + + fn finish(&self, swarm: &mut dyn Swarm) -> anyhow::Result<()> { + let all_validators = swarm.validators().map(|v| v.peer_id()).collect::>(); + + let config = self.override_config.as_ref().cloned().unwrap_or_default(); + + let swarm_cpu_stress = create_cpu_stress_template(all_validators, &config); + swarm.remove_chaos(SwarmChaos::CpuStress(swarm_cpu_stress)) + } +} + +impl NetworkTest for CpuChaosTest { + fn run<'t>(&self, ctx: &mut NetworkContext<'t>) -> anyhow::Result<()> { + ::run(self, ctx) + } +} diff --git a/testsuite/testcases/src/multi_region_network_test.rs b/testsuite/testcases/src/multi_region_network_test.rs new file mode 100644 index 00000000000000..03868a03dd07a1 --- /dev/null +++ b/testsuite/testcases/src/multi_region_network_test.rs @@ -0,0 +1,314 @@ +// Copyright © Aptos Foundation +// SPDX-License-Identifier: Apache-2.0 + +use crate::{LoadDestination, NetworkLoadTest}; +use aptos_forge::{GroupNetEm, NetworkContext, NetworkTest, Swarm, SwarmChaos, SwarmNetEm, Test}; +use aptos_logger::info; +use aptos_types::PeerId; +use itertools::{self, Itertools}; +use std::collections::BTreeMap; + +/// The link stats are obtained from https://github.com/doitintl/intercloud-throughput/blob/master/results_202202/results.csv +/// The four regions were hand-picked from the dataset to simulate a multi-region setup +/// with high latencies and low bandwidth. +macro_rules! FOUR_REGION_LINK_STATS_CSV { + () => { + "data/four_region_link_stats.csv" + }; +} + +fn get_link_stats_table() -> BTreeMap> { + let mut stats_table = BTreeMap::new(); + + let mut rdr = + csv::Reader::from_reader(include_bytes!(FOUR_REGION_LINK_STATS_CSV!()).as_slice()); + rdr.deserialize() + .for_each(|result: Result<(String, String, u64, f64), _>| { + if let Ok((from, to, bitrate, latency)) = result { + stats_table + .entry(from) + .or_insert_with(BTreeMap::new) + .insert(to, (bitrate, latency)); + } + }); + stats_table +} + +pub(crate) fn chunk_validators(validators: Vec, num_groups: usize) -> Vec> { + let approx_chunk_size = validators.len() / num_groups; + + let chunks = validators.chunks_exact(approx_chunk_size); + + let mut validator_chunks: Vec> = + chunks.clone().map(|chunk| chunk.to_vec()).collect(); + + // Get any remaining validators and add them to the first group + let remaining_validators: Vec = chunks + .remainder() + .iter() + // If `approx_validators_per_region` is 1, then it is possible we will have more regions than desired, so the + // remaining validators will be in the first group. + .chain(chunks.skip(num_groups).flatten()) + .cloned() + .collect(); + if !remaining_validators.is_empty() { + validator_chunks[0].append(remaining_validators.to_vec().as_mut()); + } + + validator_chunks +} + +/// Creates a table of validators grouped by region. The validators divided into N groups, where N is the number of regions +/// provided in the link stats table. Any remaining validators are added to the first group. +fn create_link_stats_table_with_peer_groups( + validators: Vec, + link_stats_table: &LinkStatsTable, +) -> LinkStatsTableWithPeerGroups { + assert!(validators.len() >= link_stats_table.len()); + + let number_of_regions = link_stats_table.len(); + assert!( + number_of_regions >= 2, + "At least 2 regions are required for inter-region network chaos." + ); + assert!( + number_of_regions <= 4, + "ChaosMesh only supports simulating up to 4 regions." + ); + + let validator_chunks = chunk_validators(validators, number_of_regions); + + let validator_groups = validator_chunks + .into_iter() + .zip(link_stats_table.iter()) + .map(|(chunk, (from_region, stats))| (from_region.clone(), chunk, stats.clone())) + .collect(); + + validator_groups +} + +// A map of "source" regions to a map of "destination" region to (bandwidth, latency) +type LinkStatsTable = BTreeMap>; +// A map of "source" regions to a tuple of (list of validators, map of "destination" region to (bandwidth, latency)) +type LinkStatsTableWithPeerGroups = Vec<(String, Vec, BTreeMap)>; + +#[derive(Clone)] +pub struct InterRegionNetEmConfig { + delay_jitter_ms: u64, + delay_correlation_percentage: u64, + loss_percentage: u64, + loss_correlation_percentage: u64, +} + +impl Default for InterRegionNetEmConfig { + fn default() -> Self { + Self { + delay_jitter_ms: 20, + delay_correlation_percentage: 50, + loss_percentage: 3, + loss_correlation_percentage: 50, + } + } +} + +impl InterRegionNetEmConfig { + // Creates GroupNetEm for inter-region network chaos + fn build(&self, validator_groups: &LinkStatsTableWithPeerGroups) -> Vec { + let group_netems: Vec = validator_groups + .iter() + .combinations(2) + .map(|comb| { + let (from_region, from_chunk, stats) = &comb[0]; + let (to_region, to_chunk, _) = &comb[1]; + + let (bandwidth, latency) = stats.get(to_region).unwrap(); + let netem = GroupNetEm { + name: format!("{}-to-{}-netem", from_region, to_region), + source_nodes: from_chunk.to_vec(), + target_nodes: to_chunk.to_vec(), + delay_latency_ms: *latency as u64, + delay_jitter_ms: self.delay_jitter_ms, + delay_correlation_percentage: self.delay_correlation_percentage, + loss_percentage: self.loss_percentage, + loss_correlation_percentage: self.loss_correlation_percentage, + rate_in_mbps: *bandwidth / 1e6 as u64, + }; + info!("inter-region netem {:?}", netem); + + netem + }) + .collect(); + + group_netems + } +} + +#[derive(Clone)] +pub struct IntraRegionNetEmConfig { + bandwidth_rate_mbps: u64, + delay_latency_ms: u64, + delay_jitter_ms: u64, + delay_correlation_percentage: u64, + loss_percentage: u64, + loss_correlation_percentage: u64, +} + +impl Default for IntraRegionNetEmConfig { + fn default() -> Self { + Self { + bandwidth_rate_mbps: 10 * 1000, // 10 Gbps + delay_latency_ms: 50, + delay_jitter_ms: 5, + delay_correlation_percentage: 50, + loss_percentage: 1, + loss_correlation_percentage: 50, + } + } +} + +impl IntraRegionNetEmConfig { + fn build(&self, validator_groups: LinkStatsTableWithPeerGroups) -> Vec { + let group_netems: Vec = validator_groups + .iter() + .map(|(region, chunk, _)| { + let netem = GroupNetEm { + name: format!("{}-self-netem", region), + source_nodes: chunk.to_vec(), + target_nodes: chunk.to_vec(), + delay_latency_ms: self.delay_latency_ms, + delay_jitter_ms: self.delay_jitter_ms, + delay_correlation_percentage: self.delay_correlation_percentage, + loss_percentage: self.loss_percentage, + loss_correlation_percentage: self.loss_correlation_percentage, + rate_in_mbps: self.bandwidth_rate_mbps, + }; + info!("intra-region netem {:?}", netem); + + netem + }) + .collect(); + + group_netems + } +} + +#[derive(Clone)] +pub struct MultiRegionNetworkEmulationConfig { + pub link_stats_table: LinkStatsTable, + pub inter_region_config: InterRegionNetEmConfig, + pub intra_region_config: Option, +} + +impl Default for MultiRegionNetworkEmulationConfig { + fn default() -> Self { + Self { + link_stats_table: get_link_stats_table(), + inter_region_config: InterRegionNetEmConfig::default(), + intra_region_config: Some(IntraRegionNetEmConfig::default()), + } + } +} + +/// A test to emulate network conditions for a multi-region setup. +pub struct MultiRegionNetworkEmulationTest { + pub override_config: Option, +} + +impl Test for MultiRegionNetworkEmulationTest { + fn name(&self) -> &'static str { + "network:multi-region-network-emulation" + } +} + +fn create_multi_region_swarm_network_chaos( + all_validators: Vec, + config: &MultiRegionNetworkEmulationConfig, +) -> SwarmNetEm { + let validator_groups = + create_link_stats_table_with_peer_groups(all_validators, &config.link_stats_table); + + let inter_region_netem = config.inter_region_config.build(&validator_groups); + let intra_region_netem = config + .intra_region_config + .as_ref() + .map(|config| config.build(validator_groups)) + .unwrap_or_default(); + + SwarmNetEm { + group_netems: itertools::concat(vec![intra_region_netem, inter_region_netem]), + } +} + +impl NetworkLoadTest for MultiRegionNetworkEmulationTest { + fn setup(&self, ctx: &mut NetworkContext) -> anyhow::Result { + let all_validators = ctx + .swarm() + .validators() + .map(|v| v.peer_id()) + .collect::>(); + + let config = self.override_config.as_ref().cloned().unwrap_or_default(); + + // inject netem chaos + let chaos = create_multi_region_swarm_network_chaos(all_validators, &config); + ctx.swarm().inject_chaos(SwarmChaos::NetEm(chaos))?; + + Ok(LoadDestination::FullnodesOtherwiseValidators) + } + + fn finish(&self, swarm: &mut dyn Swarm) -> anyhow::Result<()> { + swarm.remove_all_chaos() + } +} + +impl NetworkTest for MultiRegionNetworkEmulationTest { + fn run<'t>(&self, ctx: &mut NetworkContext<'t>) -> anyhow::Result<()> { + ::run(self, ctx) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::vec; + + #[test] + fn test_create_multi_region_swarm_network_chaos() { + aptos_logger::Logger::new().init(); + + let config = MultiRegionNetworkEmulationConfig::default(); + + let all_validators = (0..8).map(|_| PeerId::random()).collect(); + let netem = create_multi_region_swarm_network_chaos(all_validators, &config); + + assert_eq!(netem.group_netems.len(), 10); + + let all_validators: Vec = (0..10).map(|_| PeerId::random()).collect(); + let netem = create_multi_region_swarm_network_chaos(all_validators.clone(), &config); + + assert_eq!(netem.group_netems.len(), 10); + assert_eq!(netem.group_netems[0].source_nodes.len(), 4); + assert_eq!(netem.group_netems[0].target_nodes.len(), 4); + assert_eq!(netem.group_netems[0], GroupNetEm { + name: "aws--ap-northeast-1-self-netem".to_owned(), + rate_in_mbps: 10000, + source_nodes: vec![ + all_validators[0], + all_validators[1], + all_validators[8], + all_validators[9], + ], + target_nodes: vec![ + all_validators[0], + all_validators[1], + all_validators[8], + all_validators[9], + ], + delay_latency_ms: 50, + delay_jitter_ms: 5, + delay_correlation_percentage: 50, + loss_percentage: 1, + loss_correlation_percentage: 50 + }) + } +} diff --git a/testsuite/testcases/src/multi_region_simulation_test.rs b/testsuite/testcases/src/multi_region_simulation_test.rs deleted file mode 100644 index 12dcc2726e3b61..00000000000000 --- a/testsuite/testcases/src/multi_region_simulation_test.rs +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright © Aptos Foundation -// SPDX-License-Identifier: Apache-2.0 - -use crate::{LoadDestination, NetworkLoadTest}; -use aptos_forge::{ - GroupNetworkBandwidth, GroupNetworkDelay, NetworkContext, NetworkTest, Swarm, SwarmChaos, - SwarmNetworkBandwidth, SwarmNetworkDelay, Test, -}; -use aptos_logger::info; -use aptos_types::PeerId; -use csv::Reader; -use itertools::{self, Itertools}; -use std::collections::BTreeMap; - -/// The link stats are obtained from https://github.com/doitintl/intercloud-throughput/blob/master/results_202202/results.csv -/// The four regions were hand-picked from the dataset to simulate a multi-region setup -/// with high latencies and low bandwidth. -macro_rules! FOUR_REGION_LINK_STATS_CSV { - () => { - "data/four_region_link_stats.csv" - }; -} - -/// A test to simulate network between multiple regions in different clouds. -/// It currently supports only 4 regions, due to ChaosMesh limitations. -pub struct MultiRegionMultiCloudSimulationTest {} - -impl Test for MultiRegionMultiCloudSimulationTest { - fn name(&self) -> &'static str { - "network::multi-region-multi-cloud-simulation" - } -} - -fn get_link_stats_table() -> BTreeMap> { - let mut stats_table = BTreeMap::new(); - - let mut rdr = Reader::from_reader(include_bytes!(FOUR_REGION_LINK_STATS_CSV!()).as_slice()); - rdr.deserialize() - .for_each(|result: Result<(String, String, u64, f64), _>| { - if let Ok((from, to, bitrate, latency)) = result { - stats_table - .entry(from) - .or_insert_with(BTreeMap::new) - .insert(to, (bitrate, latency)); - } - }); - stats_table -} - -/// Creates a SwarmNetworkDelay -fn create_multi_region_swarm_network_chaos( - all_validators: Vec, -) -> (SwarmNetworkDelay, SwarmNetworkBandwidth) { - let link_stats_table = get_link_stats_table(); - - assert!(all_validators.len() >= link_stats_table.len()); - - let number_of_regions = link_stats_table.len(); - let approx_validators_per_region = all_validators.len() / number_of_regions; - - let validator_chunks = all_validators.chunks_exact(approx_validators_per_region); - - let (mut group_network_delays, group_network_bandwidths): ( - Vec, - Vec, - ) = validator_chunks - .clone() - .zip(link_stats_table.iter().clone()) - .combinations(2) - .map(|comb| { - let (from_chunk, (from_region, stats)) = &comb[0]; - let (to_chunk, (to_region, _)) = &comb[1]; - - let (bandwidth, latency) = stats.get(*to_region).unwrap(); - let delay = GroupNetworkDelay { - name: format!("{}-to-{}-delay", from_region, to_region), - source_nodes: from_chunk.to_vec(), - target_nodes: to_chunk.to_vec(), - latency_ms: *latency as u64, - jitter_ms: 5, - correlation_percentage: 50, - }; - info!("delay {:?}", delay); - - let bandwidth = GroupNetworkBandwidth { - name: format!("{}-to-{}-bandwidth", from_region, to_region), - // source_nodes: from_chunk.to_vec(), - // target_nodes: to_chunk.to_vec(), - rate: bandwidth / 8, - limit: 20971520, - buffer: 10000, - }; - info!("bandwidth {:?}", bandwidth); - - (delay, bandwidth) - }) - .unzip(); - - let remainder = validator_chunks.remainder(); - let remaining_validators: Vec = validator_chunks - .skip(number_of_regions) - .flatten() - .chain(remainder.iter()) - .cloned() - .collect(); - info!("remaining: {:?}", remaining_validators); - if !remaining_validators.is_empty() { - group_network_delays[0] - .source_nodes - .append(remaining_validators.to_vec().as_mut()); - } - - ( - SwarmNetworkDelay { - group_network_delays, - }, - SwarmNetworkBandwidth { - group_network_bandwidths, - }, - ) -} - -impl NetworkLoadTest for MultiRegionMultiCloudSimulationTest { - fn setup(&self, ctx: &mut NetworkContext) -> anyhow::Result { - let all_validators = ctx - .swarm() - .validators() - .map(|v| v.peer_id()) - .collect::>(); - - let (delay, bandwidth) = create_multi_region_swarm_network_chaos(all_validators); - - // inject bandwidth limit - let chaos = SwarmChaos::Bandwidth(bandwidth); - ctx.swarm().inject_chaos(chaos)?; - - // inject network delay - let chaos = SwarmChaos::Delay(delay); - ctx.swarm().inject_chaos(chaos)?; - - Ok(LoadDestination::FullnodesOtherwiseValidators) - } - - fn finish(&self, swarm: &mut dyn Swarm) -> anyhow::Result<()> { - swarm.remove_all_chaos() - } -} - -impl NetworkTest for MultiRegionMultiCloudSimulationTest { - fn run<'t>(&self, ctx: &mut NetworkContext<'t>) -> anyhow::Result<()> { - ::run(self, ctx) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_create_multi_region_swarm_network_chaos() { - aptos_logger::Logger::new().init(); - - let all_validators = (0..8).map(|_| PeerId::random()).collect(); - let (delay, bandwidth) = create_multi_region_swarm_network_chaos(all_validators); - - assert_eq!(delay.group_network_delays.len(), 6); - assert_eq!(bandwidth.group_network_bandwidths.len(), 6); - - let all_validators: Vec = (0..10).map(|_| PeerId::random()).collect(); - let (delay, bandwidth) = create_multi_region_swarm_network_chaos(all_validators); - - assert_eq!(delay.group_network_delays.len(), 6); - assert_eq!(bandwidth.group_network_bandwidths.len(), 6); - assert_eq!(delay.group_network_delays[0].source_nodes.len(), 4); - assert_eq!(delay.group_network_delays[0].target_nodes.len(), 2); - assert_eq!( - bandwidth.group_network_bandwidths[0], - GroupNetworkBandwidth { - name: "aws--ap-northeast-1-to-aws--eu-west-1-bandwidth".to_owned(), - rate: 5160960, - limit: 20971520, - buffer: 10000, - } - ) - } -}