Skip to content

Commit

Permalink
Add realistic_env_load_sweep forge test, to check latency across TPS
Browse files Browse the repository at this point in the history
  • Loading branch information
igor-aptos committed Jun 8, 2023
1 parent 065c93f commit b2d104b
Show file tree
Hide file tree
Showing 9 changed files with 221 additions and 126 deletions.
15 changes: 14 additions & 1 deletion .github/workflows/forge-stable.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -293,11 +293,24 @@ jobs:
secrets: inherit
with:
IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
FORGE_NAMESPACE: forge-land-blocking-new-${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
FORGE_NAMESPACE: forge-realistic-env-max-throughput-${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
FORGE_RUNNER_DURATION_SECS: 600
FORGE_TEST_SUITE: realistic_env_max_throughput
POST_TO_SLACK: true

# Runs the realistic_env_load_sweep Forge suite via the shared workflow-run-forge workflow.
# Skipped for pull_request events.
run-forge-realistic-env-load-sweep:
if: ${{ github.event_name != 'pull_request' }}
needs: determine-test-metadata
uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
secrets: inherit
with:
IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
FORGE_NAMESPACE: forge-realistic-env-load-sweep-${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
# The sweep runs 5 tests of 300s each, so the runner duration is 5 * 300 = 1500s.
FORGE_RUNNER_DURATION_SECS: 1500
FORGE_TEST_SUITE: realistic_env_load_sweep
POST_TO_SLACK: true

run-forge-three-region-graceful-overload:
if: ${{ github.event_name != 'pull_request' }}
needs: determine-test-metadata
Expand Down
14 changes: 11 additions & 3 deletions crates/transaction-emitter-lib/src/emitter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ pub struct EmitModeParams {
pub worker_offset_mode: WorkerOffsetMode,
pub wait_millis: u64,
pub check_account_sequence_only_once_fraction: f32,
pub check_account_sequence_sleep_millis: u64,
pub check_account_sequence_sleep: Duration,
}

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -140,6 +140,8 @@ pub struct EmitJobRequest {
prompt_before_spending: bool,

coordination_delay_between_instances: Duration,

latency_polling_interval: Duration,
}

impl Default for EmitJobRequest {
Expand All @@ -163,6 +165,7 @@ impl Default for EmitJobRequest {
expected_gas_per_txn: aptos_global_constants::MAX_GAS_AMOUNT,
prompt_before_spending: false,
coordination_delay_between_instances: Duration::from_secs(0),
latency_polling_interval: Duration::from_millis(300),
}
}
}
Expand Down Expand Up @@ -257,6 +260,11 @@ impl EmitJobRequest {
self
}

pub fn latency_polling_interval(mut self, latency_polling_interval: Duration) -> Self {
self.latency_polling_interval = latency_polling_interval;
self
}

pub fn calculate_mode_params(&self) -> EmitModeParams {
let clients_count = self.rest_clients.len();

Expand Down Expand Up @@ -294,7 +302,7 @@ impl EmitJobRequest {
workers_per_endpoint: num_workers_per_endpoint,
endpoints: clients_count,
check_account_sequence_only_once_fraction: 0.0,
check_account_sequence_sleep_millis: 300,
check_account_sequence_sleep: self.latency_polling_interval,
}
},
EmitJobMode::ConstTps { tps }
Expand Down Expand Up @@ -382,7 +390,7 @@ impl EmitJobRequest {
workers_per_endpoint: num_workers_per_endpoint,
endpoints: clients_count,
check_account_sequence_only_once_fraction: 1.0 - sample_latency_fraction,
check_account_sequence_sleep_millis: 300,
check_account_sequence_sleep: self.latency_polling_interval,
}
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,8 @@ impl SubmissionWorker {
// generally, we should never need to recheck, as we wait enough time
// before calling here, but in case of shutdown/or client we are talking
// to being stale (having stale transaction_version), we might need to wait.
Duration::from_millis(
if self.skip_latency_stats { 10 } else { 1 }
* self.params.check_account_sequence_sleep_millis,
),
if self.skip_latency_stats { 10 } else { 1 }
* self.params.check_account_sequence_sleep,
loop_stats,
)
.await;
Expand Down
107 changes: 75 additions & 32 deletions testsuite/forge-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,15 @@ fn main() -> Result<()> {
logger.build();

let args = Args::from_args();
let duration = Duration::from_secs(args.duration_secs as u64);
let duration = Duration::from_secs(5 * 300 as u64); // args.duration_secs as u64);
let suite_name: &str = args.suite.as_ref();

let suite_name = if suite_name == "land_blocking" {
"realistic_env_load_sweep"
} else {
panic!()
};

let runtime = Runtime::new()?;
match args.cli_cmd {
// cmd input for test
Expand Down Expand Up @@ -263,7 +269,7 @@ fn main() -> Result<()> {
match test_cmd {
TestCommand::LocalSwarm(local_cfg) => {
// Loosen all criteria for local runs
test_suite.get_success_criteria_mut().avg_tps = 400;
test_suite.get_success_criteria_mut().min_avg_tps = 400;
let previous_emit_job = test_suite.get_emit_job().clone();
let test_suite =
test_suite.with_emit_job(previous_emit_job.mode(EmitJobMode::MaxLoad {
Expand Down Expand Up @@ -492,6 +498,7 @@ fn single_test_suite(test_name: &str, duration: Duration) -> Result<ForgeConfig>
"compat" => compat(),
"framework_upgrade" => upgrade(),
// Rest of the tests:
"realistic_env_load_sweep" => realistic_env_load_sweep_test(),
"epoch_changer_performance" => epoch_changer_performance(),
"state_sync_perf_fullnodes_apply_outputs" => state_sync_perf_fullnodes_apply_outputs(),
"state_sync_perf_fullnodes_execute_transactions" => {
Expand Down Expand Up @@ -591,8 +598,9 @@ fn run_consensus_only_perf_test() -> ForgeConfig {
config
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.add_network_test(LoadVsPerfBenchmark {
test: &PerformanceBenchmark,
test: Box::new(PerformanceBenchmark),
workloads: Workloads::TPS(&[30000]),
criteria: vec![],
})
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
// no epoch change.
Expand Down Expand Up @@ -757,15 +765,57 @@ fn consensus_stress_test() -> ForgeConfig {
})
}

/// Builds the `realistic_env_load_sweep` forge suite.
///
/// The suite runs `PerformanceBenchmark` at five constant-TPS levels (10, 100, 1000, 3000 and
/// 5000) on 20 validators and 10 fullnodes, wrapped in multi-region network emulation and CPU
/// chaos. Each TPS level is paired with its own success criteria: a minimum TPS plus P50 and
/// P99 latency thresholds.
fn realistic_env_load_sweep_test() -> ForgeConfig {
    // One entry per swept TPS level, in the same order as the workloads below:
    // (minimum TPS, max P50 latency, max P99 latency).
    let per_load_limits = [
        (9, 1.5, 3.0),
        (95, 1.5, 3.0),
        (950, 2.0, 3.0),
        (2900, 2.5, 4.0),
        (4900, 3.0, 5.0),
    ];
    let mut per_load_criteria = Vec::with_capacity(per_load_limits.len());
    for (min_tps, max_lat_p50, max_lat_p99) in per_load_limits {
        per_load_criteria.push(
            SuccessCriteria::new(min_tps)
                .add_latency_threshold(max_lat_p50, LatencyType::P50)
                .add_latency_threshold(max_lat_p99, LatencyType::P99),
        );
    }

    let load_sweep = LoadVsPerfBenchmark {
        test: Box::new(PerformanceBenchmark),
        workloads: Workloads::TPS(&[10, 100, 1000, 3000, 5000]),
        criteria: per_load_criteria,
    };
    let network_test = CompositeNetworkTest::new_with_two_wrappers(
        MultiRegionNetworkEmulationTest {
            override_config: None,
        },
        CpuChaosTest {
            override_config: None,
        },
        load_sweep,
    );

    // The benchmark inherits the suite-level EmitJobRequest, so the faster polling interval is
    // configured here to get more precise latency measurements.
    let emit_job = EmitJobRequest::default().latency_polling_interval(Duration::from_millis(100));

    // Suite-wide checks; the TPS and latency limits are applied per load level above.
    let suite_criteria = SuccessCriteria::new(0)
        .add_no_restarts()
        .add_wait_for_catchup_s(60)
        .add_chain_progress(StateProgressThreshold {
            max_no_progress_secs: 30.0,
            max_round_gap: 10,
        });

    ForgeConfig::default()
        .with_initial_validator_count(NonZeroUsize::new(20).unwrap())
        .with_initial_fullnode_count(10)
        .add_network_test(network_test)
        .with_emit_job(emit_job)
        .with_genesis_helm_config_fn(Arc::new(|helm_values| {
            // Set the epoch duration to 24 hours so no epoch change happens during the run.
            helm_values["chain"]["epoch_duration_secs"] = (24 * 3600).into();
        }))
        .with_success_criteria(suite_criteria)
}

fn load_vs_perf_benchmark() -> ForgeConfig {
ForgeConfig::default()
.with_initial_validator_count(NonZeroUsize::new(20).unwrap())
.with_initial_fullnode_count(10)
.add_network_test(LoadVsPerfBenchmark {
test: &PerformanceBenchmark,
test: Box::new(PerformanceBenchmark),
workloads: Workloads::TPS(&[
200, 1000, 3000, 5000, 7000, 7500, 8000, 9000, 10000, 12000, 15000,
]),
criteria: Vec::new(),
})
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
// no epoch change.
Expand Down Expand Up @@ -794,7 +844,7 @@ fn workload_vs_perf_benchmark() -> ForgeConfig {
// mempool_backlog: 10000,
// }))
.add_network_test(LoadVsPerfBenchmark {
test: &PerformanceBenchmark,
test: Box::new(PerformanceBenchmark),
workloads: Workloads::TRANSACTIONS(&[
TransactionWorkload {
transaction_type: TransactionTypeArg::NoOp,
Expand Down Expand Up @@ -837,6 +887,7 @@ fn workload_vs_perf_benchmark() -> ForgeConfig {
unique_senders: true,
},
]),
criteria: Vec::new(),
})
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
// no epoch change.
Expand All @@ -863,15 +914,14 @@ fn graceful_overload() -> ForgeConfig {
// So having VFNs for all validators
.with_initial_fullnode_count(10)
.add_network_test(TwoTrafficsTest {
inner_mode: EmitJobMode::ConstTps { tps: 15000 },
inner_gas_price: aptos_global_constants::GAS_UNIT_PRICE,
inner_init_gas_price_multiplier: 20,
inner_transaction_type: TransactionTypeArg::CoinTransfer.materialize_default(),
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::ConstTps { tps: 15000 })
.init_gas_price_multiplier(20),

// Additionally - we are not really gracefully handling overloads,
// setting limits based on current reality, to make sure they
// don't regress, but something to investigate
avg_tps: 3400,
latency_thresholds: &[],
inner_success_criteria: SuccessCriteria::new(3400),
})
// First start higher gas-fee traffic, to not cause issues with TxnEmitter setup - account creation
.with_emit_job(
Expand Down Expand Up @@ -913,19 +963,13 @@ fn three_region_sim_graceful_overload() -> ForgeConfig {
.add_network_test(CompositeNetworkTest::new(
ThreeRegionSameCloudSimulationTest,
TwoTrafficsTest {
inner_mode: EmitJobMode::ConstTps { tps: 15000 },
inner_gas_price: aptos_global_constants::GAS_UNIT_PRICE,
inner_init_gas_price_multiplier: 20,
// Cannot use TransactionTypeArg::materialize, as this needs to be static
inner_transaction_type: TransactionType::CoinTransfer {
invalid_transaction_ratio: 0,
sender_use_account_pool: false,
},
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::ConstTps { tps: 15000 })
.init_gas_price_multiplier(20),
// Additionally - we are not really gracefully handling overloads,
// setting limits based on current reality, to make sure they
// don't regress, but something to investigate
avg_tps: 1200,
latency_thresholds: &[],
inner_success_criteria: SuccessCriteria::new(3400),
},
))
// First start higher gas-fee traffic, to not cause issues with TxnEmitter setup - account creation
Expand Down Expand Up @@ -1333,14 +1377,12 @@ fn realistic_env_max_throughput_test_suite(duration: Duration) -> ForgeConfig {
override_config: None,
},
TwoTrafficsTest {
inner_mode: EmitJobMode::MaxLoad {
mempool_backlog: 40000,
},
inner_gas_price: aptos_global_constants::GAS_UNIT_PRICE,
inner_init_gas_price_multiplier: 20,
inner_transaction_type: TransactionTypeArg::CoinTransfer.materialize_default(),
avg_tps: 5000,
latency_thresholds: &[],
inner_traffic: EmitJobRequest::default()
.mode(EmitJobMode::MaxLoad {
mempool_backlog: 40000,
})
.init_gas_price_multiplier(20),
inner_success_criteria: SuccessCriteria::new(5000),
},
))
.with_genesis_helm_config_fn(Arc::new(|helm_values| {
Expand All @@ -1351,7 +1393,8 @@ fn realistic_env_max_throughput_test_suite(duration: Duration) -> ForgeConfig {
.with_emit_job(
EmitJobRequest::default()
.mode(EmitJobMode::ConstTps { tps: 100 })
.gas_price(5 * aptos_global_constants::GAS_UNIT_PRICE),
.gas_price(5 * aptos_global_constants::GAS_UNIT_PRICE)
.latency_polling_interval(Duration::from_millis(100)),
)
.with_success_criteria(
SuccessCriteria::new(95)
Expand All @@ -1366,8 +1409,8 @@ fn realistic_env_max_throughput_test_suite(duration: Duration) -> ForgeConfig {
// Check that we don't use more than 10 GB of memory for 30% of the time.
MetricsThreshold::new(10 * 1024 * 1024 * 1024, 30),
))
.add_latency_threshold(4.0, LatencyType::P50)
.add_latency_threshold(8.0, LatencyType::P90)
.add_latency_threshold(3.0, LatencyType::P50)
.add_latency_threshold(5.0, LatencyType::P90)
.add_chain_progress(StateProgressThreshold {
max_no_progress_secs: 10.0,
max_round_gap: 4,
Expand Down
2 changes: 2 additions & 0 deletions testsuite/forge/src/report.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use aptos_transaction_emitter_lib::emitter::stats::TxnStats;
use serde::Serialize;
use std::fmt;
use aptos_logger::info;

#[derive(Default, Debug, Serialize)]
pub struct TestReport {
Expand Down Expand Up @@ -37,6 +38,7 @@ impl TestReport {
self.text.push('\n');
}
self.text.push_str(&text);
info!("{}", text);
}

pub fn report_txn_stats(&mut self, test_name: String, stats: &TxnStats) {
Expand Down
Loading

0 comments on commit b2d104b

Please sign in to comment.