Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Save genesis bootstrap congestion info into DB and reuse it at node restarts #11724

Merged
merged 34 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
958b474
Add nayduck test for switching between memtries and disktries.
tayfunelmas Jun 27, 2024
ec8b4b6
Increase timeout
tayfunelmas Jun 27, 2024
7b49e03
Revert some log changes and reduce timeout
tayfunelmas Jun 27, 2024
b493313
Make contract deployment non-waiting
tayfunelmas Jun 28, 2024
ce27a87
Add comments.
tayfunelmas Jun 28, 2024
80176d4
Replace validator accounts with created accounts
tayfunelmas Jun 28, 2024
58af30e
Revert utils change
tayfunelmas Jun 28, 2024
1f011b4
Use nonces from access keys of the accounts
tayfunelmas Jun 28, 2024
d2ee23d
Merge branch 'master' into nayduck-memtries
tayfunelmas Jul 1, 2024
c4ef9e9
New version with nonblocking tx sent.
tayfunelmas Jul 2, 2024
6979359
Use dumper node as the RPC node
tayfunelmas Jul 2, 2024
653e412
Add comments. and comment-out last step.
tayfunelmas Jul 2, 2024
f011461
Merge branch 'master' into nayduck-memtries
tayfunelmas Jul 2, 2024
262bb9a
Address comment
tayfunelmas Jul 2, 2024
bd6d467
Merge branch 'nayduck-memtries' of https://github.com/tayfunelmas/nea…
tayfunelmas Jul 2, 2024
7b2b0a6
Test for MissingTrieValue
tayfunelmas Jul 2, 2024
330154a
Revert "Test for MissingTrieValue"
tayfunelmas Jul 3, 2024
389711f
Merge branch 'master' into nayduck-memtries
tayfunelmas Jul 3, 2024
6792941
Add test for congestion control bootstrap after N epochs
tayfunelmas Jul 3, 2024
eb36c1d
Fix typo
tayfunelmas Jul 3, 2024
e8a1359
Minor typo
tayfunelmas Jul 3, 2024
4864f85
Merge branch 'master' into nayduck-memtries
tayfunelmas Jul 3, 2024
aeac783
Merge branch 'master' into congestion-control-bootstrap
tayfunelmas Jul 3, 2024
b97df30
Merge branch 'master' into congestion-control-bootstrap
tayfunelmas Jul 4, 2024
2b049a5
Save congestion info in DB.
tayfunelmas Jul 4, 2024
7ff36d0
Format.
tayfunelmas Jul 4, 2024
de865e2
Delete already renamed python test.
tayfunelmas Jul 4, 2024
4f9da25
Revert init_test_logger change.
tayfunelmas Jul 4, 2024
650f1be
Merge branch 'master' into congestion-control-bootstrap
tayfunelmas Jul 4, 2024
e4a055c
Address comments.
tayfunelmas Jul 4, 2024
a0ca76f
Merge branch 'congestion-control-bootstrap' of https://github.com/tay…
tayfunelmas Jul 4, 2024
6b5744d
Merge branch 'master' into congestion-control-bootstrap
tayfunelmas Jul 4, 2024
93c16b8
Merge branch 'master' into congestion-control-bootstrap
tayfunelmas Jul 5, 2024
fff0f32
Remove unused argument.
tayfunelmas Jul 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 40 additions & 27 deletions chain/chain/src/chain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3943,54 +3943,67 @@ fn get_genesis_congestion_infos_impl(
runtime: &dyn RuntimeAdapter,
state_roots: &Vec<CryptoHash>,
) -> Result<Vec<Option<CongestionInfo>>, Error> {
let prev_hash = CryptoHash::default();
let epoch_id = epoch_manager.get_epoch_id_from_prev_block(&prev_hash)?;
let protocol_version = epoch_manager.get_epoch_protocol_version(&epoch_id)?;
let genesis_prev_hash = CryptoHash::default();
let genesis_epoch_id = epoch_manager.get_epoch_id_from_prev_block(&genesis_prev_hash)?;
let genesis_protocol_version = epoch_manager.get_epoch_protocol_version(&genesis_epoch_id)?;
// If congestion control is not enabled at the genesis block, we return None (congestion info) for each shard.
if !ProtocolFeature::CongestionControl.enabled(genesis_protocol_version) {
return Ok(std::iter::repeat(None).take(state_roots.len()).collect());
}

// Since the congestion info is already bootstrapped in statelessnet, skip another bootstrap.
// TODO: This is temporary mitigation for the failing genesis congestion info due to garbage
// collected genesis state roots. It can be removed after the statelessnet network is turned down.
if let Ok(protocol_config) = runtime.get_protocol_config(&genesis_epoch_id) {
if protocol_config.genesis_config.chain_id == near_primitives::chains::STATELESSNET {
return Ok(std::iter::repeat(None).take(state_roots.len()).collect());
}
}

// Check whether we have already computed the congestion infos from the genesis state roots.
if let Some(saved_infos) = near_store::get_genesis_congestion_infos(runtime.store())? {
tracing::debug!(target: "chain", "Reading genesis congestion infos from database.");
return Ok(saved_infos.into_iter().map(Option::Some).collect());
}

let mut result = vec![];
let mut new_infos = vec![];
for (shard_id, &state_root) in state_roots.iter().enumerate() {
let shard_id = shard_id as ShardId;
let congestion_info = get_congestion_info(
let congestion_info = get_genesis_congestion_info(
runtime,
protocol_version,
&prev_hash,
genesis_protocol_version,
&genesis_prev_hash,
shard_id,
state_root,
&epoch_id,
)?;
result.push(congestion_info);
new_infos.push(congestion_info);
}
Ok(result)

// Store it in DB so that we can read it later, instead of recomputing from genesis state roots.
// Note that this is necessary because genesis state roots will be garbage-collected and will not
// be available, for example, when the node restarts later.
tracing::debug!(target: "chain", "Saving genesis congestion infos to database.");
let mut store_update = runtime.store().store_update();
near_store::set_genesis_congestion_infos(&mut store_update, &new_infos);
store_update.commit()?;

Ok(new_infos.into_iter().map(Option::Some).collect())
}

fn get_congestion_info(
fn get_genesis_congestion_info(
runtime: &dyn RuntimeAdapter,
protocol_version: ProtocolVersion,
prev_hash: &CryptoHash,
shard_id: ShardId,
state_root: StateRoot,
epoch_id: &EpochId,
) -> Result<Option<CongestionInfo>, Error> {
if !ProtocolFeature::CongestionControl.enabled(protocol_version) {
return Ok(None);
}

// Since the congestion info is already bootstrapped in statelessnet, skip another bootstrap.
// TODO: This is temporary mitigation for the failing genesis congestion info due to garbage
// collected genesis state roots. It can be removed after the statelessnet network is turned down.
if let Ok(protocol_config) = runtime.get_protocol_config(&epoch_id) {
if protocol_config.genesis_config.chain_id == near_primitives::chains::STATELESSNET {
return Ok(None);
}
}

) -> Result<CongestionInfo, Error> {
// Get the view trie because it's possible that the chain is ahead of
// genesis and doesn't have this block in flat state and memtrie.
let trie = runtime.get_view_trie_for_shard(shard_id, prev_hash, state_root)?;
let runtime_config = runtime.get_runtime_config(protocol_version)?;
let congestion_info = bootstrap_congestion_info(&trie, &runtime_config, shard_id)?;
tracing::debug!(target: "chain", ?shard_id, ?state_root, ?congestion_info, "Computed genesis congestion info.");
Ok(Some(congestion_info))
Ok(congestion_info)
}

fn shard_id_out_of_bounds(shard_id: ShardId) -> Error {
Expand Down
1 change: 1 addition & 0 deletions core/store/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub const LATEST_KNOWN_KEY: &[u8; 12] = b"LATEST_KNOWN";
pub const LARGEST_TARGET_HEIGHT_KEY: &[u8; 21] = b"LARGEST_TARGET_HEIGHT";
pub const GENESIS_JSON_HASH_KEY: &[u8; 17] = b"GENESIS_JSON_HASH";
pub const GENESIS_STATE_ROOTS_KEY: &[u8; 19] = b"GENESIS_STATE_ROOTS";
/// Key under which the genesis congestion infos (a `Vec<CongestionInfo>`) are
/// stored in `DBCol::BlockMisc`.
pub const GENESIS_CONGESTION_INFO_KEY: &[u8] = b"GENESIS_CONGESTION_INFO_KEY";
pub const COLD_HEAD_KEY: &[u8; 9] = b"COLD_HEAD";
pub const STATE_SYNC_DUMP_KEY: &[u8; 15] = b"STATE_SYNC_DUMP";
pub const STATE_SNAPSHOT_KEY: &[u8; 18] = b"STATE_SNAPSHOT_KEY";
Expand Down
15 changes: 15 additions & 0 deletions core/store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pub use crate::trie::{
};
use borsh::{BorshDeserialize, BorshSerialize};
pub use columns::DBCol;
use db::GENESIS_CONGESTION_INFO_KEY;
pub use db::{
CHUNK_TAIL_KEY, COLD_HEAD_KEY, FINAL_HEAD_KEY, FORK_TAIL_KEY, GENESIS_JSON_HASH_KEY,
GENESIS_STATE_ROOTS_KEY, HEADER_HEAD_KEY, HEAD_KEY, LARGEST_TARGET_HEIGHT_KEY,
Expand All @@ -21,6 +22,7 @@ use metadata::{DbKind, DbVersion, KIND_KEY, VERSION_KEY};
use near_crypto::PublicKey;
use near_fmt::{AbbrBytes, StorageKey};
use near_primitives::account::{AccessKey, Account};
use near_primitives::congestion_info::CongestionInfo;
pub use near_primitives::errors::{MissingTrieValueContext, StorageError};
use near_primitives::hash::CryptoHash;
use near_primitives::receipt::{
Expand Down Expand Up @@ -1011,6 +1013,10 @@ pub fn get_genesis_state_roots(store: &Store) -> io::Result<Option<Vec<StateRoot
store.get_ser::<Vec<StateRoot>>(DBCol::BlockMisc, GENESIS_STATE_ROOTS_KEY)
}

/// Loads the genesis congestion infos stored in `DBCol::BlockMisc` under
/// `GENESIS_CONGESTION_INFO_KEY`.
///
/// The result is exactly what `Store::get_ser` yields for that key: an
/// optional, Borsh-deserialized `Vec<CongestionInfo>`, or the I/O error it reports.
pub fn get_genesis_congestion_infos(store: &Store) -> io::Result<Option<Vec<CongestionInfo>>> {
    // The annotated binding drives type inference instead of a turbofish.
    let saved_infos: Option<Vec<CongestionInfo>> =
        store.get_ser(DBCol::BlockMisc, GENESIS_CONGESTION_INFO_KEY)?;
    Ok(saved_infos)
}

pub fn get_genesis_hash(store: &Store) -> io::Result<Option<CryptoHash>> {
store.get_ser::<CryptoHash>(DBCol::BlockMisc, GENESIS_JSON_HASH_KEY)
}
Expand All @@ -1027,6 +1033,15 @@ pub fn set_genesis_state_roots(store_update: &mut StoreUpdate, genesis_roots: &[
.expect("Borsh cannot fail");
}

/// Adds a write of `congestion_infos` to `store_update`, keyed by
/// `GENESIS_CONGESTION_INFO_KEY` in `DBCol::BlockMisc`.
///
/// This function does not commit `store_update`; that is left to the caller.
///
/// # Panics
///
/// Panics if serializing the value fails, which is not expected for Borsh.
pub fn set_genesis_congestion_infos(
    store_update: &mut StoreUpdate,
    congestion_infos: &[CongestionInfo],
) {
    let outcome =
        store_update.set_ser(DBCol::BlockMisc, GENESIS_CONGESTION_INFO_KEY, &congestion_infos);
    outcome.expect("Borsh cannot fail");
}

fn option_to_not_found<T, F>(res: io::Result<Option<T>>, field_name: F) -> io::Result<T>
where
F: std::string::ToString,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
use near_async::time::Duration;
use near_chain::ChainStoreAccess;
use near_chain_configs::test_genesis::TestGenesisBuilder;
use near_client::Client;
use near_o11y::testonly::init_test_logger;
use near_primitives::types::AccountId;
use near_primitives::version::{ProtocolFeature, PROTOCOL_VERSION};

use crate::test_loop::builder::TestLoopBuilder;
use crate::test_loop::env::TestLoopEnv;
use crate::test_loop::utils::ONE_NEAR;

const NUM_SHARDS: usize = 4;

/// This test checks that the genesis congestion control info is saved into DB and not cleaned during GC,
/// so that the client can use it to bootstrap the genesis congestion control info after restarting.
/// Restarting the node is not checked here but in the python/nayduck tests.
#[test]
fn test_congestion_control_genesis_bootstrap() {
    // Nothing to verify when congestion control is not enabled for this build.
    if !ProtocolFeature::CongestionControl.enabled(PROTOCOL_VERSION) {
        return;
    }

    init_test_logger();

    let builder = TestLoopBuilder::new();

    let initial_balance = 10000 * ONE_NEAR;
    let accounts = ["test0", "test1"];
    let clients: Vec<AccountId> = accounts.iter().map(|account| account.parse().unwrap()).collect();

    // Three boundary accounts; the checks below expect `NUM_SHARDS` congestion infos.
    let mut genesis_builder = TestGenesisBuilder::new();
    genesis_builder
        .genesis_time_from_clock(&builder.clock())
        .protocol_version_latest()
        .shard_layout_simple_v1(&["account3", "account5", "account7"])
        .validators_desired_roles(&accounts[0..1], &accounts[1..2])
        .minimum_validators_per_shard(1);

    for client in &clients {
        genesis_builder.add_user_account_simple(client.clone(), initial_balance);
    }

    let TestLoopEnv { mut test_loop, datas: node_datas, tempdir } =
        builder.genesis(genesis_builder.build()).clients(clients.clone()).build();

    test_loop.run_for(Duration::seconds(5));

    // Check every node's store for the genesis congestion infos.
    for node_data in &node_datas {
        let client_handle = node_data.client_sender.actor_handle();
        check_genesis_congestion_info_in_store(
            &mut test_loop.data.get_mut(&client_handle).client,
        );
    }

    TestLoopEnv { test_loop, datas: node_datas, tempdir }
        .shutdown_and_drain_remaining_events(Duration::seconds(20));
}

/// Runs data clearing (GC) on the client's chain and then asserts that the
/// genesis congestion infos can still be read from the store.
///
/// Expects exactly `NUM_SHARDS` entries, each with zero buffered-receipt gas,
/// zero delayed-receipt gas and zero receipt bytes.
///
/// # Panics
///
/// Panics if clearing data fails, if the store read fails or finds nothing,
/// or if any of the assertions above does not hold.
fn check_genesis_congestion_info_in_store(client: &mut Client) {
    // Clone the GC config first so `client.chain` can be borrowed on its own.
    let gc_config = client.config.gc.clone();
    client.chain.clear_data(&gc_config).unwrap();

    let infos = near_store::get_genesis_congestion_infos(client.chain.chain_store().store())
        .unwrap()
        .unwrap();
    assert_eq!(infos.len(), NUM_SHARDS);
    for info in &infos {
        assert_eq!(info.buffered_receipts_gas(), 0);
        assert_eq!(info.delayed_receipts_gas(), 0);
        assert_eq!(info.receipt_bytes(), 0);
    }
}
1 change: 1 addition & 0 deletions integration-tests/src/test_loop/tests/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod chunk_validator_kickout;
pub mod congestion_control;
pub mod congestion_control_genesis_bootstrap;
pub mod in_memory_tries;
pub mod multinode_stateless_validators;
pub mod multinode_test_loop_example;
Expand Down
5 changes: 2 additions & 3 deletions nightly/pytest-sanity.txt
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,8 @@ pytest sanity/slow_chunk.py --features nightly
# TODO(congestion_control) - enable pytest on stabilization
# pytest sanity/congestion_control.py
pytest sanity/congestion_control.py --features nightly
# TODO(#11702) Enable these after fixing the issue and stabilization.
# pytest sanity/congestion_control_genesis_bootstrap.py
# pytest sanity/congestion_control_genesis_bootstrap.py --features nightly
pytest sanity/congestion_control_genesis_bootstrap.py
pytest sanity/congestion_control_genesis_bootstrap.py --features nightly

# Tests the correct operation of the view client without using memtries (#11312).
pytest sanity/rpc_view_history.py
Expand Down
11 changes: 6 additions & 5 deletions pytest/tests/sanity/memtrie_disktrie_switch.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,12 @@ def test(self):
self.__restart_nodes(enable_memtries=False)
self.__random_workload_until(target_height)

# TODO(#11675): Fix MissingTrieValue error and re-enable this step of the test.
# target_height = self.__next_target_height(num_epochs=1)
# logger.info(f"Step 3: Restarting nodes with memtries enabled until height {target_height}")
# self.__restart_nodes(enable_memtries=True)
# self.__random_workload_until(target_height)
target_height = self.__next_target_height(num_epochs=1)
logger.info(
f"Step 3: Restarting nodes with memtries enabled until height {target_height}"
)
self.__restart_nodes(enable_memtries=True)
self.__random_workload_until(target_height)

self.__wait_for_txs(self.txs, assert_all_accepted=False)
logger.info("Test ended")
Expand Down
Loading