From 5b61b95bec370c6f7c94de6a852080b833b2d769 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Thu, 25 Aug 2022 13:34:52 -0500 Subject: [PATCH 1/2] Disconnect if not seen a block from connection for 2x heart-beat time. Revert reset to lib of sync_next_expected_num in start_sync since it is called outside exclusively starting a new sync. When starting a new sync start from lib. --- plugins/net_plugin/net_plugin.cpp | 28 +++++++++++++++------------- tests/nodeos_startup_catchup.py | 7 +++++++ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index d69135d74e..9974793654 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -1161,11 +1161,12 @@ namespace eosio { close(false); } return; - } else { - const tstamp timeout = std::max(hb_timeout/2, 2*std::chrono::milliseconds(config::block_interval_ms).count()); + } else if( latest_blk_time > 0 ) { + const tstamp timeout = std::max(hb_timeout*2, 2*std::chrono::milliseconds(config::block_interval_ms).count()); if ( current_time > latest_blk_time + timeout ) { - send_handshake(); - return; + no_retry = benign_other; + peer_wlog(this, "block timeout"); + close(); } } } @@ -1586,9 +1587,10 @@ namespace eosio { // if closing the connection we are currently syncing from, then reset our last requested and next expected. if( c == sync_source ) { reset_last_requested_num(g); - uint32_t head_blk_num = 0; - std::tie( std::ignore, head_blk_num, std::ignore, std::ignore, std::ignore, std::ignore ) = my_impl->get_chain_info(); - sync_next_expected_num = head_blk_num + 1; + // if starting to sync need to always start from lib as we might be on our own fork + uint32_t lib_num = 0; + std::tie( lib_num, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore ) = my_impl->get_chain_info(); + sync_next_expected_num = lib_num + 1; request_next_chunk( std::move(g) ); } } @@ -1605,8 +1607,9 @@ namespace eosio { ("r", sync_last_requested_num)("e", sync_next_expected_num)("k", sync_known_lib_num)("s", sync_req_span) ); if( fork_head_block_num < sync_last_requested_num && sync_source && sync_source->current() ) { - fc_ilog( logger, "ignoring request, head is ${h} last req = ${r}, source connection ${c}", - ("h", fork_head_block_num)("r", sync_last_requested_num)("c", sync_source->connection_id) ); + fc_ilog( logger, "ignoring request, head is ${h} last req = ${r}, sync_next_expected_num: ${e}, sync_known_lib_num: ${k}, sync_req_span: ${s}, source connection ${c}", + ("h", fork_head_block_num)("r", sync_last_requested_num)("e", sync_next_expected_num) + ("k", sync_known_lib_num)("s", sync_req_span)("c", sync_source->connection_id) ); return; } @@ -1740,12 +1743,11 @@ namespace eosio { if( sync_state == in_sync ) { set_state( lib_catchup ); } - // if starting to sync need to always start from lib as we might be on our own fork - sync_next_expected_num = lib_num + 1; + sync_next_expected_num = std::max( lib_num + 1, sync_next_expected_num ); // p2p_high_latency_test.py test depends on this exact log statement. - peer_ilog( c, "Catching up with chain, our last req is ${cc}, theirs is ${t}", - ("cc", sync_last_requested_num)("t", target) ); + peer_ilog( c, "Catching up with chain, our last req is ${cc}, theirs is ${t}, next expected ${n}", + ("cc", sync_last_requested_num)("t", target)("n", sync_next_expected_num) ); request_next_chunk( std::move( g_sync ), c ); } diff --git a/tests/nodeos_startup_catchup.py b/tests/nodeos_startup_catchup.py index 08d1b47a92..186e8e3015 100755 --- a/tests/nodeos_startup_catchup.py +++ b/tests/nodeos_startup_catchup.py @@ -165,9 +165,16 @@ def waitForNodeStarted(node): numBlocksToCatchup=(lastLibNum-lastCatchupLibNum-1)+twoRounds waitForBlock(catchupNode, lastLibNum, timeout=twoRoundsTimeout, blockType=BlockType.lib) + catchupHead=head(catchupNode) Print("Shutdown catchup node and validate exit code") catchupNode.interruptAndVerifyExitStatus(60) + # every other catchup make a lib catchup + if catchup_num % 2 == 0: + Print(f"Wait for producer to advance lib past head of catchup {catchupHead}") + # catchupHead+5 to allow for advancement of head during shutdown of catchupNode + waitForBlock(node0, catchupHead+5, timeout=twoRoundsTimeout*2, blockType=BlockType.lib) + Print("Restart catchup node") catchupNode.relaunch(cachePopen=True) waitForNodeStarted(catchupNode) From 3091b8d9f0a384e7c9afc5131bd8109755c3e232 Mon Sep 17 00:00:00 2001 From: Kevin Heifner Date: Thu, 25 Aug 2022 13:59:25 -0500 Subject: [PATCH 2/2] GH-49 Revert close if not seen block. You might not see a block if you sent them the block or if you are syncing. --- plugins/net_plugin/net_plugin.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/plugins/net_plugin/net_plugin.cpp b/plugins/net_plugin/net_plugin.cpp index 9974793654..03f9fb416c 100644 --- a/plugins/net_plugin/net_plugin.cpp +++ b/plugins/net_plugin/net_plugin.cpp @@ -1162,11 +1162,10 @@ namespace eosio { } return; } else if( latest_blk_time > 0 ) { - const tstamp timeout = std::max(hb_timeout*2, 2*std::chrono::milliseconds(config::block_interval_ms).count()); + const tstamp timeout = std::max(hb_timeout/2, 2*std::chrono::milliseconds(config::block_interval_ms).count()); if ( current_time > latest_blk_time + timeout ) { - no_retry = benign_other; - peer_wlog(this, "block timeout"); - close(); + send_handshake(); + return; } } }