From a36a9d8a9123b13db61703a38eaa2cecac5478c3 Mon Sep 17 00:00:00 2001 From: Nikolay Kurtov Date: Tue, 8 Aug 2023 16:31:49 +0200 Subject: [PATCH 1/6] fix(state-sync): Enable State Sync from GCS by default --- CHANGELOG.md | 1 + chain/client/src/client_actor.rs | 6 +----- chain/client/src/info.rs | 26 +++++-------------------- core/chain-configs/src/client_config.rs | 16 ++++++--------- nearcore/src/config.rs | 12 ++++++++++-- 5 files changed, 23 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0e92cd55b4..0868e727b7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ * Number of transactions included in a chunk will be lowered if there is a congestion of more than 20000 delayed receipts in a shard. [#9222](https://github.com/near/nearcore/pull/9222) * Our more efficient and scalable V2 routing protocol is implemented. It shadows the V1 protocol for now while we verify its performance. [#9187](https://github.com/near/nearcore/pull/9187) * The default config now enables TIER1 outbound connections by default. [#9349](https://github.com/near/nearcore/pull/9349) +* Enable State Sync from GCS by default. ## 1.35.0 diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 5f3ebc2032e..87a2436ec23 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -710,11 +710,7 @@ impl Handler> for ClientActor { sync_status: format!( "{} ({})", self.client.sync_status.as_variant_name().to_string(), - display_sync_status( - &self.client.sync_status, - &self.client.chain.head()?, - &self.client.config.state_sync.sync, - ), + display_sync_status(&self.client.sync_status, &self.client.chain.head()?,), ), catchup_status: self.client.get_catchup_status()?, current_head_status: head.clone().into(), diff --git a/chain/client/src/info.rs b/chain/client/src/info.rs index bc67c19694b..0282a7ee725 100644 --- a/chain/client/src/info.rs +++ b/chain/client/src/info.rs @@ -2,7 +2,7 @@ use crate::config_updater::ConfigUpdater; use crate::{metrics, SyncStatus}; use actix::Addr; use itertools::Itertools; -use near_chain_configs::{ClientConfig, LogSummaryStyle, SyncConfig}; +use near_chain_configs::{ClientConfig, LogSummaryStyle}; use near_client_primitives::types::StateSyncStatus; use near_network::types::NetworkInfo; use near_primitives::block::Tip; @@ -372,8 +372,7 @@ impl InfoHelper { let s = |num| if num == 1 { "" } else { "s" }; - let sync_status_log = - Some(display_sync_status(sync_status, head, &client_config.state_sync.sync)); + let sync_status_log = Some(display_sync_status(sync_status, head)); let catchup_status_log = display_catchup_status(catchup_status); let validator_info_log = validator_info.as_ref().map(|info| { format!( @@ -592,11 +591,7 @@ pub fn display_catchup_status(catchup_status: Vec) -> String .join("\n") } -pub fn display_sync_status( - sync_status: &SyncStatus, - head: &Tip, - state_sync_config: &SyncConfig, -) -> String { +pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String { metrics::SYNC_STATUS.set(sync_status.repr() as i64); match sync_status { SyncStatus::AwaitingPeers => format!("#{:>8} Waiting for peers", head.height), @@ -635,22 +630,11 @@ pub fn display_sync_status( ) } SyncStatus::StateSync(StateSyncStatus { sync_hash, sync_status: shard_statuses }) => { - let mut res = format!("State {:?}", sync_hash); + let mut res = format!("State {sync_hash:?}"); let mut shard_statuses: Vec<_> = shard_statuses.iter().collect(); shard_statuses.sort_by_key(|(shard_id, _)| *shard_id); for (shard_id, shard_status) in shard_statuses { - write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap(); - } - if matches!(state_sync_config, SyncConfig::Peers) { - // TODO #8719 - tracing::warn!( - target: "stats", - "The node is syncing its State. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.\n\ - Suggestions:\n\ - * Download a recent data snapshot and restart the node.\n\ - * Disable state sync in the config. Add `\"state_sync_enabled\": false` to `config.json`.\n\ - \n\ - A better implementation of State Sync is work in progress."); + write!(res, "[{shard_id}: {}]", shard_status.status.to_string()).unwrap(); } res } diff --git a/core/chain-configs/src/client_config.rs b/core/chain-configs/src/client_config.rs index 717fe6c3b06..39f5c5763b9 100644 --- a/core/chain-configs/src/client_config.rs +++ b/core/chain-configs/src/client_config.rs @@ -138,7 +138,11 @@ pub enum SyncConfig { impl Default for SyncConfig { fn default() -> Self { - Self::Peers + Self::ExternalStorage(ExternalStorageConfig { + location: ExternalStorageLocation::GCS { bucket: "state-parts".to_string() }, + num_concurrent_requests: 4, + num_concurrent_requests_during_catchup: 4, + }) } } @@ -148,17 +152,9 @@ pub struct StateSyncConfig { #[serde(skip_serializing_if = "Option::is_none")] /// `none` value disables state dump to external storage. pub dump: Option, - #[serde(skip_serializing_if = "SyncConfig::is_default", default = "SyncConfig::default")] pub sync: SyncConfig, } -impl SyncConfig { - /// Checks whether the object equals its default value. - fn is_default(&self) -> bool { - matches!(self, Self::Peers) - } -} - /// ClientConfig where some fields can be updated at runtime. #[derive(Clone, serde::Serialize)] pub struct ClientConfig { @@ -340,7 +336,7 @@ impl ClientConfig { flat_storage_creation_enabled: true, flat_storage_creation_period: Duration::from_secs(1), state_sync_enabled, - state_sync: StateSyncConfig::default(), + state_sync: StateSyncConfig{ dump: None, sync: SyncConfig::Peers }, state_snapshot_every_n_blocks: None, transaction_pool_size_limit: None, enable_multiline_logging: false, diff --git a/nearcore/src/config.rs b/nearcore/src/config.rs index bfe36fdd513..2627357bd42 100644 --- a/nearcore/src/config.rs +++ b/nearcore/src/config.rs @@ -198,6 +198,14 @@ fn default_transaction_pool_size_limit() -> Option { Some(100_000_000) // 100 MB. } +fn default_state_sync_enabled() -> Option { + Some(true) +} + +fn default_state_sync() -> Option { + Some(StateSyncConfig::default()) +} + #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] pub struct Consensus { /// Minimum number of peers to start syncing. @@ -337,10 +345,10 @@ pub struct Config { #[serde(default, skip_serializing_if = "Option::is_none")] pub expected_shutdown: Option, /// Whether to use state sync (unreliable and corrupts the DB if fails) or do a block sync instead. - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default = "default_state_sync_enabled")] pub state_sync_enabled: Option, /// Options for syncing state. - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default = "default_state_sync")] pub state_sync: Option, /// Limit of the size of per-shard transaction pool measured in bytes. If not set, the size /// will be unbounded. From 043ea7c577437aaa667c06eb2062635d742ec617 Mon Sep 17 00:00:00 2001 From: Nikolay Kurtov Date: Tue, 8 Aug 2023 16:32:35 +0200 Subject: [PATCH 2/6] fix(state-sync): Enable State Sync from GCS by default --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0868e727b7e..4de4ae96d0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ * Number of transactions included in a chunk will be lowered if there is a congestion of more than 20000 delayed receipts in a shard. [#9222](https://github.com/near/nearcore/pull/9222) * Our more efficient and scalable V2 routing protocol is implemented. It shadows the V1 protocol for now while we verify its performance. [#9187](https://github.com/near/nearcore/pull/9187) * The default config now enables TIER1 outbound connections by default. [#9349](https://github.com/near/nearcore/pull/9349) -* Enable State Sync from GCS by default. +* Enable State Sync from GCS by default. [#9398](https://github.com/near/nearcore/pull/9398) ## 1.35.0 From 569e2304bbf0a36153ec7e7f3ed9e5b1cb1c82b7 Mon Sep 17 00:00:00 2001 From: Saketh Are Date: Tue, 8 Aug 2023 10:33:26 -0400 Subject: [PATCH 3/6] RoutingTableView: gracefully handle missing distance vectors (#9394) The RoutingTableView in the debug UI includes a table over the direct peers of the node: image Each active connection of the node has one row in the table. The last column is populated using the most recent distance vector shared by the direct peer. In some cases there may be a direct connection for which we have not received a distance vector yet. This PR allows the debug UI to gracefully handle such cases and display "null" instead of producing an error. --- tools/debug-ui/src/RoutingTableView.tsx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/debug-ui/src/RoutingTableView.tsx b/tools/debug-ui/src/RoutingTableView.tsx index b4c18ccb073..76b109694e6 100644 --- a/tools/debug-ui/src/RoutingTableView.tsx +++ b/tools/debug-ui/src/RoutingTableView.tsx @@ -63,11 +63,16 @@ export const RoutingTableView = ({ addr }: RoutingTableViewProps) => { {direct_peers.map((peer_id) => { const peer_label = peerLabels[peer_id]; + + const peer_distances = routingInfo.peer_distances[peer_id]; + const formatted_distances = peer_distances == null ? "null" : + peer_distances.distance.map((x) => x || '_').join(', '); + return ( {peer_id.substring(8, 14)}... {peer_label} - {routingInfo.peer_distances[peer_id].distance.join(', ')} + {formatted_distances} ); })} From ad9c15e1f046806c72fb986ff411753916f1967a Mon Sep 17 00:00:00 2001 From: Nikolay Kurtov Date: Tue, 8 Aug 2023 17:45:01 +0200 Subject: [PATCH 4/6] fix(state-sync): Enable State Sync from GCS by default --- CHANGELOG.md | 2 +- chain/client/src/client_actor.rs | 2 +- core/chain-configs/src/client_config.rs | 16 ++++++++++------ nearcore/src/config.rs | 12 ++---------- 4 files changed, 14 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4de4ae96d0c..f39afa0f454 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ * Number of transactions included in a chunk will be lowered if there is a congestion of more than 20000 delayed receipts in a shard. [#9222](https://github.com/near/nearcore/pull/9222) * Our more efficient and scalable V2 routing protocol is implemented. It shadows the V1 protocol for now while we verify its performance. [#9187](https://github.com/near/nearcore/pull/9187) * The default config now enables TIER1 outbound connections by default. [#9349](https://github.com/near/nearcore/pull/9349) -* Enable State Sync from GCS by default. [#9398](https://github.com/near/nearcore/pull/9398) +* State Sync from GCS is available for experimental use. [#9398](https://github.com/near/nearcore/pull/9398) ## 1.35.0 diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 87a2436ec23..41ddb5f973d 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -710,7 +710,7 @@ impl Handler> for ClientActor { sync_status: format!( "{} ({})", self.client.sync_status.as_variant_name().to_string(), - display_sync_status(&self.client.sync_status, &self.client.chain.head()?,), + display_sync_status(&self.client.sync_status, &self.client.chain.head()?), ), catchup_status: self.client.get_catchup_status()?, current_head_status: head.clone().into(), diff --git a/core/chain-configs/src/client_config.rs b/core/chain-configs/src/client_config.rs index 39f5c5763b9..717fe6c3b06 100644 --- a/core/chain-configs/src/client_config.rs +++ b/core/chain-configs/src/client_config.rs @@ -138,11 +138,7 @@ pub enum SyncConfig { impl Default for SyncConfig { fn default() -> Self { - Self::ExternalStorage(ExternalStorageConfig { - location: ExternalStorageLocation::GCS { bucket: "state-parts".to_string() }, - num_concurrent_requests: 4, - num_concurrent_requests_during_catchup: 4, - }) + Self::Peers } } @@ -152,9 +148,17 @@ pub struct StateSyncConfig { #[serde(skip_serializing_if = "Option::is_none")] /// `none` value disables state dump to external storage. pub dump: Option, + #[serde(skip_serializing_if = "SyncConfig::is_default", default = "SyncConfig::default")] pub sync: SyncConfig, } +impl SyncConfig { + /// Checks whether the object equals its default value. + fn is_default(&self) -> bool { + matches!(self, Self::Peers) + } +} + /// ClientConfig where some fields can be updated at runtime. #[derive(Clone, serde::Serialize)] pub struct ClientConfig { @@ -336,7 +340,7 @@ impl ClientConfig { flat_storage_creation_enabled: true, flat_storage_creation_period: Duration::from_secs(1), state_sync_enabled, - state_sync: StateSyncConfig{ dump: None, sync: SyncConfig::Peers }, + state_sync: StateSyncConfig::default(), state_snapshot_every_n_blocks: None, transaction_pool_size_limit: None, enable_multiline_logging: false, diff --git a/nearcore/src/config.rs b/nearcore/src/config.rs index 2627357bd42..bfe36fdd513 100644 --- a/nearcore/src/config.rs +++ b/nearcore/src/config.rs @@ -198,14 +198,6 @@ fn default_transaction_pool_size_limit() -> Option { Some(100_000_000) // 100 MB. } -fn default_state_sync_enabled() -> Option { - Some(true) -} - -fn default_state_sync() -> Option { - Some(StateSyncConfig::default()) -} - #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] pub struct Consensus { /// Minimum number of peers to start syncing. @@ -345,10 +337,10 @@ pub struct Config { #[serde(default, skip_serializing_if = "Option::is_none")] pub expected_shutdown: Option, /// Whether to use state sync (unreliable and corrupts the DB if fails) or do a block sync instead. - #[serde(default = "default_state_sync_enabled")] + #[serde(skip_serializing_if = "Option::is_none")] pub state_sync_enabled: Option, /// Options for syncing state. - #[serde(default = "default_state_sync")] + #[serde(skip_serializing_if = "Option::is_none")] pub state_sync: Option, /// Limit of the size of per-shard transaction pool measured in bytes. If not set, the size /// will be unbounded. From 497f8c7526b8747b4b4e2550b720b946f2c4c7eb Mon Sep 17 00:00:00 2001 From: Nikolay Kurtov Date: Tue, 8 Aug 2023 18:06:28 +0200 Subject: [PATCH 5/6] fix(state-sync): Enable State Sync from GCS by default --- chain/client/src/client_actor.rs | 6 +++++- chain/client/src/info.rs | 32 +++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs index 41ddb5f973d..5f3ebc2032e 100644 --- a/chain/client/src/client_actor.rs +++ b/chain/client/src/client_actor.rs @@ -710,7 +710,11 @@ impl Handler> for ClientActor { sync_status: format!( "{} ({})", self.client.sync_status.as_variant_name().to_string(), - display_sync_status(&self.client.sync_status, &self.client.chain.head()?), + display_sync_status( + &self.client.sync_status, + &self.client.chain.head()?, + &self.client.config.state_sync.sync, + ), ), catchup_status: self.client.get_catchup_status()?, current_head_status: head.clone().into(), diff --git a/chain/client/src/info.rs b/chain/client/src/info.rs index 0282a7ee725..42e8533cc2a 100644 --- a/chain/client/src/info.rs +++ b/chain/client/src/info.rs @@ -2,7 +2,7 @@ use crate::config_updater::ConfigUpdater; use crate::{metrics, SyncStatus}; use actix::Addr; use itertools::Itertools; -use near_chain_configs::{ClientConfig, LogSummaryStyle}; +use near_chain_configs::{ClientConfig, LogSummaryStyle, SyncConfig}; use near_client_primitives::types::StateSyncStatus; use near_network::types::NetworkInfo; use near_primitives::block::Tip; @@ -372,7 +372,8 @@ impl InfoHelper { let s = |num| if num == 1 { "" } else { "s" }; - let sync_status_log = Some(display_sync_status(sync_status, head)); + let sync_status_log = + Some(display_sync_status(sync_status, head, &client_config.state_sync.sync)); let catchup_status_log = display_catchup_status(catchup_status); let validator_info_log = validator_info.as_ref().map(|info| { format!( @@ -591,7 +592,11 @@ pub fn display_catchup_status(catchup_status: Vec) -> String .join("\n") } -pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String { +pub fn display_sync_status( + sync_status: &SyncStatus, + head: &Tip, + state_sync_config: &SyncConfig, +) -> String { metrics::SYNC_STATUS.set(sync_status.repr() as i64); match sync_status { SyncStatus::AwaitingPeers => format!("#{:>8} Waiting for peers", head.height), @@ -630,12 +635,29 @@ pub fn display_sync_status(sync_status: &SyncStatus, head: &Tip) -> String { ) } SyncStatus::StateSync(StateSyncStatus { sync_hash, sync_status: shard_statuses }) => { - let mut res = format!("State {sync_hash:?}"); + let mut res = format!("State {:?}", sync_hash); let mut shard_statuses: Vec<_> = shard_statuses.iter().collect(); shard_statuses.sort_by_key(|(shard_id, _)| *shard_id); for (shard_id, shard_status) in shard_statuses { - write!(res, "[{shard_id}: {}]", shard_status.status.to_string()).unwrap(); + write!(res, "[{}: {}]", shard_id, shard_status.status.to_string(),).unwrap(); } + match state_sync_config { + SyncConfig::Peers => { + tracing::warn!( + target: "stats", + "The node is trying to sync its State from its peers. The current implementation of this mechanism is known to be unreliable. It may never complete, or fail randomly and corrupt the DB.\n\ + Suggestions:\n\ + * Try to state sync from GCS. See `\"state_sync\"` and `\"state_sync_enabled\"` options in the reference `config.json` file. + or + * Disable state sync in the config. Add `\"state_sync_enabled\": false` to `config.json`, then download a recent data snapshot and restart the node."); + } + SyncConfig::ExternalStorage(_) => { + tracing::info!( + target: "stats", + "The node is trying to sync its State from external storage. The current implementation is experimental. Use carefully. If it fails, consider disabling state sync and restarting from a recent snapshot:\n\ + - Add `\"state_sync_enabled\": false` to `config.json`, then download a recent data snapshot and restart the node."); + } + }; res } SyncStatus::StateSyncDone => "State sync done".to_string(), From c5390fd05863a94afc11153e5a0cc05d8872b94f Mon Sep 17 00:00:00 2001 From: Nikolay Kurtov Date: Wed, 9 Aug 2023 11:22:04 +0200 Subject: [PATCH 6/6] fix(state-sync): Enable State Sync from GCS by default --- chain/client/src/info.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chain/client/src/info.rs b/chain/client/src/info.rs index 42e8533cc2a..e442f1b3bbe 100644 --- a/chain/client/src/info.rs +++ b/chain/client/src/info.rs @@ -654,7 +654,7 @@ pub fn display_sync_status( SyncConfig::ExternalStorage(_) => { tracing::info!( target: "stats", - "The node is trying to sync its State from external storage. The current implementation is experimental. Use carefully. If it fails, consider disabling state sync and restarting from a recent snapshot:\n\ + "The node is trying to sync its State from external storage. The current implementation is experimental. If it fails, consider disabling state sync and restarting from a recent snapshot:\n\ - Add `\"state_sync_enabled\": false` to `config.json`, then download a recent data snapshot and restart the node."); } };