Skip to content

Commit

Permalink
fix: logs and traces (#10231)
Browse files Browse the repository at this point in the history
No functional changes.

Improving logging and traces to enable investigation of the network wide
issue of validators missing chunks.

Logging changes are directed to making it easier to see why a node might
have missed a chunk or a partial encoded chunk.

Current state of investigation:
Validators are missing chunks, because they are in a syncing state.
Unclear why the validator nodes are in a syncing state.

Tested by running a testnet node.
The logs have a lot of `process_partial_encoded_chunk` logs, but nothing
is overly spammy.
  • Loading branch information
nikurt authored Nov 24, 2023
1 parent 979102f commit 0688056
Show file tree
Hide file tree
Showing 14 changed files with 337 additions and 123 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 34 additions & 10 deletions chain/chain/src/chain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ use std::collections::{HashMap, HashSet};
use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use std::time::{Duration as TimeDuration, Instant};
use tracing::{debug, error, info, warn, Span};
use tracing::{debug, debug_span, error, info, warn, Span};

/// Maximum number of orphans chain can store.
pub const MAX_ORPHAN_SIZE: usize = 1024;
Expand Down Expand Up @@ -358,6 +358,15 @@ pub struct BlockMissingChunks {
pub missing_chunks: Vec<ShardChunkHeader>,
}

impl Debug for BlockMissingChunks {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("BlockMissingChunks")
.field("prev_hash", &self.prev_hash)
.field("num_missing_chunks", &self.missing_chunks.len())
.finish()
}
}

/// Contains information needed to request chunks for orphans
/// Fields will be used as arguments for `request_chunks_for_orphan`
pub struct OrphanMissingChunks {
Expand All @@ -370,6 +379,16 @@ pub struct OrphanMissingChunks {
pub ancestor_hash: CryptoHash,
}

impl Debug for OrphanMissingChunks {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_struct("OrphanMissingChunks")
.field("epoch_id", &self.epoch_id)
.field("ancestor_hash", &self.ancestor_hash)
.field("num_missing_chunks", &self.missing_chunks.len())
.finish()
}
}

/// Check if block header is known
/// Returns Err(Error) if any error occurs when checking store
/// Ok(Err(BlockKnownError)) if the block header is known
Expand Down Expand Up @@ -1778,6 +1797,8 @@ impl Chain {
block_processing_artifacts: &mut BlockProcessingArtifact,
apply_chunks_done_callback: DoneApplyChunkCallback,
) -> Result<(), Error> {
let _span =
debug_span!(target: "chain", "start_process_block_async", ?provenance).entered();
let block_received_time = StaticClock::instant();
metrics::BLOCK_PROCESSING_ATTEMPTS_TOTAL.inc();

Expand Down Expand Up @@ -1818,6 +1839,7 @@ impl Chain {
block_processing_artifacts: &mut BlockProcessingArtifact,
apply_chunks_done_callback: DoneApplyChunkCallback,
) -> (Vec<AcceptedBlock>, HashMap<CryptoHash, Error>) {
let _span = debug_span!(target: "chain", "postprocess_ready_blocks_chain").entered();
let mut accepted_blocks = vec![];
let mut errors = HashMap::new();
while let Ok((block_hash, apply_result)) = self.apply_chunks_receiver.try_recv() {
Expand Down Expand Up @@ -2083,11 +2105,8 @@ impl Chain {
block_received_time: Instant,
) -> Result<(), Error> {
let block_height = block.header().height();
let _span = tracing::debug_span!(
target: "chain",
"start_process_block_impl",
height = block_height)
.entered();
let _span =
debug_span!(target: "chain", "start_process_block_impl", block_height).entered();
// 0) Before we proceed with any further processing, we first check that the block
// hash and signature matches to make sure the block is indeed produced by the assigned
// block producer. If not, we drop the block immediately
Expand Down Expand Up @@ -2234,7 +2253,7 @@ impl Chain {
) {
let sc = self.apply_chunks_sender.clone();
spawn(move || {
// do_apply_chunks runs `work` parallelly, but still waits for all of them to finish
// do_apply_chunks runs `work` in parallel, but still waits for all of them to finish
let res = do_apply_chunks(block_hash, block_height, work);
// If we encounter error here, that means the receiver is deallocated and the client
// thread is already shut down. The node is already crashed, so we can unwrap here
Expand Down Expand Up @@ -2803,8 +2822,13 @@ impl Chain {
block_processing_artifacts: &mut BlockProcessingArtifact,
apply_chunks_done_callback: DoneApplyChunkCallback,
) {
let _span = debug_span!(
target: "chain",
"check_orphans",
?prev_hash,
num_orphans = self.orphans.len())
.entered();
// Check if there are orphans we can process.
debug!(target: "chain", "Check orphans: from {}, # total orphans {}", prev_hash, self.orphans.len());
// check within the descendents of `prev_hash` to see if there are orphans there that
// are ready to request missing chunks for
let orphans_to_check =
Expand Down Expand Up @@ -5264,7 +5288,7 @@ impl<'a> ChainUpdate<'a> {
let prev_hash = block.header().prev_hash();
let results = apply_chunks_results.into_iter().map(|x| {
if let Err(err) = &x {
warn!(target:"chain", hash = %block.hash(), error = %err, "Error in applying chunks for block");
warn!(target: "chain", hash = %block.hash(), %err, "Error in applying chunks for block");
}
x
}).collect::<Result<Vec<_>, Error>>()?;
Expand Down Expand Up @@ -5903,7 +5927,7 @@ pub fn do_apply_chunks(
// a single span.
task(&parent_span)
})
.collect::<Vec<_>>()
.collect()
}

pub fn collect_receipts<'a, T>(receipt_proofs: T) -> Vec<Receipt>
Expand Down
36 changes: 32 additions & 4 deletions chain/chain/src/doomslug.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use near_primitives::hash::CryptoHash;
use near_primitives::static_clock::StaticClock;
use near_primitives::types::{AccountId, ApprovalStake, Balance, BlockHeight, BlockHeightDelta};
use near_primitives::validator_signer::ValidatorSigner;
use tracing::info;
use tracing::{debug, debug_span, field, info};

/// Have that many iterations in the timer instead of `loop` to prevent potential bugs from blocking
/// the node
Expand Down Expand Up @@ -699,6 +699,15 @@ impl Doomslug {
has_enough_chunks: bool,
log_block_production_info: bool,
) -> bool {
let span = debug_span!(
target: "doomslug",
"ready_to_produce_block",
has_enough_chunks,
target_height,
enough_approvals_for = field::Empty,
ready_to_produce_block = field::Empty,
need_to_wait = field::Empty)
.entered();
let hash_or_height =
ApprovalInner::new(&self.tip.block_hash, self.tip.height, target_height);
if let Some(approval_trackers_at_height) = self.approval_tracking.get_mut(&target_height) {
Expand All @@ -710,9 +719,16 @@ impl Doomslug {
match block_production_readiness {
DoomslugBlockProductionReadiness::NotReady => false,
DoomslugBlockProductionReadiness::ReadySince(when) => {
let enough_approvals_for = now.saturating_duration_since(when);
span.record("enough_approvals_for", enough_approvals_for.as_secs_f64());
span.record("ready_to_produce_block", true);
if has_enough_chunks {
if log_block_production_info {
info!("ready to produce block @ {}, has enough approvals for {:?}, has enough chunks", target_height, now.saturating_duration_since(when));
info!(
target: "doomslug",
target_height,
?enough_approvals_for,
"ready to produce block, has enough approvals, has enough chunks");
}
true
} else {
Expand All @@ -721,21 +737,33 @@ impl Doomslug {
) / 6;

let ready = now > when + delay;
span.record("need_to_wait", !ready);
if log_block_production_info {
if ready {
info!("ready to produce block @ {}, has enough approvals for {:?}, does not have enough chunks", target_height, now.saturating_duration_since(when));
info!(
target: "doomslug",
target_height,
?enough_approvals_for,
"ready to produce block, but does not have enough chunks");
} else {
info!("not ready to produce block @ {}, need to wait {:?}, has enough approvals for {:?}", target_height, (when + delay).saturating_duration_since(now), now.saturating_duration_since(when));
info!(
target: "doomslug",
target_height,
need_to_wait_for = ?(when + delay).saturating_duration_since(now),
?enough_approvals_for,
"not ready to produce block, need to wait");
}
}
ready
}
}
}
} else {
debug!(target: "doomslug", target_height, ?hash_or_height, "No approval tracker");
false
}
} else {
debug!(target: "doomslug", target_height, "No approval trackers at height");
false
}
}
Expand Down
Loading

0 comments on commit 0688056

Please sign in to comment.