Skip to content

Commit

Permalink
KVStore: Add metrics for AddLock failure (pingcap#9170) (pingcap#229)
Browse files Browse the repository at this point in the history
Signed-off-by: CalvinNeo <calvinneo1995@gmail.com>
  • Loading branch information
CalvinNeo authored Jun 28, 2024
1 parent aa2eb41 commit 2f08969
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 5 deletions.
1 change: 1 addition & 0 deletions dbms/src/Common/TiFlashMetrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,7 @@ namespace DB
F(type_failed_timeout, {{"type", "failed_timeout"}}), \
F(type_failed_baddata, {{"type", "failed_baddata"}}), \
F(type_failed_repeated, {{"type", "failed_repeated"}}), \
F(type_failed_build_chkpt, {{"type", "failed_build_chkpt"}}), \
F(type_restore, {{"type", "restore"}}), \
F(type_succeed, {{"type", "succeed"}})) \
M(tiflash_fap_task_state, \
Expand Down
14 changes: 12 additions & 2 deletions dbms/src/Flash/Disaggregated/S3LockClient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,12 @@ std::pair<bool, String> S3LockClient::makeCall(
{
if (Clock::now() > deadline)
{
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "deadline exceed, " + tracing_log->identifier());
throw Exception(
ErrorCodes::TIMEOUT_EXCEEDED,
"deadline exceed, {}, address={}, request={}",
tracing_log->identifier(),
address,
req.ShortDebugString());
}
// retry
LOG_ERROR(
Expand All @@ -141,7 +146,12 @@ std::pair<bool, String> S3LockClient::makeCall(
{
if (Clock::now() > deadline)
{
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "deadline exceed, " + tracing_log->identifier());
throw Exception(
ErrorCodes::TIMEOUT_EXCEEDED,
"deadline exceed, {}, address={}, request={}",
tracing_log->identifier(),
address,
req.ShortDebugString());
}
// retry
auto not_owner = resp.result().not_owner();
Expand Down
17 changes: 15 additions & 2 deletions dbms/src/Storages/KVStore/MultiRaft/Disagg/FastAddPeer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,20 @@ FastAddPeerRes FastAddPeerImplWrite(
return genFastAddPeerRes(FastAddPeerStatus::Canceled, "", "");
}

auto segments = dm_storage->buildSegmentsFromCheckpointInfo(new_key_range, checkpoint_info, settings);
DM::Segments segments;
try
{
segments = dm_storage->buildSegmentsFromCheckpointInfo(new_key_range, checkpoint_info, settings);
}
catch (...)
{
// It will call `createTargetSegmentsFromCheckpoint`, which will build delta and stable space for all segments.
// For every remote page referred to, `createS3LockForWriteBatch` will lock it on S3 to prevent it from being GC-ed.
// Failure to create a lock results in an Exception, causing FAP to fall back with a BadData error.
// A typical failure is that this TiFlash node fails to communicate with other TiFlash nodes.
GET_METRIC(tiflash_fap_task_result, type_failed_build_chkpt).Increment();
throw;
}
GET_METRIC(tiflash_fap_task_duration_seconds, type_write_stage_build).Observe(watch.elapsedSecondsFromLastTime());

fap_ctx->insertCheckpointIngestInfo(
Expand Down Expand Up @@ -520,7 +533,7 @@ uint8_t ApplyFapSnapshotImpl(
// `region_to_ingest` is not the region in kvstore.
auto region_to_ingest = checkpoint_ingest_info->getRegion();
RUNTIME_CHECK(region_to_ingest != nullptr);
if (!(region_to_ingest->appliedIndex() == index && region_to_ingest->appliedIndexTerm() == term))
if (region_to_ingest->appliedIndex() != index || region_to_ingest->appliedIndexTerm() != term)
{
if (assert_exist)
{
Expand Down
2 changes: 1 addition & 1 deletion dbms/src/Storages/KVStore/MultiRaft/PrehandleSnapshot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ static void runInParallel(
{
// The exception can be wrapped in the future, however, we abort here.
const auto & processed_keys = part_sst_stream->getProcessKeys();
LOG_INFO(
LOG_WARNING(
log,
"Parallel prehandling error {}"
" write_cf_off={} split_id={} region_id={}",
Expand Down

0 comments on commit 2f08969

Please sign in to comment.