From 1aa504eb3be8b685a0c9cdb27efe1d9b2fd3ebdd Mon Sep 17 00:00:00 2001 From: heyuchen Date: Fri, 12 Mar 2021 15:34:46 +0800 Subject: [PATCH] feat(split): add split related perf-counter --- src/replica/replica_context.h | 17 ++++++ src/replica/replica_stub.cpp | 67 +++++++++++++++++++++ src/replica/replica_stub.h | 13 ++++ src/replica/split/replica_split_manager.cpp | 21 +++++-- 4 files changed, 114 insertions(+), 4 deletions(-) diff --git a/src/replica/replica_context.h b/src/replica/replica_context.h index 90dc6a3646..a5ab150b57 100644 --- a/src/replica/replica_context.h +++ b/src/replica/replica_context.h @@ -240,6 +240,16 @@ class partition_split_context public: bool cleanup(bool force); bool is_cleaned() const; + uint64_t total_ms() const + { + return splitting_start_ts_ns > 0 ? (dsn_now_ns() - splitting_start_ts_ns) / 1000000 : 0; + } + uint64_t async_learn_ms() const + { + return splitting_start_async_learn_ts_ns > 0 + ? (dsn_now_ns() - splitting_start_async_learn_ts_ns) / 1000000 + : 0; + } public: gpid parent_gpid; @@ -254,6 +264,13 @@ class partition_split_context // partition split states checker, start when initialize child replica // see more in function `child_check_split_context` and `parent_check_states` task_ptr check_state_task; + + // Used for split related perf-counter + uint64_t splitting_start_ts_ns{0}; + uint64_t splitting_start_async_learn_ts_ns{0}; + uint64_t splitting_copy_file_count{0}; + uint64_t splitting_copy_file_size{0}; + uint64_t splitting_copy_mutation_count{0}; }; //---------------inline impl---------------------------------------------------------------- diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index 6fbf60619d..3452e5b2a8 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -405,6 +405,59 @@ void replica_stub::install_perf_counters() COUNTER_TYPE_NUMBER, "current tcmalloc release memory size"); #endif + + // <- Partition split Metrics -> + + _counter_replicas_splitting_count.init_app_counter("eon.replica_stub", + "replicas.splitting.count", + COUNTER_TYPE_NUMBER, + "current partition splitting count"); + + _counter_replicas_splitting_max_duration_time_ms.init_app_counter( + "eon.replica_stub", + "replicas.splitting.max.duration.time(ms)", + COUNTER_TYPE_NUMBER, + "current partition splitting max duration time(ms)"); + _counter_replicas_splitting_max_async_learn_time_ms.init_app_counter( + "eon.replica_stub", + "replicas.splitting.max.async.learn.time(ms)", + COUNTER_TYPE_NUMBER, + "current partition splitting max async learn time(ms)"); + _counter_replicas_splitting_max_copy_file_size.init_app_counter( + "eon.replica_stub", + "replicas.splitting.max.copy.file.size", + COUNTER_TYPE_NUMBER, + "current splitting max copy file size"); + _counter_replicas_splitting_recent_start_count.init_app_counter( + "eon.replica_stub", + "replicas.splitting.recent.start.count", + COUNTER_TYPE_VOLATILE_NUMBER, + "current splitting start count in the recent period"); + _counter_replicas_splitting_recent_copy_file_count.init_app_counter( + "eon.replica_stub", + "replicas.splitting.recent.copy.file.count", + COUNTER_TYPE_VOLATILE_NUMBER, + "splitting copy file count in the recent period"); + _counter_replicas_splitting_recent_copy_file_size.init_app_counter( + "eon.replica_stub", + "replicas.splitting.recent.copy.file.size", + COUNTER_TYPE_VOLATILE_NUMBER, + "splitting copy file size in the recent period"); + _counter_replicas_splitting_recent_copy_mutation_count.init_app_counter( + "eon.replica_stub", + "replicas.splitting.recent.copy.mutation.count", + COUNTER_TYPE_VOLATILE_NUMBER, + "splitting copy mutation count in the recent period"); + _counter_replicas_splitting_recent_split_succ_count.init_app_counter( + "eon.replica_stub", + "replicas.splitting.recent.split.succ.count", + COUNTER_TYPE_VOLATILE_NUMBER, + "splitting succeed count in the recent period"); + _counter_replicas_splitting_recent_split_fail_count.init_app_counter( + "eon.replica_stub", + "replicas.splitting.recent.split.fail.count", + COUNTER_TYPE_VOLATILE_NUMBER, + "splitting fail count in the recent period"); } void replica_stub::initialize(bool clear /* = false*/) @@ -1736,6 +1789,10 @@ void replica_stub::on_gc() uint64_t bulk_load_running_count = 0; uint64_t bulk_load_max_ingestion_time_ms = 0; uint64_t bulk_load_max_duration_time_ms = 0; + uint64_t splitting_count = 0; + uint64_t splitting_max_duration_time_ms = 0; + uint64_t splitting_max_async_learn_time_ms = 0; + uint64_t splitting_max_copy_file_size = 0; for (auto &kv : rs) { replica_ptr &rep = kv.second.rep; if (rep->status() == partition_status::PS_POTENTIAL_SECONDARY) { @@ -1762,6 +1819,16 @@ void replica_stub::on_gc() std::max(bulk_load_max_duration_time_ms, rep->get_bulk_loader()->duration_ms()); } } + // splitting_max_copy_file_size, rep->_split_states.copy_file_size + if (rep->status() == partition_status::PS_PARTITION_SPLIT) { + splitting_count++; + splitting_max_duration_time_ms = + std::max(splitting_max_duration_time_ms, rep->_split_states.total_ms()); + splitting_max_async_learn_time_ms = + std::max(splitting_max_async_learn_time_ms, rep->_split_states.async_learn_ms()); + splitting_max_copy_file_size = + std::max(splitting_max_copy_file_size, rep->_split_states.splitting_copy_file_size); + } } _counter_replicas_learning_count->set(learning_count); diff --git a/src/replica/replica_stub.h b/src/replica/replica_stub.h index 6877c77b50..7e7ac632ce 100644 --- a/src/replica/replica_stub.h +++ b/src/replica/replica_stub.h @@ -68,6 +68,7 @@ namespace test { class test_checker; } class cold_backup_context; +class replica_split_manager; typedef std::unordered_map replicas; typedef std::function, public ref_counter perf_counter_wrapper _counter_bulk_load_max_ingestion_time_ms; perf_counter_wrapper _counter_bulk_load_max_duration_time_ms; + // <- Partition split Metrics -> + perf_counter_wrapper _counter_replicas_splitting_count; + perf_counter_wrapper _counter_replicas_splitting_max_duration_time_ms; + perf_counter_wrapper _counter_replicas_splitting_max_async_learn_time_ms; + perf_counter_wrapper _counter_replicas_splitting_max_copy_file_size; + perf_counter_wrapper _counter_replicas_splitting_recent_start_count; + perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_count; + perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_size; + perf_counter_wrapper _counter_replicas_splitting_recent_copy_mutation_count; + perf_counter_wrapper _counter_replicas_splitting_recent_split_fail_count; + perf_counter_wrapper _counter_replicas_splitting_recent_split_succ_count; + dsn::task_tracker _tracker; }; } // namespace replication diff --git a/src/replica/split/replica_split_manager.cpp b/src/replica/split/replica_split_manager.cpp index 98662afaaa..914af37e51 100644 --- a/src/replica/split/replica_split_manager.cpp +++ b/src/replica/split/replica_split_manager.cpp @@ -127,7 +127,8 @@ void replica_split_manager::child_init_replica(gpid parent_gpid, std::bind(&replica_split_manager::child_check_split_context, this), get_gpid().thread_hash(), std::chrono::seconds(3)); - // TODO(heyuchen): add other states + _replica->_split_states.splitting_start_ts_ns = dsn_now_ns(); + _stub->_counter_replicas_splitting_recent_start_count->increment(); ddebug_replica( "child initialize succeed, init_ballot={}, parent_gpid={}", init_ballot, parent_gpid); @@ -279,6 +280,7 @@ void replica_split_manager::child_copy_prepare_list( // learning parent states is time-consuming, should execute in THREAD_POOL_REPLICATION_LONG decree last_committed_decree = plist->last_committed_decree(); + _replica->_split_states.splitting_start_async_learn_ts_ns = dsn_now_ns(); _replica->_split_states.async_learn_task = tasking::enqueue(LPC_PARTITION_SPLIT_ASYNC_LEARN, tracker(), @@ -435,6 +437,11 @@ replica_split_manager::child_apply_private_logs(std::vector plog_fi return ec; } + _replica->_split_states.splitting_copy_file_count += plog_files.size(); + _replica->_split_states.splitting_copy_file_size += total_file_size; + _stub->_counter_replicas_splitting_recent_copy_file_count->add(plog_files.size()); + _stub->_counter_replicas_splitting_recent_copy_file_size->add(total_file_size); + ddebug_replica("replay private_log files succeed, file count={}, app last_committed_decree={}", plog_files.size(), _replica->_app->last_committed_decree()); @@ -456,6 +463,8 @@ replica_split_manager::child_apply_private_logs(std::vector plog_fi plist.prepare(mu, partition_status::PS_SECONDARY); ++count; } + _replica->_split_states.splitting_copy_mutation_count += count; + _stub->_counter_replicas_splitting_recent_copy_mutation_count->add(count); plist.commit(last_committed_decree, COMMIT_TO_DECREE_HARD); ddebug_replica( "apply in-memory mutations succeed, mutation count={}, app last_committed_decree={}", @@ -1105,6 +1114,7 @@ void replica_split_manager::child_partition_active( _replica->_primary_states.last_prepare_decree_on_new_primary = _replica->_prepare_list->max_decree(); _replica->update_configuration(config); + _stub->_counter_replicas_splitting_recent_split_succ_count->increment(); ddebug_replica("child partition is active, status={}", enum_to_string(status())); } @@ -1121,10 +1131,13 @@ void replica_split_manager::child_handle_split_error( const std::string &error_msg) // on child partition { if (status() != partition_status::PS_ERROR) { - derror_replica("child partition split failed because {}, parent = {}", + derror_replica("child partition split failed because {}, parent = {}, split_duration = " + "{}ms, async_learn_duration = {}ms", error_msg, - _replica->_split_states.parent_gpid); - // TODO(heyuchen): add perf-counter (split_failed_count) + _replica->_split_states.parent_gpid, + _replica->_split_states.total_ms(), + _replica->_split_states.async_learn_ms()); + _stub->_counter_replicas_splitting_recent_split_fail_count->increment(); _replica->update_local_configuration_with_no_ballot_change(partition_status::PS_ERROR); } }