Skip to content
This repository has been archived by the owner on Jun 23, 2022. It is now read-only.

feat(split): add split related perf-counter #789

Merged
merged 1 commit into from
Mar 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/replica/replica_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,16 @@ class partition_split_context
public:
bool cleanup(bool force);
bool is_cleaned() const;
uint64_t total_ms() const
{
return splitting_start_ts_ns > 0 ? (dsn_now_ns() - splitting_start_ts_ns) / 1000000 : 0;
}
uint64_t async_learn_ms() const
{
return splitting_start_async_learn_ts_ns > 0
? (dsn_now_ns() - splitting_start_async_learn_ts_ns) / 1000000
: 0;
}

public:
gpid parent_gpid;
Expand All @@ -254,6 +264,13 @@ class partition_split_context
// partition split states checker, start when initialize child replica
// see more in function `child_check_split_context` and `parent_check_states`
task_ptr check_state_task;

// Used for split related perf-counter
uint64_t splitting_start_ts_ns{0};
uint64_t splitting_start_async_learn_ts_ns{0};
uint64_t splitting_copy_file_count{0};
uint64_t splitting_copy_file_size{0};
uint64_t splitting_copy_mutation_count{0};
};

//---------------inline impl----------------------------------------------------------------
Expand Down
67 changes: 67 additions & 0 deletions src/replica/replica_stub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,59 @@ void replica_stub::install_perf_counters()
COUNTER_TYPE_NUMBER,
"current tcmalloc release memory size");
#endif

// <- Partition split Metrics ->

_counter_replicas_splitting_count.init_app_counter("eon.replica_stub",
"replicas.splitting.count",
COUNTER_TYPE_NUMBER,
"current partition splitting count");

_counter_replicas_splitting_max_duration_time_ms.init_app_counter(
"eon.replica_stub",
"replicas.splitting.max.duration.time(ms)",
COUNTER_TYPE_NUMBER,
"current partition splitting max duration time(ms)");
_counter_replicas_splitting_max_async_learn_time_ms.init_app_counter(
"eon.replica_stub",
"replicas.splitting.max.async.learn.time(ms)",
COUNTER_TYPE_NUMBER,
"current partition splitting max async learn time(ms)");
_counter_replicas_splitting_max_copy_file_size.init_app_counter(
"eon.replica_stub",
"replicas.splitting.max.copy.file.size",
COUNTER_TYPE_NUMBER,
"current splitting max copy file size");
_counter_replicas_splitting_recent_start_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.start.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"current splitting start count in the recent period");
_counter_replicas_splitting_recent_copy_file_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.copy.file.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting copy file count in the recent period");
_counter_replicas_splitting_recent_copy_file_size.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.copy.file.size",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting copy file size in the recent period");
_counter_replicas_splitting_recent_copy_mutation_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.copy.mutation.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting copy mutation count in the recent period");
_counter_replicas_splitting_recent_split_succ_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.split.succ.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting succeed count in the recent period");
_counter_replicas_splitting_recent_split_fail_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.split.fail.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting fail count in the recent period");
}

void replica_stub::initialize(bool clear /* = false*/)
Expand Down Expand Up @@ -1736,6 +1789,10 @@ void replica_stub::on_gc()
uint64_t bulk_load_running_count = 0;
uint64_t bulk_load_max_ingestion_time_ms = 0;
uint64_t bulk_load_max_duration_time_ms = 0;
uint64_t splitting_count = 0;
uint64_t splitting_max_duration_time_ms = 0;
uint64_t splitting_max_async_learn_time_ms = 0;
uint64_t splitting_max_copy_file_size = 0;
for (auto &kv : rs) {
replica_ptr &rep = kv.second.rep;
if (rep->status() == partition_status::PS_POTENTIAL_SECONDARY) {
Expand All @@ -1762,6 +1819,16 @@ void replica_stub::on_gc()
std::max(bulk_load_max_duration_time_ms, rep->get_bulk_loader()->duration_ms());
}
}
// splitting_max_copy_file_size, rep->_split_states.copy_file_size
if (rep->status() == partition_status::PS_PARTITION_SPLIT) {
splitting_count++;
splitting_max_duration_time_ms =
std::max(splitting_max_duration_time_ms, rep->_split_states.total_ms());
splitting_max_async_learn_time_ms =
std::max(splitting_max_async_learn_time_ms, rep->_split_states.async_learn_ms());
splitting_max_copy_file_size =
std::max(splitting_max_copy_file_size, rep->_split_states.splitting_copy_file_size);
}
}

_counter_replicas_learning_count->set(learning_count);
Expand Down
13 changes: 13 additions & 0 deletions src/replica/replica_stub.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ namespace test {
class test_checker;
}
class cold_backup_context;
class replica_split_manager;

typedef std::unordered_map<gpid, replica_ptr> replicas;
typedef std::function<void(
Expand Down Expand Up @@ -473,6 +474,18 @@ class replica_stub : public serverlet<replica_stub>, public ref_counter
perf_counter_wrapper _counter_bulk_load_max_ingestion_time_ms;
perf_counter_wrapper _counter_bulk_load_max_duration_time_ms;

// <- Partition split Metrics ->
perf_counter_wrapper _counter_replicas_splitting_count;
perf_counter_wrapper _counter_replicas_splitting_max_duration_time_ms;
perf_counter_wrapper _counter_replicas_splitting_max_async_learn_time_ms;
perf_counter_wrapper _counter_replicas_splitting_max_copy_file_size;
perf_counter_wrapper _counter_replicas_splitting_recent_start_count;
perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_count;
perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_size;
perf_counter_wrapper _counter_replicas_splitting_recent_copy_mutation_count;
perf_counter_wrapper _counter_replicas_splitting_recent_split_fail_count;
perf_counter_wrapper _counter_replicas_splitting_recent_split_succ_count;

dsn::task_tracker _tracker;
};
} // namespace replication
Expand Down
21 changes: 17 additions & 4 deletions src/replica/split/replica_split_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ void replica_split_manager::child_init_replica(gpid parent_gpid,
std::bind(&replica_split_manager::child_check_split_context, this),
get_gpid().thread_hash(),
std::chrono::seconds(3));
// TODO(heyuchen): add other states
_replica->_split_states.splitting_start_ts_ns = dsn_now_ns();
_stub->_counter_replicas_splitting_recent_start_count->increment();

ddebug_replica(
"child initialize succeed, init_ballot={}, parent_gpid={}", init_ballot, parent_gpid);
Expand Down Expand Up @@ -279,6 +280,7 @@ void replica_split_manager::child_copy_prepare_list(

// learning parent states is time-consuming, should execute in THREAD_POOL_REPLICATION_LONG
decree last_committed_decree = plist->last_committed_decree();
_replica->_split_states.splitting_start_async_learn_ts_ns = dsn_now_ns();
_replica->_split_states.async_learn_task =
tasking::enqueue(LPC_PARTITION_SPLIT_ASYNC_LEARN,
tracker(),
Expand Down Expand Up @@ -435,6 +437,11 @@ replica_split_manager::child_apply_private_logs(std::vector<std::string> plog_fi
return ec;
}

_replica->_split_states.splitting_copy_file_count += plog_files.size();
_replica->_split_states.splitting_copy_file_size += total_file_size;
_stub->_counter_replicas_splitting_recent_copy_file_count->add(plog_files.size());
_stub->_counter_replicas_splitting_recent_copy_file_size->add(total_file_size);

ddebug_replica("replay private_log files succeed, file count={}, app last_committed_decree={}",
plog_files.size(),
_replica->_app->last_committed_decree());
Expand All @@ -456,6 +463,8 @@ replica_split_manager::child_apply_private_logs(std::vector<std::string> plog_fi
plist.prepare(mu, partition_status::PS_SECONDARY);
++count;
}
_replica->_split_states.splitting_copy_mutation_count += count;
_stub->_counter_replicas_splitting_recent_copy_mutation_count->add(count);
plist.commit(last_committed_decree, COMMIT_TO_DECREE_HARD);
ddebug_replica(
"apply in-memory mutations succeed, mutation count={}, app last_committed_decree={}",
Expand Down Expand Up @@ -1105,6 +1114,7 @@ void replica_split_manager::child_partition_active(
_replica->_primary_states.last_prepare_decree_on_new_primary =
_replica->_prepare_list->max_decree();
_replica->update_configuration(config);
_stub->_counter_replicas_splitting_recent_split_succ_count->increment();
ddebug_replica("child partition is active, status={}", enum_to_string(status()));
}

Expand All @@ -1121,10 +1131,13 @@ void replica_split_manager::child_handle_split_error(
const std::string &error_msg) // on child partition
{
if (status() != partition_status::PS_ERROR) {
derror_replica("child partition split failed because {}, parent = {}",
derror_replica("child partition split failed because {}, parent = {}, split_duration = "
"{}ms, async_learn_duration = {}ms",
error_msg,
_replica->_split_states.parent_gpid);
// TODO(heyuchen): add perf-counter (split_failed_count)
_replica->_split_states.parent_gpid,
_replica->_split_states.total_ms(),
_replica->_split_states.async_learn_ms());
_stub->_counter_replicas_splitting_recent_split_fail_count->increment();
_replica->update_local_configuration_with_no_ballot_change(partition_status::PS_ERROR);
}
}
Expand Down