Skip to content
This repository has been archived by the owner on Jun 23, 2022. It is now read-only.

Commit

Permalink
feat(split): add split related perf-counter (#789)
Browse files Browse the repository at this point in the history
  • Loading branch information
hycdong authored Mar 17, 2021
1 parent 5fcd16e commit d39b109
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 4 deletions.
17 changes: 17 additions & 0 deletions src/replica/replica_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,16 @@ class partition_split_context
public:
bool cleanup(bool force);
bool is_cleaned() const;
uint64_t total_ms() const
{
return splitting_start_ts_ns > 0 ? (dsn_now_ns() - splitting_start_ts_ns) / 1000000 : 0;
}
uint64_t async_learn_ms() const
{
return splitting_start_async_learn_ts_ns > 0
? (dsn_now_ns() - splitting_start_async_learn_ts_ns) / 1000000
: 0;
}

public:
gpid parent_gpid;
Expand All @@ -254,6 +264,13 @@ class partition_split_context
// partition split states checker, start when initialize child replica
// see more in function `child_check_split_context` and `parent_check_states`
task_ptr check_state_task;

// Used for split related perf-counter
uint64_t splitting_start_ts_ns{0};
uint64_t splitting_start_async_learn_ts_ns{0};
uint64_t splitting_copy_file_count{0};
uint64_t splitting_copy_file_size{0};
uint64_t splitting_copy_mutation_count{0};
};

//---------------inline impl----------------------------------------------------------------
Expand Down
67 changes: 67 additions & 0 deletions src/replica/replica_stub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,59 @@ void replica_stub::install_perf_counters()
COUNTER_TYPE_NUMBER,
"current tcmalloc release memory size");
#endif

// <- Partition split Metrics ->

_counter_replicas_splitting_count.init_app_counter("eon.replica_stub",
"replicas.splitting.count",
COUNTER_TYPE_NUMBER,
"current partition splitting count");

_counter_replicas_splitting_max_duration_time_ms.init_app_counter(
"eon.replica_stub",
"replicas.splitting.max.duration.time(ms)",
COUNTER_TYPE_NUMBER,
"current partition splitting max duration time(ms)");
_counter_replicas_splitting_max_async_learn_time_ms.init_app_counter(
"eon.replica_stub",
"replicas.splitting.max.async.learn.time(ms)",
COUNTER_TYPE_NUMBER,
"current partition splitting max async learn time(ms)");
_counter_replicas_splitting_max_copy_file_size.init_app_counter(
"eon.replica_stub",
"replicas.splitting.max.copy.file.size",
COUNTER_TYPE_NUMBER,
"current splitting max copy file size");
_counter_replicas_splitting_recent_start_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.start.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"current splitting start count in the recent period");
_counter_replicas_splitting_recent_copy_file_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.copy.file.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting copy file count in the recent period");
_counter_replicas_splitting_recent_copy_file_size.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.copy.file.size",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting copy file size in the recent period");
_counter_replicas_splitting_recent_copy_mutation_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.copy.mutation.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting copy mutation count in the recent period");
_counter_replicas_splitting_recent_split_succ_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.split.succ.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting succeed count in the recent period");
_counter_replicas_splitting_recent_split_fail_count.init_app_counter(
"eon.replica_stub",
"replicas.splitting.recent.split.fail.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"splitting fail count in the recent period");
}

void replica_stub::initialize(bool clear /* = false*/)
Expand Down Expand Up @@ -1736,6 +1789,10 @@ void replica_stub::on_gc()
uint64_t bulk_load_running_count = 0;
uint64_t bulk_load_max_ingestion_time_ms = 0;
uint64_t bulk_load_max_duration_time_ms = 0;
uint64_t splitting_count = 0;
uint64_t splitting_max_duration_time_ms = 0;
uint64_t splitting_max_async_learn_time_ms = 0;
uint64_t splitting_max_copy_file_size = 0;
for (auto &kv : rs) {
replica_ptr &rep = kv.second.rep;
if (rep->status() == partition_status::PS_POTENTIAL_SECONDARY) {
Expand All @@ -1762,6 +1819,16 @@ void replica_stub::on_gc()
std::max(bulk_load_max_duration_time_ms, rep->get_bulk_loader()->duration_ms());
}
}
// splitting_max_copy_file_size, rep->_split_states.copy_file_size
if (rep->status() == partition_status::PS_PARTITION_SPLIT) {
splitting_count++;
splitting_max_duration_time_ms =
std::max(splitting_max_duration_time_ms, rep->_split_states.total_ms());
splitting_max_async_learn_time_ms =
std::max(splitting_max_async_learn_time_ms, rep->_split_states.async_learn_ms());
splitting_max_copy_file_size =
std::max(splitting_max_copy_file_size, rep->_split_states.splitting_copy_file_size);
}
}

_counter_replicas_learning_count->set(learning_count);
Expand Down
13 changes: 13 additions & 0 deletions src/replica/replica_stub.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ namespace test {
class test_checker;
}
class cold_backup_context;
class replica_split_manager;

typedef std::unordered_map<gpid, replica_ptr> replicas;
typedef std::function<void(
Expand Down Expand Up @@ -473,6 +474,18 @@ class replica_stub : public serverlet<replica_stub>, public ref_counter
perf_counter_wrapper _counter_bulk_load_max_ingestion_time_ms;
perf_counter_wrapper _counter_bulk_load_max_duration_time_ms;

// <- Partition split Metrics ->
perf_counter_wrapper _counter_replicas_splitting_count;
perf_counter_wrapper _counter_replicas_splitting_max_duration_time_ms;
perf_counter_wrapper _counter_replicas_splitting_max_async_learn_time_ms;
perf_counter_wrapper _counter_replicas_splitting_max_copy_file_size;
perf_counter_wrapper _counter_replicas_splitting_recent_start_count;
perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_count;
perf_counter_wrapper _counter_replicas_splitting_recent_copy_file_size;
perf_counter_wrapper _counter_replicas_splitting_recent_copy_mutation_count;
perf_counter_wrapper _counter_replicas_splitting_recent_split_fail_count;
perf_counter_wrapper _counter_replicas_splitting_recent_split_succ_count;

dsn::task_tracker _tracker;
};
} // namespace replication
Expand Down
21 changes: 17 additions & 4 deletions src/replica/split/replica_split_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ void replica_split_manager::child_init_replica(gpid parent_gpid,
std::bind(&replica_split_manager::child_check_split_context, this),
get_gpid().thread_hash(),
std::chrono::seconds(3));
// TODO(heyuchen): add other states
_replica->_split_states.splitting_start_ts_ns = dsn_now_ns();
_stub->_counter_replicas_splitting_recent_start_count->increment();

ddebug_replica(
"child initialize succeed, init_ballot={}, parent_gpid={}", init_ballot, parent_gpid);
Expand Down Expand Up @@ -279,6 +280,7 @@ void replica_split_manager::child_copy_prepare_list(

// learning parent states is time-consuming, should execute in THREAD_POOL_REPLICATION_LONG
decree last_committed_decree = plist->last_committed_decree();
_replica->_split_states.splitting_start_async_learn_ts_ns = dsn_now_ns();
_replica->_split_states.async_learn_task =
tasking::enqueue(LPC_PARTITION_SPLIT_ASYNC_LEARN,
tracker(),
Expand Down Expand Up @@ -435,6 +437,11 @@ replica_split_manager::child_apply_private_logs(std::vector<std::string> plog_fi
return ec;
}

_replica->_split_states.splitting_copy_file_count += plog_files.size();
_replica->_split_states.splitting_copy_file_size += total_file_size;
_stub->_counter_replicas_splitting_recent_copy_file_count->add(plog_files.size());
_stub->_counter_replicas_splitting_recent_copy_file_size->add(total_file_size);

ddebug_replica("replay private_log files succeed, file count={}, app last_committed_decree={}",
plog_files.size(),
_replica->_app->last_committed_decree());
Expand All @@ -456,6 +463,8 @@ replica_split_manager::child_apply_private_logs(std::vector<std::string> plog_fi
plist.prepare(mu, partition_status::PS_SECONDARY);
++count;
}
_replica->_split_states.splitting_copy_mutation_count += count;
_stub->_counter_replicas_splitting_recent_copy_mutation_count->add(count);
plist.commit(last_committed_decree, COMMIT_TO_DECREE_HARD);
ddebug_replica(
"apply in-memory mutations succeed, mutation count={}, app last_committed_decree={}",
Expand Down Expand Up @@ -1105,6 +1114,7 @@ void replica_split_manager::child_partition_active(
_replica->_primary_states.last_prepare_decree_on_new_primary =
_replica->_prepare_list->max_decree();
_replica->update_configuration(config);
_stub->_counter_replicas_splitting_recent_split_succ_count->increment();
ddebug_replica("child partition is active, status={}", enum_to_string(status()));
}

Expand All @@ -1121,10 +1131,13 @@ void replica_split_manager::child_handle_split_error(
const std::string &error_msg) // on child partition
{
if (status() != partition_status::PS_ERROR) {
derror_replica("child partition split failed because {}, parent = {}",
derror_replica("child partition split failed because {}, parent = {}, split_duration = "
"{}ms, async_learn_duration = {}ms",
error_msg,
_replica->_split_states.parent_gpid);
// TODO(heyuchen): add perf-counter (split_failed_count)
_replica->_split_states.parent_gpid,
_replica->_split_states.total_ms(),
_replica->_split_states.async_learn_ms());
_stub->_counter_replicas_splitting_recent_split_fail_count->increment();
_replica->update_local_configuration_with_no_ballot_change(partition_status::PS_ERROR);
}
}
Expand Down

0 comments on commit d39b109

Please sign in to comment.