Skip to content

Commit

Permalink
chore: add oom stats to /metrics (#2680)
Browse files Browse the repository at this point in the history
* chore: add oom stats to /metrics

Expose oom/cmd errors when we reject executing a command if we reached OOM state (controlled by oom_deny_ratio flag).
Expose oom/insert errors when we do not insert a new key or do not grow a dashtable (controlled by table_growth_margin).

Move OOM command check to a place that covers all types of transactions - including multi and squashing transactions. 
 
---------

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
  • Loading branch information
romange authored Mar 3, 2024
1 parent 7c443f3 commit 0c11509
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 12 deletions.
24 changes: 13 additions & 11 deletions src/server/main_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -968,7 +968,19 @@ static optional<ErrorReply> VerifyConnectionAclStatus(const CommandId* cid,
// Decides whether command `cid` may be executed for connection `cntx`.
//
// Two checks run in order:
//   1. OOM guard - applies only to commands whose opt_mask() carries CO::DENYOOM
//      and only when the ServerState obtained from ServerState::tlocal() has
//      is_master set. If the used memory reported by that state exceeds
//      max_memory_limit * FLAGS_oom_deny_ratio, the command is rejected with
//      kOutOfMemory and stats.oom_error_cmd_cnt is incremented.
//   2. ACL check - delegated to VerifyConnectionAclStatus together with `tail_args`.
//
// Returns the OOM error when the guard fires; otherwise returns whatever
// VerifyConnectionAclStatus returns.
optional<ErrorReply> Service::VerifyCommandExecution(const CommandId* cid,
                                                     const ConnectionContext* cntx,
                                                     CmdArgList tail_args) {
  ServerState& etl = *ServerState::tlocal();

  const bool oom_guard_applies = (cid->opt_mask() & CO::DENYOOM) && etl.is_master;
  if (oom_guard_applies) {
    // The current timestamp is handed to GetUsedMemory, which is what yields
    // the memory figure compared against the limit below.
    uint64_t now_ns = absl::GetCurrentTimeNanos();
    uint64_t mem_in_use = etl.GetUsedMemory(now_ns);

    double deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
    const double deny_threshold = max_memory_limit * deny_ratio;

    if (mem_in_use > deny_threshold) {
      // Count the rejection before reporting it to the caller.
      ++etl.stats.oom_error_cmd_cnt;
      return facade::ErrorReply{kOutOfMemory};
    }
  }

  return VerifyConnectionAclStatus(cid, cntx, "ACL rules changed between the MULTI and EXEC",
                                   tail_args);
}
Expand Down Expand Up @@ -1136,16 +1148,6 @@ void Service::DispatchCommand(CmdArgList args, facade::ConnectionContext* cntx)
return cntx->SendSimpleString("QUEUED");
}

if (cid->opt_mask() & CO::DENYOOM && etl.is_master) {
uint64_t start_ns = absl::GetCurrentTimeNanos();

uint64_t used_memory = etl.GetUsedMemory(start_ns);
double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio);
if (used_memory > (max_memory_limit * oom_deny_ratio)) {
return cntx->reply_builder()->SendError(kOutOfMemory);
}
}

// Create command transaction
intrusive_ptr<Transaction> dist_trans;

Expand Down
7 changes: 7 additions & 0 deletions src/server/server_family.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1025,6 +1025,13 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
&resp->body());
AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE,
&resp->body());

if (m.events.insertion_rejections | m.coordinator_stats.oom_error_cmd_cnt) {
AppendMetricValue("oom_errors_total", m.events.insertion_rejections, {"type"}, {"insert"},
&resp->body());
AppendMetricValue("oom_errors_total", m.coordinator_stats.oom_error_cmd_cnt, {"type"}, {"cmd"},
&resp->body());
}
if (sdata_res.has_value()) {
size_t rss = sdata_res->vm_rss + sdata_res->hugetlb_pages;
AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body());
Expand Down
3 changes: 2 additions & 1 deletion src/server/server_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ ServerState::Stats::Stats(unsigned num_shards) : tx_width_freq_arr(num_shards) {
}

ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
static_assert(sizeof(Stats) == 14 * 8, "Stats size mismatch");
static_assert(sizeof(Stats) == 15 * 8, "Stats size mismatch");

for (int i = 0; i < NUM_TX_TYPES; ++i) {
this->tx_type_cnt[i] += other.tx_type_cnt[i];
Expand All @@ -44,6 +44,7 @@ ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec;

this->blocked_on_interpreter += other.blocked_on_interpreter;
this->oom_error_cmd_cnt += other.oom_error_cmd_cnt;

if (this->tx_width_freq_arr.size() > 0) {
DCHECK_EQ(this->tx_width_freq_arr.size(), other.tx_width_freq_arr.size());
Expand Down
3 changes: 3 additions & 0 deletions src/server/server_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ class ServerState { // public struct - to allow initialization.

uint64_t blocked_on_interpreter = 0;

// Number of times we rejected command dispatch due to OOM condition.
uint64_t oom_error_cmd_cnt = 0;

std::valarray<uint64_t> tx_width_freq_arr;
};

Expand Down

0 comments on commit 0c11509

Please sign in to comment.