Skip to content

Commit

Permalink
Add runtime stats to track per-query memory reclaim time and bytes (f…
Browse files Browse the repository at this point in the history
…acebookincubator#9196)

Summary:
Add two runtime stats to track operator-level memory reclaim time and reclaimed memory bytes
to help analyze slow query
Add operator and task memory reclaim counters and the metric rename accordingly
Add documentation for the newly added runtime stats and cover the existing ones in followup

Pull Request resolved: facebookincubator#9196

Reviewed By: mbasmanova

Differential Revision: D55177631

Pulled By: xiaoxmeng

fbshipit-source-id: 42534504ad9bd910a7106f805bc09c5ebf35ba36
  • Loading branch information
xiaoxmeng authored and facebook-github-bot committed Mar 22, 2024
1 parent 5578357 commit 1ed8075
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 21 deletions.
24 changes: 16 additions & 8 deletions velox/common/base/Counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,30 @@ void registerVeloxMetrics() {

/// ================== Memory Arbitration Counters =================

// Tracks memory reclaim exec time in range of [0, 600s] with 20 buckets and
// reports P50, P90, P99, and P100.
// Tracks the memory reclaim count on an operator.
DEFINE_METRIC(kMetricMemoryReclaimCount, facebook::velox::StatType::COUNT);

// Tracks op memory reclaim exec time in range of [0, 600s] with 20 buckets
// and reports P50, P90, P99, and P100.
DEFINE_HISTOGRAM_METRIC(
kMetricMemoryReclaimExecTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100);

// Tracks op memory reclaim bytes.
DEFINE_METRIC(kMetricMemoryReclaimedBytes, facebook::velox::StatType::SUM);

// Tracks the memory reclaim count on an operator.
DEFINE_METRIC(
kMetricTaskMemoryReclaimCount, facebook::velox::StatType::COUNT);

// Tracks memory reclaim task wait time in range of [0, 60s] with 10 buckets
// and reports P50, P90, P99, and P100.
DEFINE_HISTOGRAM_METRIC(
kMetricMemoryReclaimWaitTimeMs, 6'000, 0, 60'000, 50, 90, 99, 100);

// Tracks memory reclaim bytes.
DEFINE_METRIC(kMetricMemoryReclaimedBytes, facebook::velox::StatType::SUM);
kMetricTaskMemoryReclaimWaitTimeMs, 6'000, 0, 60'000, 50, 90, 99, 100);

// Tracks the number of times that the memory reclaim wait timeouts.
// Tracks the number of times that the task memory reclaim wait timeouts.
DEFINE_METRIC(
kMetricMemoryReclaimWaitTimeoutCount, facebook::velox::StatType::SUM);
kMetricTaskMemoryReclaimWaitTimeoutCount,
facebook::velox::StatType::COUNT);

// The number of times that the memory reclaim fails because the operator is
// executing a non-reclaimable section where it is expected to have reserved
Expand Down
14 changes: 10 additions & 4 deletions velox/common/base/Counters.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,17 @@ constexpr folly::StringPiece kMetricMemoryReclaimExecTimeMs{
constexpr folly::StringPiece kMetricMemoryReclaimedBytes{
"velox.memory_reclaim_bytes"};

constexpr folly::StringPiece kMetricMemoryReclaimWaitTimeMs{
"velox.memory_reclaim_wait_ms"};
constexpr folly::StringPiece kMetricMemoryReclaimCount{
"velox.memory_reclaim_count"};

constexpr folly::StringPiece kMetricMemoryReclaimWaitTimeoutCount{
"velox.memory_reclaim_wait_timeout_count"};
constexpr folly::StringPiece kMetricTaskMemoryReclaimCount{
"velox.task_memory_reclaim_count"};

constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeMs{
"velox.task_memory_reclaim_wait_ms"};

constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeoutCount{
"velox.task_memory_reclaim_wait_timeout_count"};

constexpr folly::StringPiece kMetricMemoryNonReclaimableCount{
"velox.memory_non_reclaimable_count"};
Expand Down
10 changes: 10 additions & 0 deletions velox/common/memory/MemoryArbitrator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <utility>

#include "velox/common/base/Counters.h"
#include "velox/common/base/RuntimeMetrics.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/memory/Memory.h"

Expand Down Expand Up @@ -179,6 +180,15 @@ uint64_t MemoryReclaimer::run(
RECORD_HISTOGRAM_METRIC_VALUE(
kMetricMemoryReclaimExecTimeMs, execTimeUs / 1'000);
RECORD_METRIC_VALUE(kMetricMemoryReclaimedBytes, reclaimedBytes);
RECORD_METRIC_VALUE(kMetricMemoryReclaimCount, 1);
addThreadLocalRuntimeStat(
"memoryReclaimWallNanos",
RuntimeCounter(execTimeUs * 1'000, RuntimeCounter::Unit::kNanos));
addThreadLocalRuntimeStat(
"memoryReclaimCount", RuntimeCounter(1, RuntimeCounter::Unit::kNone));
addThreadLocalRuntimeStat(
"reclaimedMemoryBytes",
RuntimeCounter(reclaimedBytes, RuntimeCounter::Unit::kBytes));
return reclaimedBytes;
}

Expand Down
2 changes: 1 addition & 1 deletion velox/docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Velox Documentation
functions
spark_functions
configs
metrics
monitoring
bindings/python/README_generated_pyvelox
develop
programming-guide
9 changes: 9 additions & 0 deletions velox/docs/monitoring.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
**********
Monitoring
**********

.. toctree::
:maxdepth: 1

monitoring/metrics.rst
monitoring/stats.rst
18 changes: 12 additions & 6 deletions velox/docs/metrics.rst → velox/docs/monitoring/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ Memory Management
- The distribution of cache shrink latency in range of [0, 100s] with 10
buckets. It is configured to report the latency at P50, P90, P99, and
P100 percentiles.
* - memory_reclaim_count
- Count
- The count of operator memory reclaims.
* - memory_reclaim_exec_ms
- Histogram
- The distribution of memory reclaim execution time in range of [0, 600s]
Expand All @@ -91,14 +94,17 @@ Memory Management
* - memory_reclaim_bytes
- Sum
- The sum of reclaimed memory bytes.
* - memory_reclaim_wait_ms
* - task_memory_reclaim_count
- Count
- The count of task memory reclaims.
* - task_memory_reclaim_wait_ms
- Histogram
- The distribution of memory reclaim wait time in range of [0, 60s] with 10
buckets. It is configured to report latency at P50, P90, P99, and P100
percentiles.
* - memory_reclaim_wait_timeout_count
- The distribution of task memory reclaim wait time in range of [0, 60s]
with 10 buckets. It is configured to report latency at P50, P90, P99,
and P100 percentiles.
* - task_memory_reclaim_wait_timeout_count
- Count
- The number of times that the memory reclaim wait timeouts.
- The number of times that the task memory reclaim wait timeouts.
* - memory_non_reclaimable_count
- Count
- The number of times that the memory reclaim fails because the operator is executing a
Expand Down
41 changes: 41 additions & 0 deletions velox/docs/monitoring/stats.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
=============
Runtime Stats
=============

Runtime stats are used to collect the per-query velox runtime events for
offline query analysis purpose. The collected stats can provide insights into
the operator level query execution internals, such as how much time a query
operator spent in disk spilling. The collected stats are organized in a
free-form key-value for easy extension. The key is the event name and the
value is defined as RuntimeCounter which is used to store and aggregate a
particular event occurrences during the operator execution. RuntimeCounter has
three types: kNone used to record event count, kNanos used to record event time
in nanoseconds and kBytes used to record memory or storage size in bytes. It
records the count of events, and the min/max/sum of the event values. The stats
are stored in OperatorStats structure. The query system can aggregate the
operator level stats collected from each driver by pipeline and task for
analysis.

Memory Arbitration
------------------
.. list-table::
:widths: 40 10 50
:header-rows: 1

* - Stats Name
- Type
- Description
* - memoryReclaimCount
- kNone
- The number of times that the memory arbitration to reclaim memory from
an spillable operator.
This stats only applies for spillable operators.
* - memoryReclaimWallNanos
- kNano
- The memory reclaim execution time of an operator during the memory
arbitration. It collects time spent on disk spilling or file write.
This stats only applies for spillable operators.
* - reclaimedMemoryBytes
- kByte
- The reclaimed memory bytes of an operator during the memory arbitration.
This stats only applies for spillable operators.
5 changes: 3 additions & 2 deletions velox/exec/Task.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2715,7 +2715,7 @@ uint64_t Task::MemoryReclaimer::reclaimTask(
}
VELOX_CHECK(paused || maxWaitMs != 0);
if (!paused) {
RECORD_METRIC_VALUE(kMetricMemoryReclaimWaitTimeoutCount, 1);
RECORD_METRIC_VALUE(kMetricTaskMemoryReclaimWaitTimeoutCount, 1);
VELOX_FAIL(
"Memory reclaim failed to wait for task {} to pause after {} with max timeout {}",
task->taskId(),
Expand All @@ -2724,8 +2724,9 @@ uint64_t Task::MemoryReclaimer::reclaimTask(
}

stats.reclaimWaitTimeUs += reclaimWaitTimeUs;
RECORD_METRIC_VALUE(kMetricTaskMemoryReclaimCount, 1);
RECORD_HISTOGRAM_METRIC_VALUE(
kMetricMemoryReclaimWaitTimeMs, reclaimWaitTimeUs / 1'000);
kMetricTaskMemoryReclaimWaitTimeMs, reclaimWaitTimeUs / 1'000);

// Don't reclaim from a cancelled task as it will terminate soon.
if (task->isCancelled()) {
Expand Down
2 changes: 2 additions & 0 deletions velox/exec/tests/AggregationTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3242,6 +3242,8 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromAggregation) {
auto& planStats = taskStats.at(aggrNodeId);
ASSERT_GT(planStats.spilledBytes, 0);
ASSERT_GT(planStats.customStats["memoryArbitrationWallNanos"].sum, 0);
ASSERT_GT(planStats.customStats["memoryReclaimWallNanos"].sum, 0);
ASSERT_GT(planStats.customStats["reclaimedMemoryBytes"].sum, 0);
});

arbitrationWait.await([&] { return !arbitrationWaitFlag.load(); });
Expand Down

0 comments on commit 1ed8075

Please sign in to comment.