diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp index ae79681ae430..f47584dd7202 100644 --- a/velox/common/base/Counters.cpp +++ b/velox/common/base/Counters.cpp @@ -48,22 +48,30 @@ void registerVeloxMetrics() { /// ================== Memory Arbitration Counters ================= - // Tracks memory reclaim exec time in range of [0, 600s] with 20 buckets and - // reports P50, P90, P99, and P100. + // Tracks the memory reclaim count on an operator. + DEFINE_METRIC(kMetricMemoryReclaimCount, facebook::velox::StatType::COUNT); + + // Tracks op memory reclaim exec time in range of [0, 600s] with 20 buckets + // and reports P50, P90, P99, and P100. DEFINE_HISTOGRAM_METRIC( kMetricMemoryReclaimExecTimeMs, 30'000, 0, 600'000, 50, 90, 99, 100); + // Tracks op memory reclaim bytes. + DEFINE_METRIC(kMetricMemoryReclaimedBytes, facebook::velox::StatType::SUM); + + // Tracks the memory reclaim count on an operator. + DEFINE_METRIC( + kMetricTaskMemoryReclaimCount, facebook::velox::StatType::COUNT); + // Tracks memory reclaim task wait time in range of [0, 60s] with 10 buckets // and reports P50, P90, P99, and P100. DEFINE_HISTOGRAM_METRIC( - kMetricMemoryReclaimWaitTimeMs, 6'000, 0, 60'000, 50, 90, 99, 100); - - // Tracks memory reclaim bytes. - DEFINE_METRIC(kMetricMemoryReclaimedBytes, facebook::velox::StatType::SUM); + kMetricTaskMemoryReclaimWaitTimeMs, 6'000, 0, 60'000, 50, 90, 99, 100); - // Tracks the number of times that the memory reclaim wait timeouts. + // Tracks the number of times that the task memory reclaim wait timeouts. DEFINE_METRIC( - kMetricMemoryReclaimWaitTimeoutCount, facebook::velox::StatType::SUM); + kMetricTaskMemoryReclaimWaitTimeoutCount, + facebook::velox::StatType::COUNT); // The number of times that the memory reclaim fails because the operator is // executing a non-reclaimable section where it is expected to have reserved diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index c9fcd91bfb86..4625c250d11c 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -40,11 +40,17 @@ constexpr folly::StringPiece kMetricMemoryReclaimExecTimeMs{ constexpr folly::StringPiece kMetricMemoryReclaimedBytes{ "velox.memory_reclaim_bytes"}; -constexpr folly::StringPiece kMetricMemoryReclaimWaitTimeMs{ - "velox.memory_reclaim_wait_ms"}; +constexpr folly::StringPiece kMetricMemoryReclaimCount{ + "velox.memory_reclaim_count"}; -constexpr folly::StringPiece kMetricMemoryReclaimWaitTimeoutCount{ - "velox.memory_reclaim_wait_timeout_count"}; +constexpr folly::StringPiece kMetricTaskMemoryReclaimCount{ + "velox.task_memory_reclaim_count"}; + +constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeMs{ + "velox.task_memory_reclaim_wait_ms"}; + +constexpr folly::StringPiece kMetricTaskMemoryReclaimWaitTimeoutCount{ + "velox.task_memory_reclaim_wait_timeout_count"}; constexpr folly::StringPiece kMetricMemoryNonReclaimableCount{ "velox.memory_non_reclaimable_count"}; diff --git a/velox/common/memory/MemoryArbitrator.cpp b/velox/common/memory/MemoryArbitrator.cpp index e3a8d6a054e1..709723796f78 100644 --- a/velox/common/memory/MemoryArbitrator.cpp +++ b/velox/common/memory/MemoryArbitrator.cpp @@ -19,6 +19,7 @@ #include #include "velox/common/base/Counters.h" +#include "velox/common/base/RuntimeMetrics.h" #include "velox/common/base/StatsReporter.h" #include "velox/common/memory/Memory.h" @@ -179,6 +180,15 @@ uint64_t MemoryReclaimer::run( RECORD_HISTOGRAM_METRIC_VALUE( kMetricMemoryReclaimExecTimeMs, execTimeUs / 1'000); RECORD_METRIC_VALUE(kMetricMemoryReclaimedBytes, reclaimedBytes); + RECORD_METRIC_VALUE(kMetricMemoryReclaimCount, 1); + addThreadLocalRuntimeStat( + "memoryReclaimWallNanos", + RuntimeCounter(execTimeUs * 1'000, RuntimeCounter::Unit::kNanos)); + addThreadLocalRuntimeStat( + "memoryReclaimCount", RuntimeCounter(1, RuntimeCounter::Unit::kNone)); + addThreadLocalRuntimeStat( + "reclaimedMemoryBytes", + RuntimeCounter(reclaimedBytes, RuntimeCounter::Unit::kBytes)); return reclaimedBytes; } diff --git a/velox/docs/index.rst b/velox/docs/index.rst index 36df74329d9e..17d5ce21173d 100644 --- a/velox/docs/index.rst +++ b/velox/docs/index.rst @@ -10,7 +10,7 @@ Velox Documentation functions spark_functions configs - metrics + monitoring bindings/python/README_generated_pyvelox develop programming-guide diff --git a/velox/docs/monitoring.rst b/velox/docs/monitoring.rst new file mode 100644 index 000000000000..b92026f9b637 --- /dev/null +++ b/velox/docs/monitoring.rst @@ -0,0 +1,9 @@ +********** +Monitoring +********** + +.. toctree:: + :maxdepth: 1 + + monitoring/metrics.rst + monitoring/stats.rst \ No newline at end of file diff --git a/velox/docs/metrics.rst b/velox/docs/monitoring/metrics.rst similarity index 94% rename from velox/docs/metrics.rst rename to velox/docs/monitoring/metrics.rst index a7b7c19cf194..d206d725d4e8 100644 --- a/velox/docs/metrics.rst +++ b/velox/docs/monitoring/metrics.rst @@ -83,6 +83,9 @@ Memory Management - The distribution of cache shrink latency in range of [0, 100s] with 10 buckets. It is configured to report the latency at P50, P90, P99, and P100 percentiles. + * - memory_reclaim_count + - Count + - The count of operator memory reclaims. * - memory_reclaim_exec_ms - Histogram - The distribution of memory reclaim execution time in range of [0, 600s] @@ -91,14 +94,17 @@ Memory Management * - memory_reclaim_bytes - Sum - The sum of reclaimed memory bytes. - * - memory_reclaim_wait_ms + * - task_memory_reclaim_count + - Count + - The count of task memory reclaims. + * - task_memory_reclaim_wait_ms - Histogram - - The distribution of memory reclaim wait time in range of [0, 60s] with 10 - buckets. It is configured to report latency at P50, P90, P99, and P100 - percentiles. - * - memory_reclaim_wait_timeout_count + - The distribution of task memory reclaim wait time in range of [0, 60s] + with 10 buckets. It is configured to report latency at P50, P90, P99, + and P100 percentiles. + * - task_memory_reclaim_wait_timeout_count - Count - - The number of times that the memory reclaim wait timeouts. + - The number of times that the task memory reclaim wait timeouts. * - memory_non_reclaimable_count - Count - The number of times that the memory reclaim fails because the operator is executing a diff --git a/velox/docs/monitoring/stats.rst b/velox/docs/monitoring/stats.rst new file mode 100644 index 000000000000..32c15eb31373 --- /dev/null +++ b/velox/docs/monitoring/stats.rst @@ -0,0 +1,41 @@ +============= +Runtime Stats +============= + +Runtime stats are used to collect the per-query velox runtime events for +offline query analysis purpose. The collected stats can provide insights into +the operator level query execution internals, such as how much time a query +operator spent in disk spilling. The collected stats are organized in a +free-form key-value for easy extension. The key is the event name and the +value is defined as RuntimeCounter which is used to store and aggregate a +particular event occurrences during the operator execution. RuntimeCounter has +three types: kNone used to record event count, kNanos used to record event time +in nanoseconds and kBytes used to record memory or storage size in bytes. It +records the count of events, and the min/max/sum of the event values. The stats +are stored in OperatorStats structure. The query system can aggregate the +operator level stats collected from each driver by pipeline and task for +analysis. + +Memory Arbitration +------------------ +.. list-table:: + :widths: 40 10 50 + :header-rows: 1 + + * - Stats Name + - Type + - Description + * - memoryReclaimCount + - kNone + - The number of times that the memory arbitration to reclaim memory from + an spillable operator. + This stats only applies for spillable operators. + * - memoryReclaimWallNanos + - kNano + - The memory reclaim execution time of an operator during the memory + arbitration. It collects time spent on disk spilling or file write. + This stats only applies for spillable operators. + * - reclaimedMemoryBytes + - kByte + - The reclaimed memory bytes of an operator during the memory arbitration. + This stats only applies for spillable operators. diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index 29aae0edb3e6..2b7b1c0366e2 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -2715,7 +2715,7 @@ uint64_t Task::MemoryReclaimer::reclaimTask( } VELOX_CHECK(paused || maxWaitMs != 0); if (!paused) { - RECORD_METRIC_VALUE(kMetricMemoryReclaimWaitTimeoutCount, 1); + RECORD_METRIC_VALUE(kMetricTaskMemoryReclaimWaitTimeoutCount, 1); VELOX_FAIL( "Memory reclaim failed to wait for task {} to pause after {} with max timeout {}", task->taskId(), @@ -2724,8 +2724,9 @@ uint64_t Task::MemoryReclaimer::reclaimTask( } stats.reclaimWaitTimeUs += reclaimWaitTimeUs; + RECORD_METRIC_VALUE(kMetricTaskMemoryReclaimCount, 1); RECORD_HISTOGRAM_METRIC_VALUE( - kMetricMemoryReclaimWaitTimeMs, reclaimWaitTimeUs / 1'000); + kMetricTaskMemoryReclaimWaitTimeMs, reclaimWaitTimeUs / 1'000); // Don't reclaim from a cancelled task as it will terminate soon. if (task->isCancelled()) { diff --git a/velox/exec/tests/AggregationTest.cpp b/velox/exec/tests/AggregationTest.cpp index 4e919bcfa3d4..04eb340f904e 100644 --- a/velox/exec/tests/AggregationTest.cpp +++ b/velox/exec/tests/AggregationTest.cpp @@ -3242,6 +3242,8 @@ DEBUG_ONLY_TEST_F(AggregationTest, reclaimFromAggregation) { auto& planStats = taskStats.at(aggrNodeId); ASSERT_GT(planStats.spilledBytes, 0); ASSERT_GT(planStats.customStats["memoryArbitrationWallNanos"].sum, 0); + ASSERT_GT(planStats.customStats["memoryReclaimWallNanos"].sum, 0); + ASSERT_GT(planStats.customStats["reclaimedMemoryBytes"].sum, 0); }); arbitrationWait.await([&] { return !arbitrationWaitFlag.load(); });