From 4466c879770565e0db05b56ee03858d7d95f8417 Mon Sep 17 00:00:00 2001 From: Bikramjeet Vig Date: Thu, 14 Dec 2023 18:43:26 -0800 Subject: [PATCH] Add metrics for spilling (#8025) Summary: Add the following global metrics: velox.spill_input_bytes velox.spill_bytes velox.spill__rows_count velox.spill_files_count velox.spill_fill_time_ms velox.spill_sort_time_ms velox.spill_serialization_time_ms velox.spill_disk_writes_count velox.spill_flush_time_ms velox.spill_write_time_ms Pull Request resolved: https://github.com/facebookincubator/velox/pull/8025 Reviewed By: xiaoxmeng Differential Revision: D52172296 Pulled By: bikramSingh91 fbshipit-source-id: bfbb67f6a0ab05c51ba9f47db2f0b5410e65a9cc --- velox/common/base/Counters.cpp | 61 ++++++++++++++++++++++++++++---- velox/common/base/Counters.h | 26 ++++++++++++++ velox/common/base/SpillStats.cpp | 13 +++++++ velox/docs/metrics.rst | 44 ++++++++++++++++++++++- 4 files changed, 137 insertions(+), 7 deletions(-) diff --git a/velox/common/base/Counters.cpp b/velox/common/base/Counters.cpp index 8bb877b5fe95..8d4cd95d21b4 100644 --- a/velox/common/base/Counters.cpp +++ b/velox/common/base/Counters.cpp @@ -32,10 +32,6 @@ void registerVeloxMetrics() { DEFINE_HISTOGRAM_METRIC( kMetricCacheShrinkTimeMs, 10, 0, 100'000, 50, 90, 99, 100); - // Tracks the number of times that we hit the max spill level limit. - DEFINE_METRIC( - kMetricMaxSpillLevelExceededCount, facebook::velox::StatType::COUNT); - /// ================== Memory Arbitration Counters ================= // Tracks memory reclaim exec time in range of [0, 600s] with 20 buckets and @@ -95,9 +91,62 @@ void registerVeloxMetrics() { DEFINE_HISTOGRAM_METRIC( kMetricArbitratorArbitrationTimeMs, 20, 0, 600'000, 50, 90, 99, 100); - /// Tracks the average of free memory capacity managed by the arbitrator in - /// bytes. + // Tracks the average of free memory capacity managed by the arbitrator in + // bytes. DEFINE_METRIC( kMetricArbitratorFreeCapacityBytes, facebook::velox::StatType::AVG); + + /// ================== Spill related Counters ================= + + // The number of bytes in memory to spill. + DEFINE_METRIC(kMetricSpilledInputBytes, facebook::velox::StatType::SUM); + + // The number of bytes spilled to disk which can be number of compressed + // bytes if compression is enabled. + DEFINE_METRIC(kMetricSpilledBytes, facebook::velox::StatType::SUM); + + // The number of spilled rows. + DEFINE_METRIC(kMetricSpilledRowsCount, facebook::velox::StatType::COUNT); + + // The number of spilled files. + DEFINE_METRIC(kMetricSpilledFilesCount, facebook::velox::StatType::COUNT); + + // The distribution of the amount of time spent on filling rows for spilling. + // in range of [0, 600s] with 20 buckets. It is configured to report the + // latency at P50, P90, P99, and P100 percentiles. + DEFINE_HISTOGRAM_METRIC( + kMetricSpillFillTimeMs, 20, 0, 600'000, 50, 90, 99, 100); + + // The distribution of the amount of time spent on sorting rows for spilling + // in range of [0, 600s] with 20 buckets. It is configured to report the + // latency at P50, P90, P99, and P100 percentiles. + DEFINE_HISTOGRAM_METRIC( + kMetricSpillSortTimeMs, 20, 0, 600'000, 50, 90, 99, 100); + + // The distribution of the amount of time spent on serializing rows for + // spilling in range of [0, 600s] with 20 buckets. It is configured to report + // the latency at P50, P90, P99, and P100 percentiles. + DEFINE_HISTOGRAM_METRIC( + kMetricSpillSerializationTimeMs, 20, 0, 600'000, 50, 90, 99, 100); + + // The number of disk writes to spill rows. + DEFINE_METRIC(kMetricSpillDiskWritesCount, facebook::velox::StatType::COUNT); + + // The distribution of the amount of time spent on copy out serialized + // rows for disk write in range of [0, 600s] with 20 buckets. It is configured + // to report the latency at P50, P90, P99, and P100 percentiles. Note: If + // compression is enabled, this includes the compression time. + DEFINE_HISTOGRAM_METRIC( + kMetricSpillFlushTimeMs, 20, 0, 600'000, 50, 90, 99, 100); + + // The distribution of the amount of time spent on writing spilled rows to + // disk in range of [0, 600s] with 20 buckets. It is configured to report the + // latency at P50, P90, P99, and P100 percentiles. + DEFINE_HISTOGRAM_METRIC( + kMetricSpillWriteTimeMs, 20, 0, 600'000, 50, 90, 99, 100); + + // Tracks the number of times that we hit the max spill level limit. + DEFINE_METRIC( + kMetricMaxSpillLevelExceededCount, facebook::velox::StatType::COUNT); } } // namespace facebook::velox diff --git a/velox/common/base/Counters.h b/velox/common/base/Counters.h index 05f004c05162..aeb93ee2cfdd 100644 --- a/velox/common/base/Counters.h +++ b/velox/common/base/Counters.h @@ -66,4 +66,30 @@ constexpr folly::StringPiece kMetricArbitratorArbitrationTimeMs{ constexpr folly::StringPiece kMetricArbitratorFreeCapacityBytes{ "velox.arbitrator_free_capacity_bytes"}; + +constexpr folly::StringPiece kMetricSpilledInputBytes{ + "velox.spill_input_bytes"}; + +constexpr folly::StringPiece kMetricSpilledBytes{"velox.spill_bytes"}; + +constexpr folly::StringPiece kMetricSpilledRowsCount{"velox.spill_rows_count"}; + +constexpr folly::StringPiece kMetricSpilledFilesCount{ + "velox.spill_files_count"}; + +constexpr folly::StringPiece kMetricSpillFillTimeMs{"velox.spill_fill_time_ms"}; + +constexpr folly::StringPiece kMetricSpillSortTimeMs{"velox.spill_sort_time_ms"}; + +constexpr folly::StringPiece kMetricSpillSerializationTimeMs{ + "velox.spill_serialization_time_ms"}; + +constexpr folly::StringPiece kMetricSpillDiskWritesCount{ + "velox.spill_disk_writes_count"}; + +constexpr folly::StringPiece kMetricSpillFlushTimeMs{ + "velox.spill_flush_time_ms"}; + +constexpr folly::StringPiece kMetricSpillWriteTimeMs{ + "velox.spill_write_time_ms"}; } // namespace facebook::velox diff --git a/velox/common/base/SpillStats.cpp b/velox/common/base/SpillStats.cpp index 0dfd58e5b4fa..942484228f26 100644 --- a/velox/common/base/SpillStats.cpp +++ b/velox/common/base/SpillStats.cpp @@ -15,7 +15,9 @@ */ #include "velox/common/base/SpillStats.h" +#include "velox/common/base/Counters.h" #include "velox/common/base/Exceptions.h" +#include "velox/common/base/StatsReporter.h" #include "velox/common/base/SuccinctPrinter.h" namespace facebook::velox::common { @@ -220,6 +222,9 @@ void updateGlobalSpillRunStats(uint64_t numRuns) { void updateGlobalSpillAppendStats( uint64_t numRows, uint64_t serializationTimeUs) { + RECORD_METRIC_VALUE(kMetricSpilledRowsCount, numRows); + RECORD_HISTOGRAM_METRIC_VALUE( + kMetricSpillSerializationTimeMs, serializationTimeUs / 1'000); auto statsLocked = localSpillStats().wlock(); statsLocked->spilledRows += numRows; statsLocked->spillSerializationTimeUs += serializationTimeUs; @@ -230,10 +235,12 @@ void incrementGlobalSpilledPartitionStats() { } void updateGlobalSpillFillTime(uint64_t timeUs) { + RECORD_HISTOGRAM_METRIC_VALUE(kMetricSpillFillTimeMs, timeUs / 1'000); localSpillStats().wlock()->spillFillTimeUs += timeUs; } void updateGlobalSpillSortTime(uint64_t timeUs) { + RECORD_HISTOGRAM_METRIC_VALUE(kMetricSpillSortTimeMs, timeUs / 1'000); localSpillStats().wlock()->spillSortTimeUs += timeUs; } @@ -241,6 +248,10 @@ void updateGlobalSpillWriteStats( uint64_t spilledBytes, uint64_t flushTimeUs, uint64_t writeTimeUs) { + RECORD_METRIC_VALUE(kMetricSpillDiskWritesCount); + RECORD_METRIC_VALUE(kMetricSpilledBytes, spilledBytes); + RECORD_HISTOGRAM_METRIC_VALUE(kMetricSpillFlushTimeMs, flushTimeUs / 1'000); + RECORD_HISTOGRAM_METRIC_VALUE(kMetricSpillWriteTimeMs, writeTimeUs / 1'000); auto statsLocked = localSpillStats().wlock(); ++statsLocked->spillDiskWrites; statsLocked->spilledBytes += spilledBytes; @@ -249,11 +260,13 @@ void updateGlobalSpillWriteStats( } void updateGlobalSpillMemoryBytes(uint64_t spilledInputBytes) { + RECORD_METRIC_VALUE(kMetricSpilledInputBytes, spilledInputBytes); auto statsLocked = localSpillStats().wlock(); statsLocked->spilledInputBytes += spilledInputBytes; } void incrementGlobalSpilledFiles() { + RECORD_METRIC_VALUE(kMetricSpilledFilesCount); ++localSpillStats().wlock()->spilledFiles; } diff --git a/velox/docs/metrics.rst b/velox/docs/metrics.rst index 10e0edf3ad72..219fa278a196 100644 --- a/velox/docs/metrics.rst +++ b/velox/docs/metrics.rst @@ -137,8 +137,50 @@ Spilling - Description * - spill_max_level_exceeded_count - Count - - The number of times that a spillable operator hits the max spill level + - The number of times that a spill-able operator hits the max spill level limit. + * - spill_input_bytes + - Sum + - The number of bytes in memory to spill. + * - spill_bytes + - Sum + - The number of bytes spilled to disk which can be the number of compressed + bytes if compression is enabled. + * - spill_rows_count + - Count + - The number of spilled rows. + * - spill_files_count + - Count + - The number of spilled files. + * - spill_fill_time_ms + - Histogram + - The distribution of the amount of time spent on filling rows for spilling + in range of [0, 600s] with 20 buckets. It is configured to report the + latency at P50, P90, P99, and P100 percentiles. + * - spill_sort_time_ms + - Histogram + - The distribution of the amount of time spent on sorting rows for spilling + in range of [0, 600s] with 20 buckets. It is configured to report the + latency at P50, P90, P99, and P100 percentiles. + * - spill_serialization_time_ms + - Histogram + - The distribution of the amount of time spent on serializing rows for + spilling in range of [0, 600s] with 20 buckets. It is configured to report + the latency at P50, P90, P99, and P100 percentiles. + * - spill_disk_writes_count + - Count + - The number of disk writes to spill rows. + * - spill_flush_time_ms + - Histogram + - The distribution of the amount of time spent on copy out serialized + rows for disk write in range of [0, 600s] with 20 buckets. It is configured + to report the latency at P50, P90, P99, and P100 percentiles. Note: If + compression is enabled, this includes the compression time. + * - spill_write_time_ms + - Histogram + - The distribution of the amount of time spent on writing spilled rows to + disk in range of [0, 600s] with 20 buckets. It is configured to report the + latency at P50, P90, P99, and P100 percentiles. Hive Connector --------------