Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More accurate estimation for the result serialization time in RapidsShuffleThreadedWriterBase #11180

Merged
merged 6 commits into from
Jul 18, 2024
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,28 @@ abstract class RapidsShuffleThreadedWriterBase[K, V](
val diskBlockObjectWriters = new mutable.HashMap[Int, (Int, DiskBlockObjectWriter)]()

override def write(records: Iterator[Product2[K, V]]): Unit = {
// Iterating the `records` may involve some heavy computations.
// This iterator is used to track how much time we spend for such computations.
class TimeTrackingIterator extends Iterator[Product2[K, V]] {
  // Total time (in nanoseconds) spent inside the upstream `records` iterator,
  // i.e. the cost of producing batches in upstream execs. Read by the caller
  // to subtract upstream compute time from the measured write time.
  var iterateTimeNs: Long = 0L

  override def hasNext: Boolean = {
    val start = System.nanoTime()
    val ret = records.hasNext
    // `hasNext` on the upstream iterator may itself trigger heavy computation,
    // so it is included in the tracked time.
    iterateTimeNs += System.nanoTime() - start
    ret
  }

  override def next(): Product2[K, V] = {
    val start = System.nanoTime()
    // Keep `()` on the call: `next()` is side-effecting on the underlying iterator.
    val ret = records.next()
    iterateTimeNs += System.nanoTime() - start
    ret
  }
}

val timeTrackingIterator = new TimeTrackingIterator

withResource(new NvtxRange("ThreadedWriter.write", NvtxColor.RED)) { _ =>
withResource(new NvtxRange("compute", NvtxColor.GREEN)) { _ =>
val mapOutputWriter = shuffleExecutorComponents.createMapOutputWriter(
Expand All @@ -283,7 +305,7 @@ abstract class RapidsShuffleThreadedWriterBase[K, V](
numPartitions)
try {
var openTimeNs = 0L
val partLengths = if (!records.hasNext) {
val partLengths = if (!timeTrackingIterator.hasNext) {
commitAllPartitions(mapOutputWriter, true /*empty checksum*/)
} else {
// per reduce partition id
Expand All @@ -305,15 +327,18 @@ abstract class RapidsShuffleThreadedWriterBase[K, V](

// we call write on every writer for every record in parallel
val writeFutures = new mutable.Queue[Future[Unit]]
val writeTimeStart: Long = System.nanoTime()
// Accumulated record write time as if they were sequential
val recordWriteTime: AtomicLong = new AtomicLong(0L)
var computeTime: Long = 0L
// Time spent waiting on the limiter
var waitTimeOnLimiterNs: Long = 0L
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For future work, we may want to expose waitTimeOnLimiterNs as a metric. Otherwise it's hard to tell whether we are blocked waiting on the limiter. Filed #11187

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. This will be useful!

// Time spent computing ColumnarBatch sizes
var batchSizeComputeTimeNs: Long = 0L
// Timestamp when the main processing begins
val processingStart: Long = System.nanoTime()
try {
while (records.hasNext) {
while (timeTrackingIterator.hasNext) {
// get the record
val computeStartTime = System.nanoTime()
val record = records.next()
computeTime += System.nanoTime() - computeStartTime
val record = timeTrackingIterator.next()
val key = record._1
val value = record._2
val reducePartitionId: Int = partitioner.getPartition(key)
Expand All @@ -326,14 +351,18 @@ abstract class RapidsShuffleThreadedWriterBase[K, V](
} else {
// we close batches actively in the `records` iterator as we get the next batch
// this makes sure it is kept alive while a task is able to handle it.
val sizeComputeStart = System.nanoTime()
val (cb, size) = value match {
case columnarBatch: ColumnarBatch =>
(SlicedGpuColumnVector.incRefCount(columnarBatch),
SlicedGpuColumnVector.getTotalHostMemoryUsed(columnarBatch))
case _ =>
(null, 0L)
}
val waitOnLimiterStart = System.nanoTime()
batchSizeComputeTimeNs += waitOnLimiterStart - sizeComputeStart
limiter.acquireOrBlock(size)
waitTimeOnLimiterNs += System.nanoTime() - waitOnLimiterStart
writeFutures += RapidsShuffleInternalManagerBase.queueWriteTask(slotNum, () => {
withResource(cb) { _ =>
try {
Expand Down Expand Up @@ -371,9 +400,15 @@ abstract class RapidsShuffleThreadedWriterBase[K, V](
}
}

// writeTime is the amount of time it took to push bytes through the stream
// minus the amount of time it took to get the batch from the upstream execs
val writeTimeNs = (System.nanoTime() - writeTimeStart) - computeTime
// writeTimeNs is an approximation of the amount of time we spent in
// DiskBlockObjectWriter.write, which involves serializing records and writing them
// on disk. As we use multiple threads for writing, writeTimeNs is
// estimated by 'the total amount of time it took to finish processing the entire logic
// above' minus 'the amount of time it took to do anything expensive other than the
serialization and the write'. The latter involves computations in upstream execs,
// ColumnarBatch size estimation, and the time blocked on the limiter.
val writeTimeNs = (System.nanoTime() - processingStart) -
timeTrackingIterator.iterateTimeNs - batchSizeComputeTimeNs - waitTimeOnLimiterNs

val combineTimeStart = System.nanoTime()
val pl = writePartitionedData(mapOutputWriter)
Expand Down