linkedin · akshayrai · Jan 8, 2018 · Dec 1, 2017 · Dec 7, 2017 · Dec 7, 2017
diff --git a/app-conf/HeuristicConf.xml b/app-conf/HeuristicConf.xml
@@ -193,5 +193,11 @@
     <classname>com.linkedin.drelephant.spark.heuristics.StagesHeuristic</classname>
     <viewname>views.html.help.spark.helpStagesHeuristic</viewname>
   </heuristic>
+  <heuristic>
+    <applicationtype>spark</applicationtype>
+    <heuristicname>Spark GC Time to CPU Time</heuristicname>
+    <classname>com.linkedin.drelephant.spark.heuristics.GcCpuTimeHeuristic</classname>
+    <viewname>views.html.help.spark.helpGcCpuTimeHeuristic</viewname>
+  </heuristic>
 
 </heuristics>
diff --git a/app/com/linkedin/drelephant/spark/fetchers/statusapiv1/statusapiv1.scala b/app/com/linkedin/drelephant/spark/fetchers/statusapiv1/statusapiv1.scala
@@ -87,6 +87,7 @@ trait ExecutorSummary{
   def totalShuffleRead: Long
   def totalShuffleWrite: Long
   def maxMemory: Long
+  def totalGCTime: Long
   def executorLogs: Map[String, String]}
 
 trait JobData{
@@ -292,6 +293,7 @@ class ExecutorSummaryImpl(
   var totalShuffleRead: Long,
   var totalShuffleWrite: Long,
   var maxMemory: Long,
+  var totalGCTime: Long,
   var executorLogs: Map[String, String]) extends ExecutorSummary
 
 class JobDataImpl(

diff --git a/app/com/linkedin/drelephant/spark/heuristics/ExecutorStorageSpillHeuristic.scala b/app/com/linkedin/drelephant/spark/heuristics/ExecutorStorageSpillHeuristic.scala
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2016 LinkedIn Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.linkedin.drelephant.spark.heuristics
+
+import com.linkedin.drelephant.analysis.Severity
+import com.linkedin.drelephant.spark.fetchers.statusapiv1._
+import com.linkedin.drelephant.analysis._
+import com.linkedin.drelephant.configurations.heuristic.HeuristicConfigurationData
+import com.linkedin.drelephant.spark.data.SparkApplicationData
+import scala.collection.JavaConverters
+
+/**
+  * A heuristic based on GC time and CPU run time
+  *
+  */
+class GcCpuTimeHeuristic(private val heuristicConfigurationData: HeuristicConfigurationData)
+  extends Heuristic[SparkApplicationData] {
+
+  import GcCpuTimeHeuristic._
+  import JavaConverters._
+
+  /** Returns the configuration this heuristic instance was constructed with. */
+  override def getHeuristicConfData(): HeuristicConfigurationData = heuristicConfigurationData
+
+  /**
+    * Builds the heuristic result: the three measured values first, followed by a "Note" for each
+    * threshold direction (ratio above normal, ratio below normal) that is rated worse than LOW.
+    */
+  override def apply(data: SparkApplicationData): HeuristicResult = {
+    val eval = new Evaluator(this, data)
+    def isWorseThanLow(s: Severity): Boolean = s.getValue > Severity.LOW.getValue
+
+    val measurements = Seq(
+      new HeuristicResultDetails("GC time to Executor Run time ratio", eval.ratio.toString),
+      new HeuristicResultDetails("GC total time", eval.jvmTime.toString),
+      new HeuristicResultDetails("Executor Run time", eval.executorRunTimeTotal.toString))
+    // severityTimeA is rated against the ascending thresholds, severityTimeD against the descending ones
+    val notes = Seq(
+      eval.severityTimeA -> "The ratio of JVM GC Time and executor Time is above normal, we recommend to increase the executor memory",
+      eval.severityTimeD -> "The ratio of JVM GC Time and executor Time is below normal, we recommend to decrease the executor memory"
+    ).collect { case (sev, text) if isWorseThanLow(sev) => new HeuristicResultDetails("Note", text) }
+
+    new HeuristicResult(
+      heuristicConfigurationData.getClassName,
+      heuristicConfigurationData.getHeuristicName,
+      eval.severity,
+      0,
+      (measurements ++ notes).asJava)
+  }
+}
+
+object GcCpuTimeHeuristic {
+  val SPARK_EXECUTOR_MEMORY = "spark.executor.memory"
+  val SPARK_EXECUTOR_CORES = "spark.executor.cores"
+
+  /** The ascending severity thresholds for the ratio of JVM GC Time and executor Run Time (checking whether ratio is above normal)
+    * These thresholds are experimental and are likely to change */
+  val DEFAULT_GC_SEVERITY_A_THRESHOLDS =
+    SeverityThresholds(low = 0.08D, moderate = 0.1D, severe = 0.15D, critical = 0.2D, ascending = true)
+
+  /** The descending severity thresholds for the ratio of JVM GC Time and executor Run Time (checking whether ratio is below normal)
+    * These thresholds are experimental and are likely to change */
+  val DEFAULT_GC_SEVERITY_D_THRESHOLDS =
+    SeverityThresholds(low = 0.05D, moderate = 0.04D, severe = 0.03D, critical = 0.01D, ascending = false)
+
+  /**
+    * Sums GC time and run time over the application's executor summaries and rates their ratio
+    * against both default threshold sets.
+    * @param memoryFractionHeuristic the heuristic that created this evaluator (not read here)
+    * @param data the application data supplying the executor summaries
+    */
+  class Evaluator(memoryFractionHeuristic: GcCpuTimeHeuristic, data: SparkApplicationData) {
+    lazy val executorSummaries: Seq[ExecutorSummary] = data.executorSummaries
+    lazy val appConfigurationProperties: Map[String, String] =
+      data.appConfigurationProperties
+    val (jvmTime, executorRunTimeTotal) = getTimeValues(executorSummaries)
+
+    /** False when no run time was recorded; the ratio is then undefined and nothing is flagged. */
+    private val hasRunTime: Boolean = executorRunTimeTotal > 0
+    // Guarded so that 0/0 (NaN) and x/0 (Infinity) never reach the result details or the thresholds.
+    val ratio: Double = if (hasRunTime) jvmTime.toDouble / executorRunTimeTotal.toDouble else 0.0D
+
+    lazy val severityTimeA: Severity =
+      if (hasRunTime) DEFAULT_GC_SEVERITY_A_THRESHOLDS.severityOf(ratio) else Severity.NONE
+    lazy val severityTimeD: Severity =
+      if (hasRunTime) DEFAULT_GC_SEVERITY_D_THRESHOLDS.severityOf(ratio) else Severity.NONE
+    lazy val severity: Severity = Severity.max(severityTimeA, severityTimeD)
+
+    /** Returns (total JVM GC time, total executor run time) summed over the given executor summaries. */
+    private def getTimeValues(executorSummaries: Seq[ExecutorSummary]): (Long, Long) =
+      (executorSummaries.map(_.totalGCTime).sum, executorSummaries.map(_.totalDuration).sum)
+  }
+}
+
diff --git a/app/com/linkedin/drelephant/spark/legacydata/LegacyDataConverters.scala b/app/com/linkedin/drelephant/spark/legacydata/LegacyDataConverters.scala
@@ -173,6 +173,7 @@ object LegacyDataConverters {
         executorInfo.shuffleRead,
         executorInfo.shuffleWrite,
         executorInfo.maxMem,
+        executorInfo.totalGCTime,
         executorLogs = Map.empty
       )
     }

diff --git a/app/com/linkedin/drelephant/spark/legacydata/SparkExecutorData.java b/app/com/linkedin/drelephant/spark/legacydata/SparkExecutorData.java
@@ -43,14 +43,15 @@ public static class ExecutorInfo {
     public long inputBytes = 0L;
     public long outputBytes = 0L;
     public long shuffleRead = 0L;
+    public long totalGCTime = 0L;
     public long shuffleWrite = 0L;
 
     public String toString() {
       return "{execId: " + execId + ", hostPort:" + hostPort + " , rddBlocks: " + rddBlocks + ", memUsed: " + memUsed
           + ", maxMem: " + maxMem + ", diskUsed: " + diskUsed + ", totalTasks" + totalTasks + ", tasksActive: "
           + activeTasks + ", tasksComplete: " + completedTasks + ", tasksFailed: " + failedTasks + ", duration: "
           + duration + ", inputBytes: " + inputBytes + ", outputBytes:" + outputBytes + ", shuffleRead: " + shuffleRead
-          + ", shuffleWrite: " + shuffleWrite + "}";
+          + ", shuffleWrite: " + shuffleWrite + ", totalGCTime: " + totalGCTime + "}";
     }
   }
 

diff --git a/app/views/help/spark/helpGcCpuTimeHeuristic.scala.html b/app/views/help/spark/helpGcCpuTimeHeuristic.scala.html
@@ -0,0 +1,6 @@
+<p>The ratio of jvmGcTime to executorCpuTime is checked, to see if GC is taking too much time (providing more memory could help) or too little time (memory may be over provisioned, and can be reduced).</p>
+<p>The severity thresholds are as follows : </p>
+<p>Low: avg (jvmGcTime / executorCpuTime) >= .08</p>
+<p>Moderate: avg (jvmGcTime / executorCpuTime) >= .1</p>
+<p>Severe: avg (jvmGcTime / executorCpuTime) >= .15</p>
+<p>Critical: avg (jvmGcTime / executorCpuTime) >= .2</p>
diff --git a/test/com/linkedin/drelephant/spark/SparkMetricsAggregatorTest.scala b/test/com/linkedin/drelephant/spark/SparkMetricsAggregatorTest.scala
@@ -194,6 +194,7 @@ object SparkMetricsAggregatorTest {
     totalShuffleRead = 0,
     totalShuffleWrite = 0,
     maxMemory = 0,
+    totalGCTime = 0,
     executorLogs = Map.empty
   )
 }
diff --git a/test/com/linkedin/drelephant/spark/heuristics/ExecutorStorageSpillHeuristicTest.scala b/test/com/linkedin/drelephant/spark/heuristics/ExecutorStorageSpillHeuristicTest.scala
@@ -0,0 +1,138 @@
+/*
+ * Copyright 2016 LinkedIn Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package com.linkedin.drelephant.spark.heuristics
+
+import scala.collection.JavaConverters
+import com.linkedin.drelephant.analysis.{ApplicationType, Severity, SeverityThresholds}
+import com.linkedin.drelephant.configurations.heuristic.HeuristicConfigurationData
+import com.linkedin.drelephant.spark.data.{SparkApplicationData, SparkLogDerivedData, SparkRestDerivedData}
+import com.linkedin.drelephant.spark.fetchers.statusapiv1.{ApplicationInfoImpl, ExecutorSummaryImpl, StageDataImpl}
+import org.apache.spark.scheduler.SparkListenerEnvironmentUpdate
+import org.scalatest.{FunSpec, Matchers}
+
+import scala.concurrent.duration.Duration
+
+
+/** Checks the result details and severity GcCpuTimeHeuristic reports for a fixed set of executors. */
+class GcCpuTimeHeuristicTest extends FunSpec with Matchers {
+  import GcCpuTimeHeuristicTest._
+
+  describe("GcCpuTimeHeuristic") {
+    // GcCpuTimeHeuristic reads only the class name and the heuristic name from its
+    // configuration and compares against its built-in default thresholds, so the default
+    // (empty) parameter map is all this fixture needs.
+    val heuristicConfigurationData = newFakeHeuristicConfigurationData()
+    val gcCpuTimeHeuristic = new GcCpuTimeHeuristic(heuristicConfigurationData)
+
+    // In total: 20 min of GC (1200000 ms) over 79 min of run time (4740000 ms), a ratio of
+    // about 0.2532, which is above the ascending critical threshold of 0.2.
+    val executorSummaries = Seq(
+      newFakeExecutorSummary(
+        id = "1",
+        totalGCTime = Duration("2min").toMillis,
+        totalDuration = Duration("15min").toMillis
+      ),
+      newFakeExecutorSummary(
+        id = "2",
+        totalGCTime = Duration("6min").toMillis,
+        totalDuration = Duration("14min").toMillis
+      ),
+      newFakeExecutorSummary(
+        id = "3",
+        totalGCTime = Duration("4min").toMillis,
+        totalDuration = Duration("20min").toMillis
+      ),
+      newFakeExecutorSummary(
+        id = "4",
+        totalGCTime = Duration("8min").toMillis,
+        totalDuration = Duration("30min").toMillis
+      )
+    )
+
+    describe(".apply") {
+      val data1 = newFakeSparkApplicationData(executorSummaries)
+      val heuristicResult = gcCpuTimeHeuristic.apply(data1)
+      val heuristicResultDetails = heuristicResult.getHeuristicResultDetails
+
+      it("returns the severity") {
+        heuristicResult.getSeverity should be(Severity.CRITICAL)
+      }
+
+      it("returns the JVM GC time to Executor Run time ratio") {
+        val details = heuristicResultDetails.get(0)
+        details.getName should include("GC time to Executor Run time ratio")
+        details.getValue should include("0.2531")
+      }
+
+      it("returns the total GC time") {
+        val details = heuristicResultDetails.get(1)
+        details.getName should include("GC total time")
+        details.getValue should be("1200000")
+      }
+
+      it("returns the executor's run time") {
+        val details = heuristicResultDetails.get(2)
+        details.getName should include("Executor Run time")
+        details.getValue should be("4740000")
+      }
+    }
+  }
+}
+
+object GcCpuTimeHeuristicTest {
+  import JavaConverters._
+
+  /** Creates heuristic configuration data with placeholder names and the given parameters. */
+  def newFakeHeuristicConfigurationData(params: Map[String, String] = Map.empty): HeuristicConfigurationData = {
+    val applicationType = new ApplicationType("type")
+    new HeuristicConfigurationData("heuristic", "class", "view", applicationType, params.asJava)
+  }
+
+  /** Creates an executor summary with the given id, GC time and duration; every other field is zero or empty. */
+  def newFakeExecutorSummary(id: String, totalGCTime: Long, totalDuration: Long): ExecutorSummaryImpl =
+    new ExecutorSummaryImpl(
+      id,
+      hostPort = "",
+      rddBlocks = 0,
+      memoryUsed = 0,
+      diskUsed = 0,
+      activeTasks = 0,
+      failedTasks = 0,
+      completedTasks = 0,
+      totalTasks = 0,
+      totalDuration,
+      totalInputBytes = 0,
+      totalShuffleRead = 0,
+      totalShuffleWrite = 0,
+      maxMemory = 0,
+      totalGCTime,
+      executorLogs = Map.empty
+    )
+
+  /** Wraps the executor summaries in application data that has no jobs and no stages. */
+  def newFakeSparkApplicationData(executorSummaries: Seq[ExecutorSummaryImpl]): SparkApplicationData = {
+    val appId = "application_1"
+    val applicationInfo = new ApplicationInfoImpl(appId, name = "app", Seq.empty)
+    val restDerivedData = SparkRestDerivedData(
+      applicationInfo,
+      jobDatas = Seq.empty,
+      stageDatas = Seq.empty,
+      executorSummaries = executorSummaries
+    )
+    SparkApplicationData(appId, restDerivedData, None)
+  }
+}
diff --git a/test/com/linkedin/drelephant/spark/heuristics/ExecutorsHeuristicTest.scala b/test/com/linkedin/drelephant/spark/heuristics/ExecutorsHeuristicTest.scala
@@ -249,6 +249,7 @@ object ExecutorsHeuristicTest {
     totalShuffleRead,
     totalShuffleWrite,
     maxMemory,
+    totalGCTime = 0,
     executorLogs = Map.empty
   )