salesforce · gerashegalov · Sep 11, 2019 · Sep 9, 2019 · Sep 9, 2019 · Sep 9, 2019
@@ -57,16 +57,23 @@ private[op] class OpCrossValidation[M <: Model[_], E <: Estimator[_]]
   private def findBestModel(
     folds: Seq[ValidatedModel[E]]
   ): ValidatedModel[E] = {
-    val metrics = folds.map(_.metrics).reduce(_ + _)
-    blas.dscal(metrics.length, 1.0 / numFolds, metrics, 1)
-    val ValidatedModel(est, _, _, grid) = folds.head
-    log.info(s"Average cross-validation for $est metrics: {}", metrics.toSeq.mkString(","))
-    val (bestMetric, bestIndex) =
-      if (evaluator.isLargerBetter) metrics.zipWithIndex.maxBy(_._1)
-      else metrics.zipWithIndex.minBy(_._1)
-    log.info(s"Best set of parameters:\n${grid(bestIndex)}")
+    require(folds.map(_.model.uid).toSet.size == 1) // All folds must share one model uid: call only with folds of the same model
+    val gridCounts = folds.map(_.grids.map(_ -> 1).toMap).reduce(_ + _) // presumably grid -> number of folds listing it
+    val maxFolds = gridCounts.maxBy(_._2)._2 // largest count found in gridCounts
+    val gridsIn = gridCounts.filter(_._2 == maxFolds).keySet // grids whose count equals maxFolds
+    val gridMetrics = folds.map(f => f.grids.zip(f.metrics).toMap).reduce(_ + _)
+      .filterKeys(gridsIn.contains) // keep only the grids with the top fold count
+      .map{ case (key, met) => key -> met / maxFolds} // presumably sum -> mean; Map + Map is not stdlib, verify the import
+      .toSeq // fixed ordering: bestIndex below is a position in this sequence
+    val ((bestGrid, bestMetric), bestIndex) = // best entry and its position in gridMetrics
+      if (evaluator.isLargerBetter) gridMetrics.zipWithIndex.maxBy(_._1._2)
+      else gridMetrics.zipWithIndex.minBy(_._1._2)
+    val ValidatedModel(est, _, _, _) = folds.head // only the first field of the first fold is reused
+    log.info(s"Average cross-validation for $est metrics: {}", gridMetrics.mkString(","))
+    log.info(s"Best set of parameters:\n$bestGrid")
     log.info(s"Best cross-validation metric: $bestMetric.")
-    ValidatedModel(est, bestIndex, metrics, grid)
+    val (grid, metrics) = gridMetrics.unzip // same order as gridMetrics, so bestIndex indexes both arrays
+    ValidatedModel(est, bestIndex, metrics.toArray, grid.toArray)
   }
 
   private[op] override def validate[T](

@@ -219,7 +219,7 @@ class RegressionModelSelectorTest extends FlatSpec with TestSparkContext
     justScores.length shouldEqual transformedData.count()
   }
 
-  it should "fit and predict for even when some models fail" in {
+  it should "fit and predict even when some models fail" in {
     val testEstimator = RegressionModelSelector
       .withCrossValidation(
         numFolds = 4,
@@ -240,11 +240,34 @@ class RegressionModelSelectorTest extends FlatSpec with TestSparkContext
       assert(metaData.trainEvaluation.toJson(false).contains(s"${metric.entryName}"),
         s"Metric ${metric.entryName} is not present in metadata: " + metaData)
     )
-    metaData.validationResults.foreach(println(_))
     metaData.validationResults.size shouldBe 42
   }
 
 
+  it should "fit and predict even when some parameter settings fail for one of the models" in {
+    val testEstimator = RegressionModelSelector
+      .withCrossValidation(
+        numFolds = 4,
+        validationMetric = Evaluators.Regression.mse(),
+        seed = 10L,
+        modelTypesToUse = Seq(RMT.OpGeneralizedLinearRegression) // restrict selection to this single model type
+      )
+      .setInput(label, features)
+
+
+    val model = testEstimator.fit(data)
+    model.evaluateModel(data) // evaluated on the same data the selector was fit on
+
+    // every RegressionEvalMetrics entry name must appear in the train-evaluation JSON of the summary
+    val metaData = ModelSelectorSummary.fromMetadata(model.getMetadata().getSummaryMetadata())
+    RegressionEvalMetrics.values.foreach(metric =>
+      assert(metaData.trainEvaluation.toJson(false).contains(s"${metric.entryName}"),
+        s"Metric ${metric.entryName} is not present in metadata: " + metaData)
+    )
+    metaData.validationResults.size shouldBe 32 // NOTE(review): presumably the grids kept after failures; 32 not derivable here
+  }
+
+
   it should "fail when all models fail due to inappropriate data" in {
 
     val glr = new OpGeneralizedLinearRegression()