apache · selvinsource · Oct 12, 2014 · Oct 18, 2014 · Oct 26, 2014 · Oct 27, 2014
diff --git a/LICENSE b/LICENSE
@@ -814,6 +814,7 @@ BSD-style licenses
 The following components are provided under a BSD-style license. See project link for details.
 
      (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
+     (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
      (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/)
      (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
      (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)

diff --git a/mllib/pom.xml b/mllib/pom.xml
@@ -109,6 +109,21 @@
       <type>test-jar</type>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.jpmml</groupId>
+      <artifactId>pmml-model</artifactId>
+      <version>1.1.15</version>
+      <exclusions>
+        <exclusion>
+          <groupId>com.sun.xml.fastinfoset</groupId>
+          <artifactId>FastInfoset</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.sun.istack</groupId>
+          <artifactId>istack-commons-runtime</artifactId>
+        </exclusion>
+      </exclusions>
+    </dependency>
   </dependencies>
   <profiles>
     <profile>

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -23,6 +23,7 @@ import org.apache.spark.mllib.classification.impl.GLMClassificationModel
 import org.apache.spark.mllib.linalg.BLAS.dot
 import org.apache.spark.mllib.linalg.{DenseVector, Vector}
 import org.apache.spark.mllib.optimization._
+import org.apache.spark.mllib.pmml.PMMLExportable
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{DataValidators, Saveable, Loader}
 import org.apache.spark.rdd.RDD
@@ -46,7 +47,7 @@ class LogisticRegressionModel (
     val numFeatures: Int,
     val numClasses: Int)
   extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
-  with Saveable {
+  with Saveable with PMMLExportable {
 
   if (numClasses == 2) {
     require(weights.size == numFeatures,

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala
@@ -22,6 +22,7 @@ import org.apache.spark.annotation.Experimental
 import org.apache.spark.mllib.classification.impl.GLMClassificationModel
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.mllib.optimization._
+import org.apache.spark.mllib.pmml.PMMLExportable
 import org.apache.spark.mllib.regression._
 import org.apache.spark.mllib.util.{DataValidators, Loader, Saveable}
 import org.apache.spark.rdd.RDD
@@ -36,7 +37,7 @@ class SVMModel (
     override val weights: Vector,
     override val intercept: Double)
   extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable
-  with Saveable {
+  with Saveable with PMMLExportable {
 
   private var threshold: Option[Double] = Some(0.0)
 

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -25,6 +25,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.pmml.PMMLExportable
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext
@@ -34,7 +35,8 @@ import org.apache.spark.sql.Row
 /**
  * A clustering model for K-means. Each point belongs to the cluster with the closest center.
  */
-class KMeansModel (val clusterCenters: Array[Vector]) extends Saveable with Serializable {
+class KMeansModel (
+    val clusterCenters: Array[Vector]) extends Saveable with Serializable with PMMLExportable {
 
   /** A Java-friendly constructor that takes an Iterable of Vectors. */
   def this(centers: java.lang.Iterable[Vector]) = this(centers.asScala.toArray)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.pmml
+
+import java.io.{File, OutputStream, StringWriter}
+import javax.xml.transform.stream.StreamResult
+
+import org.jpmml.model.JAXBUtil
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
+
+/**
+ * Mix-in that lets a model be written out in the PMML format.
+ * Predictive Model Markup Language (PMML) is an XML-based file format
+ * developed by the Data Mining Group (www.dmg.org).
+ *
+ * Every public overload funnels into one private method, which asks
+ * `PMMLModelExportFactory` for an exporter built from this instance and marshals
+ * the exporter's PMML document with `JAXBUtil.marshalPMML`.
+ */
+trait PMMLExportable {
+
+  /**
+   * Marshal this model's PMML document into the given result.
+   *
+   * @param streamResult destination handed to `JAXBUtil.marshalPMML`
+   */
+  private def toPMML(streamResult: StreamResult): Unit = {
+    val document = PMMLModelExportFactory.createPMMLModelExport(this).getPmml
+    JAXBUtil.marshalPMML(document, streamResult)
+  }
+
+  /**
+   * Write the model in PMML format to a file on the local file system.
+   *
+   * @param localPath path of the file to write
+   */
+  def toPMML(localPath: String): Unit = {
+    val target = new File(localPath)
+    toPMML(new StreamResult(target))
+  }
+
+  /**
+   * Write the model in PMML format to a directory on a distributed file system.
+   * The document is rendered to a single string, wrapped in a one-element RDD with
+   * one partition, and stored with `saveAsTextFile`.
+   *
+   * @param sc   context used to create the RDD
+   * @param path output location passed to `saveAsTextFile`
+   */
+  def toPMML(sc: SparkContext, path: String): Unit = {
+    sc.parallelize(Array(toPMML()), 1).saveAsTextFile(path)
+  }
+
+  /**
+   * Write the model in PMML format to an OutputStream.
+   *
+   * @param outputStream stream that receives the marshalled document
+   */
+  def toPMML(outputStream: OutputStream): Unit = {
+    val sink = new StreamResult(outputStream)
+    toPMML(sink)
+  }
+
+  /**
+   * Render the model in PMML format.
+   *
+   * @return the marshalled PMML document as a String
+   */
+  def toPMML(): String = {
+    val buffer = new StringWriter
+    toPMML(new StreamResult(buffer))
+    buffer.toString
+  }
+
+}
diff --git a/...c/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala b/...c/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.pmml.export
+
+import scala.{Array => SArray}
+
+import org.dmg.pmml._
+
+import org.apache.spark.mllib.regression.GeneralizedLinearModel
+
+/**
+ * PMML Model Export for GeneralizedLinearModel class with binary ClassificationModel.
+ * The PMML document is populated once, while the instance is being constructed.
+ *
+ * @param model linear model whose weights and intercept are exported
+ * @param description text used as both the PMML header description and the model name
+ * @param normalizationMethod normalization method recorded on the regression model; it also
+ *                            selects how `threshold` is turned into the category "0" intercept
+ * @param threshold decision threshold, exported as the intercept of the category "0" table
+ */
+private[mllib] class BinaryClassificationPMMLModelExport(
+    model: GeneralizedLinearModel,
+    description: String,
+    normalizationMethod: RegressionNormalizationMethodType,
+    threshold: Double)
+  extends PMMLModelExport {
+
+  populateBinaryClassificationPMML()
+
+  /**
+   * Export the input LogisticRegressionModel or SVMModel to PMML format.
+   *
+   * The header description is always set. When the model has at least one weight, a
+   * classification RegressionModel with two regression tables is attached to `pmml`
+   * (presumably the document provided by PMMLModelExport): category "1" carries the model
+   * intercept plus one numeric predictor per weight, category "0" carries only an intercept
+   * derived from the threshold.
+   */
+  private def populateBinaryClassificationPMML(): Unit = {
+    pmml.getHeader.setDescription(description)
+
+    val numFeatures = model.weights.size
+    if (numFeatures > 0) {
+      val fields = SArray.tabulate(numFeatures)(idx => FieldName.create("field_" + idx))
+      val dataDictionary = new DataDictionary
+      val miningSchema = new MiningSchema
+      val regressionTableYES = new RegressionTable(model.intercept).withTargetCategory("1")
+
+      // Outside LOGIT the threshold is used verbatim. Under LOGIT it is mapped through
+      // -log(1/threshold - 1), with thresholds outside (0, 1) pinned to -1000 / 1000.
+      val interceptNO: Double =
+        if (RegressionNormalizationMethodType.LOGIT != normalizationMethod) {
+          threshold
+        } else if (threshold <= 0) {
+          -1000.0
+        } else if (threshold >= 1) {
+          1000.0
+        } else {
+          -math.log(1 / threshold - 1)
+        }
+      val regressionTableNO = new RegressionTable(interceptNO).withTargetCategory("0")
+
+      val regressionModel = new RegressionModel()
+        .withFunctionName(MiningFunctionType.CLASSIFICATION)
+        .withMiningSchema(miningSchema)
+        .withModelName(description)
+        .withNormalizationMethod(normalizationMethod)
+        .withRegressionTables(regressionTableYES, regressionTableNO)
+
+      // One continuous double input field per weight, named field_0, field_1, ...
+      for ((field, idx) <- fields.zipWithIndex) {
+        dataDictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
+        miningSchema.withMiningFields(
+          new MiningField(field).withUsageType(FieldUsageType.ACTIVE))
+        regressionTableYES.withNumericPredictors(
+          new NumericPredictor(field, model.weights(idx)))
+      }
+
+      // The target is declared last, as a categorical string field.
+      val target = FieldName.create("target")
+      dataDictionary.withDataFields(new DataField(target, OpType.CATEGORICAL, DataType.STRING))
+      miningSchema.withMiningFields(
+        new MiningField(target).withUsageType(FieldUsageType.TARGET))
+
+      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)
+
+      pmml.setDataDictionary(dataDictionary)
+      pmml.withModels(regressionModel)
+    }
+  }
+}
diff --git a/.../src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala b/.../src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.pmml.export
+
+import scala.{Array => SArray}
+
+import org.dmg.pmml._
+
+import org.apache.spark.mllib.regression.GeneralizedLinearModel
+
+/**
+ * PMML Model Export for GeneralizedLinearModel abstract class.
+ * The PMML document is populated once, while the instance is being constructed.
+ *
+ * @param model linear model whose weights and intercept are exported
+ * @param description text used as both the PMML header description and the model name
+ */
+private[mllib] class GeneralizedLinearPMMLModelExport(
+    model: GeneralizedLinearModel,
+    description: String)
+  extends PMMLModelExport {
+
+  populateGeneralizedLinearPMML(model)
+
+  /**
+   * Export the input GeneralizedLinearModel model to PMML format.
+   *
+   * The header description is always set. When the model has at least one weight, a
+   * regression RegressionModel with a single regression table (the model intercept plus one
+   * numeric predictor per weight) is attached to `pmml` (presumably the document provided
+   * by PMMLModelExport).
+   */
+  private def populateGeneralizedLinearPMML(glm: GeneralizedLinearModel): Unit = {
+    pmml.getHeader.setDescription(description)
+
+    val numFeatures = glm.weights.size
+    if (numFeatures > 0) {
+      val fields = SArray.tabulate(numFeatures)(idx => FieldName.create("field_" + idx))
+      val dataDictionary = new DataDictionary
+      val miningSchema = new MiningSchema
+      val regressionTable = new RegressionTable(glm.intercept)
+      val regressionModel = new RegressionModel()
+        .withFunctionName(MiningFunctionType.REGRESSION)
+        .withMiningSchema(miningSchema)
+        .withModelName(description)
+        .withRegressionTables(regressionTable)
+
+      // One continuous double input field per weight, named field_0, field_1, ...
+      for ((field, idx) <- fields.zipWithIndex) {
+        dataDictionary.withDataFields(new DataField(field, OpType.CONTINUOUS, DataType.DOUBLE))
+        miningSchema.withMiningFields(
+          new MiningField(field).withUsageType(FieldUsageType.ACTIVE))
+        regressionTable.withNumericPredictors(new NumericPredictor(field, glm.weights(idx)))
+      }
+
+      // for completeness add target field
+      val target = FieldName.create("target")
+      dataDictionary.withDataFields(new DataField(target, OpType.CONTINUOUS, DataType.DOUBLE))
+      miningSchema.withMiningFields(
+        new MiningField(target).withUsageType(FieldUsageType.TARGET))
+
+      dataDictionary.withNumberOfFields(dataDictionary.getDataFields.size)
+
+      pmml.setDataDictionary(dataDictionary)
+      pmml.withModels(regressionModel)
+    }
+  }
+}