Skip to content

Commit

Permalink
use static constructor for MLContext
Browse files (browse the repository at this point in the history)
  • Loading branch information
mengxr committed Mar 31, 2014
1 parent 6f59eed commit d088552
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 10 deletions.
23 changes: 16 additions & 7 deletions mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

class MLContext(self: SparkContext) {
/**
* Provides methods related to machine learning on top of [[org.apache.spark.SparkContext]].
*
* @param sparkContext a [[org.apache.spark.SparkContext]] instance
*/
class MLContext(val sparkContext: SparkContext) {
/**
* Reads labeled data in the LIBSVM format into an RDD[LabeledPoint].
* The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR.
Expand All @@ -34,16 +39,16 @@ class MLContext(self: SparkContext) {
* where the feature indices are converted to zero-based.
*
* @param path file or directory path in any Hadoop-supported file system URI
* @param numFeatures number of features, it will be determined from input
* if a non-positive value is given
*@param labelParser parser for labels, default: _.toDouble
* @param numFeatures number of features, which will be determined from the input data if a
* non-positive value is given. The default value is 0.
* @param labelParser parser for labels, default: _.toDouble
* @return labeled data stored as an RDD[LabeledPoint]
*/
def libSVMFile(
path: String,
numFeatures: Int,
numFeatures: Int = 0,
labelParser: String => Double = _.toDouble): RDD[LabeledPoint] = {
val parsed = self.textFile(path).map(_.trim).filter(!_.isEmpty).map(_.split(' '))
val parsed = sparkContext.textFile(path).map(_.trim).filter(!_.isEmpty).map(_.split(' '))
// Determine number of features.
val d = if (numFeatures > 0) {
numFeatures
Expand All @@ -70,5 +75,9 @@ class MLContext(self: SparkContext) {
}

object MLContext {
implicit def sparkContextToMLContext(sc: SparkContext): MLContext = new MLContext(sc)
/**
* Creates an [[org.apache.spark.mllib.MLContext]] instance from
* an [[org.apache.spark.SparkContext]] instance.
*/
def apply(sc: SparkContext): MLContext = new MLContext(sc)
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ import org.scalatest.FunSuite
import com.google.common.base.Charsets
import com.google.common.io.Files

import org.apache.spark.mllib.MLContext._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.LocalSparkContext

Expand All @@ -40,8 +39,10 @@ class MLContextSuite extends FunSuite with LocalSparkContext {
val file = new File(tempDir.getPath, "part-00000")
Files.write(lines, file, Charsets.US_ASCII)

val pointsWithNumFeatures = sc.libSVMFile(tempDir.toURI.toString, 6).collect()
val pointsWithoutNumFeatures = sc.libSVMFile(tempDir.toURI.toString, 0).collect()
val mlc = MLContext(sc)

val pointsWithNumFeatures = mlc.libSVMFile(tempDir.toURI.toString, 6).collect()
val pointsWithoutNumFeatures = mlc.libSVMFile(tempDir.toURI.toString, 0).collect()

for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) {
assert(points.length === 3)
Expand Down

0 comments on commit d088552

Please sign in to comment.