[SPARK-2434][MLlib]: Warning messages that point users to original MLlib implementations added to Examples

[SPARK-2434][MLlib]: Warning messages that refer users to the original MLlib implementations of several popular example machine learning algorithms have been added, both in the comments and in the code. The following examples have been modified:
Scala:
* LocalALS
* LocalFileLR
* LocalKMeans
* LocalLR
* SparkALS
* SparkHdfsLR
* SparkKMeans
* SparkLR
Python:
* kmeans.py
* als.py
* logistic_regression.py

Author: Burak <brkyvz@gmail.com>

Closes apache#1515 from brkyvz/SPARK-2434 and squashes the following commits:

7505da9 [Burak] [SPARK-2434][MLlib]: Warning messages added, scalastyle errors fixed, and added missing punctuation
b96b522 [Burak] [SPARK-2434][MLlib]: Warning messages added and scalastyle errors fixed
4762f39 [Burak] [SPARK-2434]: Warning messages added
17d3d83 [Burak] SPARK-2434: Added warning messages to the naive implementations of the example algorithms
2cb5301 [Burak] SPARK-2434: Warning messages redirecting to original implementations added.
brkyvz authored and mengxr committed Jul 22, 2014
1 parent abeacff commit a4d6020
Showing 11 changed files with 141 additions and 1 deletion.
9 changes: 9 additions & 0 deletions examples/src/main/python/als.py
@@ -16,6 +16,9 @@
#

"""
This is an example implementation of ALS for learning how to use Spark. Please refer to
ALS in pyspark.mllib.recommendation for more conventional use.
This example requires numpy (http://www.numpy.org/)
"""
from os.path import realpath
@@ -49,9 +52,15 @@ def update(i, vec, mat, ratings):


if __name__ == "__main__":

"""
Usage: als [M] [U] [F] [iterations] [slices]
"""

print >> sys.stderr, """WARN: This is a naive implementation of ALS and is given as an
example. Please use the ALS method found in pyspark.mllib.recommendation for more
conventional use."""

sc = SparkContext(appName="PythonALS")
M = int(sys.argv[1]) if len(sys.argv) > 1 else 100
U = int(sys.argv[2]) if len(sys.argv) > 2 else 500
6 changes: 6 additions & 0 deletions examples/src/main/python/kmeans.py
@@ -45,9 +45,15 @@ def closestPoint(p, centers):


if __name__ == "__main__":

if len(sys.argv) != 4:
print >> sys.stderr, "Usage: kmeans <file> <k> <convergeDist>"
exit(-1)

print >> sys.stderr, """WARN: This is a naive implementation of KMeans Clustering and is given
as an example! Please refer to examples/src/main/python/mllib/kmeans.py for an example on
how to use MLlib's KMeans implementation."""

sc = SparkContext(appName="PythonKMeans")
lines = sc.textFile(sys.argv[1])
data = lines.map(parseVector).cache()
6 changes: 6 additions & 0 deletions examples/src/main/python/logistic_regression.py
@@ -47,9 +47,15 @@ def readPointBatch(iterator):
return [matrix]

if __name__ == "__main__":

if len(sys.argv) != 3:
print >> sys.stderr, "Usage: logistic_regression <file> <iterations>"
exit(-1)

print >> sys.stderr, """WARN: This is a naive implementation of Logistic Regression and is
given as an example! Please refer to examples/src/main/python/mllib/logistic_regression.py
to see how MLlib's implementation is used."""

sc = SparkContext(appName="PythonLR")
points = sc.textFile(sys.argv[1]).mapPartitions(readPointBatch).cache()
iterations = int(sys.argv[2])
15 changes: 15 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/LocalALS.scala
@@ -25,6 +25,9 @@ import cern.jet.math._

/**
* Alternating least squares matrix factorization.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.recommendation.ALS
*/
object LocalALS {
// Parameters set through command line arguments
@@ -107,7 +110,16 @@ object LocalALS {
solved2D.viewColumn(0)
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of ALS and is given as an example!
|Please use the ALS method found in org.apache.spark.mllib.recommendation
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

args match {
case Array(m, u, f, iters) => {
M = m.toInt
@@ -120,6 +132,9 @@ object LocalALS {
System.exit(1)
}
}

showWarning()

printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS)

val R = generateR()
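For context, the MLlib alternative that the new warning recommends can be used roughly as follows. This is a minimal sketch, not part of this commit, assuming an existing SparkContext sc; the input path, input format, and parameter values are hypothetical:

import org.apache.spark.mllib.recommendation.{ALS, Rating}

// Parse ratings of the form "user,product,rating" (hypothetical input format)
val ratings = sc.textFile("data/ratings.csv").map { line =>
  val Array(user, product, rating) = line.split(',')
  Rating(user.toInt, product.toInt, rating.toDouble)
}

// Train a matrix factorization model with rank 10 for 10 iterations
val model = ALS.train(ratings, 10, 10)

// Predict the rating of user 1 for product 42
val predicted = model.predict(1, 42)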
17 changes: 17 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala
@@ -21,6 +21,12 @@ import java.util.Random

import breeze.linalg.{Vector, DenseVector}

/**
* Logistic regression based classification.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.classification.LogisticRegression
*/
object LocalFileLR {
val D = 10 // Number of dimensions
val rand = new Random(42)
@@ -32,7 +38,18 @@ object LocalFileLR {
DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of Logistic Regression and is given as an example!
|Please use the LogisticRegression method found in org.apache.spark.mllib.classification
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

showWarning()

val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
val points = lines.map(parsePoint _)
val ITERATIONS = args(1).toInt
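The warning here points to org.apache.spark.mllib.classification; at this time the concrete entry point in that package was LogisticRegressionWithSGD. A minimal sketch of that usage, not part of this commit, assuming an existing SparkContext sc; the input path and iteration count are hypothetical:

import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Parse points of the form "label f1 f2 ... fD" (hypothetical input format)
val points = sc.textFile("data/points.txt").map { line =>
  val nums = line.split(' ').map(_.toDouble)
  LabeledPoint(nums(0), Vectors.dense(nums.tail))
}.cache()

// Train a logistic regression model with SGD for 100 iterations
val model = LogisticRegressionWithSGD.train(points, 100)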
14 changes: 14 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala
@@ -28,6 +28,9 @@ import org.apache.spark.SparkContext._

/**
* K-means clustering.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.clustering.KMeans
*/
object LocalKMeans {
val N = 1000
@@ -61,7 +64,18 @@
bestIndex
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of KMeans Clustering and is given as an example!
|Please use the KMeans method found in org.apache.spark.mllib.clustering
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

showWarning()

val data = generateData
var points = new HashSet[Vector[Double]]
var kPoints = new HashMap[Int, Vector[Double]]
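A minimal sketch of the org.apache.spark.mllib.clustering.KMeans usage that the warning recommends, not part of this commit, assuming an existing SparkContext sc; the input path and parameters are hypothetical:

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Parse whitespace-separated feature vectors (hypothetical input format)
val data = sc.textFile("data/kmeans_data.txt")
  .map(line => Vectors.dense(line.split(' ').map(_.toDouble)))
  .cache()

// Cluster into k = 3 groups with at most 20 iterations
val clusters = KMeans.train(data, 3, 20)
println("Cluster centers: " + clusters.clusterCenters.mkString(", "))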
15 changes: 14 additions & 1 deletion examples/src/main/scala/org/apache/spark/examples/LocalLR.scala
@@ -23,6 +23,9 @@ import breeze.linalg.{Vector, DenseVector}

/**
* Logistic regression based classification.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.classification.LogisticRegression
*/
object LocalLR {
val N = 10000 // Number of data points
@@ -42,9 +45,19 @@ object LocalLR {
Array.tabulate(N)(generatePoint)
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of Logistic Regression and is given as an example!
|Please use the LogisticRegression method found in org.apache.spark.mllib.classification
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {
val data = generateData

showWarning()

val data = generateData
// Initialize w to a random value
var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
println("Initial w: " + w)
16 changes: 16 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/SparkALS.scala
@@ -27,6 +27,9 @@ import org.apache.spark._

/**
* Alternating least squares matrix factorization.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.recommendation.ALS
*/
object SparkALS {
// Parameters set through command line arguments
@@ -87,7 +90,16 @@ object SparkALS {
solved2D.viewColumn(0)
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of ALS and is given as an example!
|Please use the ALS method found in org.apache.spark.mllib.recommendation
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

var slices = 0

val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None)
@@ -103,7 +115,11 @@
System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]")
System.exit(1)
}

showWarning()

printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS)

val sparkConf = new SparkConf().setAppName("SparkALS")
val sc = new SparkContext(sparkConf)

14 changes: 14 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala
@@ -30,6 +30,9 @@ import org.apache.spark.scheduler.InputFormatInfo

/**
* Logistic regression based classification.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.classification.LogisticRegression
*/
object SparkHdfsLR {
val D = 10 // Number of dimensions
@@ -48,12 +51,23 @@ object SparkHdfsLR {
DataPoint(new DenseVector(x), y)
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of Logistic Regression and is given as an example!
|Please use the LogisticRegression method found in org.apache.spark.mllib.classification
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

if (args.length < 2) {
System.err.println("Usage: SparkHdfsLR <file> <iters>")
System.exit(1)
}

showWarning()

val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
val inputPath = args(0)
val conf = SparkHadoopUtil.get.newConfiguration()
15 changes: 15 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala
@@ -24,6 +24,9 @@ import org.apache.spark.SparkContext._

/**
* K-means clustering.
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.clustering.KMeans
*/
object SparkKMeans {

@@ -46,11 +49,23 @@
bestIndex
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of KMeans Clustering and is given as an example!
|Please use the KMeans method found in org.apache.spark.mllib.clustering
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

if (args.length < 3) {
System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
System.exit(1)
}

showWarning()

val sparkConf = new SparkConf().setAppName("SparkKMeans")
val sc = new SparkContext(sparkConf)
val lines = sc.textFile(args(0))
15 changes: 15 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/SparkLR.scala
@@ -28,6 +28,9 @@ import org.apache.spark._
/**
* Logistic regression based classification.
* Usage: SparkLR [slices]
*
* This is an example implementation for learning how to use Spark. For more conventional use,
* please refer to org.apache.spark.mllib.classification.LogisticRegression
*/
object SparkLR {
val N = 10000 // Number of data points
@@ -47,7 +50,18 @@ object SparkLR {
Array.tabulate(N)(generatePoint)
}

def showWarning() {
System.err.println(
"""WARN: This is a naive implementation of Logistic Regression and is given as an example!
|Please use the LogisticRegression method found in org.apache.spark.mllib.classification
|for more conventional use.
""".stripMargin)
}

def main(args: Array[String]) {

showWarning()

val sparkConf = new SparkConf().setAppName("SparkLR")
val sc = new SparkContext(sparkConf)
val numSlices = if (args.length > 0) args(0).toInt else 2
@@ -66,6 +80,7 @@
}

println("Final w: " + w)

sc.stop()
}
}
