Skip to content

Commit

Permalink
Example usage for StreamingKMeans
Browse files Browse the repository at this point in the history
  • Loading branch information
freeman-lab committed Oct 25, 2014
1 parent f33684b commit 5db7074
Showing 1 changed file with 75 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.clustering.StreamingKMeans
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
* Estimate clusters on one stream of data and make predictions
* on another stream, where the data streams arrive as text files
* into two different directories.
*
* The rows of the text files must be vector data in the form
* `[x1,x2,x3,...,xn]`
* Where n is the number of dimensions. n must be the same for train and test.
*
* Usage: StreamingKmeans <trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>
*
* To run on your local machine using the two directories `trainingDir` and `testDir`,
* with updates every 5 seconds, 2 dimensions per data point, and 3 clusters, call:
* $ bin/run-example \
* org.apache.spark.examples.mllib.StreamingKMeans trainingDir testDir 5 3 2
*
* As you add text files to `trainingDir` the clusters will continuously update.
* Anytime you add text files to `testDir`, you'll see predicted labels using the current model.
*
*/
object StreamingKMeans {

def main(args: Array[String]) {

if (args.length != 5) {
System.err.println(
"Usage: StreamingKMeans " +
"<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>")
System.exit(1)
}

val conf = new SparkConf().setMaster("local").setAppName("StreamingLinearRegression")
val ssc = new StreamingContext(conf, Seconds(args(2).toLong))

val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse)
val testData = ssc.textFileStream(args(1)).map(Vectors.parse)

val model = new StreamingKMeans()
.setK(args(3).toInt)
.setDecayFactor(1.0)
.setRandomCenters(args(4).toInt)

model.trainOn(trainingData)
model.predictOn(testData).print()

ssc.start()
ssc.awaitTermination()

}

}

0 comments on commit 5db7074

Please sign in to comment.