[SPARK-1212] Adding sparse data support and update KMeans #117

Closed
wants to merge 26 commits into from
Changes from all commits (26 commits)
07ffaf2
add dense/sparse vector data models and conversions to/from breeze ve…
mengxr Mar 10, 2014
4e7d5ca
minor style update
mengxr Mar 10, 2014
ab74f67
add fastSquaredDistance for KMeans
mengxr Mar 10, 2014
f355411
add BreezeVectorWithSquaredNorm case class
mengxr Mar 11, 2014
0ff8046
update KMeans to use fastSquaredDistance
mengxr Mar 11, 2014
87bc755
tuned the KMeans code: changed some for loops to while, use view to a…
mengxr Mar 12, 2014
0107e19
update NOTICE
mengxr Mar 12, 2014
3ed1a24
add doc to BreezeVectorWithSquaredNorm
mengxr Mar 12, 2014
a4ace73
Merge branch 'fast-dist' into sparse-kmeans
mengxr Mar 12, 2014
42b4e50
line feed at the end
mengxr Mar 12, 2014
d6e6c07
add predict(RDD[Vector]) to KMeansModel
mengxr Mar 12, 2014
42512f2
Merge branch 'master' into sparse-kmeans
mengxr Mar 12, 2014
6f5cdde
fix a bug in filtering finished runs
mengxr Mar 12, 2014
07c3cf2
change Mahout to breeze in doc
mengxr Mar 18, 2014
27858e4
update breeze version to 0.7
mengxr Mar 19, 2014
712cb88
make Vectors.sparse Java friendly
mengxr Mar 20, 2014
72bde33
clean up code for distance computation
mengxr Mar 20, 2014
e69b10c
remove examples/JavaKMeans.java, which is replaced by mllib/examples/…
mengxr Mar 20, 2014
b28ba2f
add toArray to Vector
mengxr Mar 21, 2014
238ba34
add VectorRDDs with a converter from RDD[Array[Double]]
mengxr Mar 21, 2014
226d2cd
update Java friendly methods in Vectors
mengxr Mar 22, 2014
9bb1b31
optimize SparseVector.toArray
mengxr Mar 22, 2014
1da1033
remove dependency on commons-math3 and compute EPSILON directly
mengxr Mar 22, 2014
67abe31
move ArrayRDDs to mllib.rdd
mengxr Mar 22, 2014
5eda0de
update NOTICE
mengxr Mar 23, 2014
67b368d
fix SparseVector.toArray
mengxr Mar 23, 2014
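
Read together, these commits replace MLlib's Array[Double]-based interface with a Vector abstraction that has dense and sparse implementations, and they move KMeans onto it. A minimal sketch of the resulting user-facing API, as suggested by the commit messages and the diffs below (the object and method names here are illustrative, and the exact overloads may differ):

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

object SparseKMeansSketch {
  // Dense and sparse vectors now share one Vector interface.
  val dense: Vector = Vectors.dense(1.0, 0.0, 3.0)
  // A sparse vector is (size, indices of the non-zeros, their values).
  val sparse: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))

  // KMeans trains on RDD[Vector] instead of RDD[Array[Double]].
  def cluster(data: RDD[Vector]): Double = {
    val model = KMeans.train(data, 2, 20)     // k = 2, maxIterations = 20
    val centers: Array[Vector] = model.clusterCenters
    val assignments = model.predict(data)     // predict(RDD[Vector]) from commit d6e6c07
    model.computeCost(data)                   // sum of squared distances to the centers
  }
}
```

The fastSquaredDistance commits appear to rely on the identity ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * (a dot b), with each vector's squared norm cached (the BreezeVectorWithSquaredNorm case class), so that most of the per-point work in KMeans reduces to a dot product.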
11 changes: 10 additions & 1 deletion NOTICE
@@ -1,5 +1,14 @@
Apache Spark
Copyright 2013 The Apache Software Foundation.
Copyright 2014 The Apache Software Foundation.

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

In addition, this product includes:

- JUnit (http://www.junit.org) is a testing framework for Java. We included it
under the terms of the Eclipse Public License v1.0.

- JTransforms (https://sites.google.com/site/piotrwendykier/software/jtransforms)
provides fast transforms in Java. It is tri-licensed, and we included it under
the terms of the Mozilla Public License v1.1.
138 changes: 0 additions & 138 deletions examples/src/main/java/org/apache/spark/examples/JavaKMeans.java

This file was deleted.

@@ -17,32 +17,33 @@

package org.apache.spark.mllib.examples;

import java.util.regex.Pattern;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;

import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

/**
* Example using MLLib KMeans from Java.
*/
public final class JavaKMeans {

static class ParsePoint implements Function<String, double[]> {
private static class ParsePoint implements Function<String, Vector> {
private static final Pattern SPACE = Pattern.compile(" ");

@Override
public double[] call(String line) {
public Vector call(String line) {
String[] tok = SPACE.split(line);
double[] point = new double[tok.length];
for (int i = 0; i < tok.length; ++i) {
point[i] = Double.parseDouble(tok[i]);
}
return point;
return Vectors.dense(point);
}
}

@@ -65,15 +66,15 @@ public static void main(String[] args) {

JavaSparkContext sc = new JavaSparkContext(args[0], "JavaKMeans",
System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaKMeans.class));
JavaRDD<String> lines = sc.textFile(args[1]);
JavaRDD<String> lines = sc.textFile(inputFile);

JavaRDD<double[]> points = lines.map(new ParsePoint());
JavaRDD<Vector> points = lines.map(new ParsePoint());

KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs);
KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs, KMeans.K_MEANS_PARALLEL());

System.out.println("Cluster centers:");
for (double[] center : model.clusterCenters()) {
System.out.println(" " + Arrays.toString(center));
for (Vector center : model.clusterCenters()) {
System.out.println(" " + center);
}
double cost = model.computeCost(points.rdd());
System.out.println("Cost: " + cost);
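The updated example still parses whitespace-separated dense points. For genuinely sparse input, the Vectors.sparse factory added in this PR could be used instead; a hypothetical sketch (the index:value text format and the parseSparse helper are illustrative, not part of this change):

```scala
import org.apache.spark.mllib.linalg.{Vector, Vectors}

object SparseParseSketch {
  // Hypothetical parser: a line like "0:1.5 3:2.0 7:0.25" becomes a sparse
  // vector of dimension `dim` via the Vectors.sparse factory from this PR.
  def parseSparse(line: String, dim: Int): Vector = {
    val pairs = line.trim.split("\\s+").map { tok =>
      val Array(i, v) = tok.split(":")
      (i.toInt, v.toDouble)
    }
    Vectors.sparse(dim, pairs.map(_._1), pairs.map(_._2))
  }
}
```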
5 changes: 5 additions & 0 deletions mllib/pom.xml
@@ -60,6 +60,11 @@
<artifactId>jblas</artifactId>
<version>1.2.3</version>
</dependency>
<dependency>
<groupId>org.scalanlp</groupId>
<artifactId>breeze_${scala.binary.version}</artifactId>
<version>0.7</version>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
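Breeze is added as the linear-algebra backend behind the new vector types (the first commit mentions conversions to and from breeze vectors). The converters in this PR are internal to mllib, so the following is only an illustration of how the dense and sparse representations might line up with breeze 0.7's DenseVector and SparseVector constructors:

```scala
import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector}

object BreezeBridgeSketch {
  // Rough shape of the dense/sparse mapping onto breeze types; the real
  // converters added by this PR are private[mllib], so this is illustrative.
  def toBreeze(v: Vector): BV[Double] = v match {
    case d: DenseVector  => new BDV[Double](d.toArray)
    case s: SparseVector => new BSV[Double](s.indices, s.values, s.size)
    case other           => sys.error(s"unsupported vector type: ${other.getClass}")
  }
}
```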
@@ -16,14 +16,16 @@
*/

package org.apache.spark.mllib.api.python

import java.nio.{ByteBuffer, ByteOrder}

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.regression._
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.clustering._
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.recommendation._
import org.apache.spark.mllib.regression._
import org.apache.spark.rdd.RDD
import java.nio.ByteBuffer
import java.nio.ByteOrder

/**
* The Java stubs necessary for the Python mllib bindings.
@@ -205,10 +207,10 @@ class PythonMLLibAPI extends Serializable {
def trainKMeansModel(dataBytesJRDD: JavaRDD[Array[Byte]], k: Int,
maxIterations: Int, runs: Int, initializationMode: String):
java.util.List[java.lang.Object] = {
val data = dataBytesJRDD.rdd.map(xBytes => deserializeDoubleVector(xBytes))
val data = dataBytesJRDD.rdd.map(xBytes => Vectors.dense(deserializeDoubleVector(xBytes)))
val model = KMeans.train(data, k, maxIterations, runs, initializationMode)
val ret = new java.util.LinkedList[java.lang.Object]()
ret.add(serializeDoubleMatrix(model.clusterCenters))
ret.add(serializeDoubleMatrix(model.clusterCenters.map(_.toArray)))
ret
}

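The Python stub keeps exchanging flat double arrays; only the boundary changes. Incoming rows are wrapped with Vectors.dense before training, and the cluster centers, now Array[Vector], are unwrapped with toArray before serialization. A minimal sketch of that round trip, with the Py4J serialization helpers elided and the trainFromRows name purely illustrative:

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD

object PythonBoundarySketch {
  // Sketch of the new boundary: plain double arrays in, plain arrays out,
  // with Vector used only on the Scala side.
  def trainFromRows(rows: RDD[Array[Double]], k: Int, maxIterations: Int): Array[Array[Double]] = {
    val data = rows.map(r => Vectors.dense(r))       // wrap each row as a dense Vector
    val model = KMeans.train(data, k, maxIterations)
    model.clusterCenters.map(_.toArray)              // unwrap centers for serialization back to Python
  }
}
```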