Commit
Converted from custom Linalg routines to Breeze: added JavaDoc comments; added Markdown documentation
sboeschhuawei committed Jan 28, 2015
1 parent bea48ea commit 90e7fa4
Showing 7 changed files with 312 additions and 333 deletions.
15 changes: 15 additions & 0 deletions data/mllib/pic-15.txt
@@ -0,0 +1,15 @@
91 2.1419053155 1.919407949 0.0501333631 -0.1069902864 1.2809776381 1.6846227956 0.1827785926 -0.3966434027 0.8090554869 2.4862133924
82 1.8023071497 0.8784870753 2.4105062239 0.3597672178 -0.2096444593 1.3537576979 0.5096503508 1.5507215383 -0.203551002 1.3210160806
73 2.5511476389 1.4385302862 1.4815980608 2.519631079 0.7231682708 0.9160610215 2.2558330058 0.6747272061 0.8267096669 -0.8585851446
64 2.4238069456 -0.3637260241 -0.9646660988 0.0814051561 -1.5488873934 -0.6309606578 0.8779952254 2.2891590718 0.7308611443 1.2574914085
55 0.680085624 -0.7684998593 0.5165496871 0.4900095346 2.116673377 0.9590527985 -0.1076715169 2.8623214176 2.1457411377 -0.0586772049
46 2.0725991339 -0.931744152 1.3010252161 1.2475231583 2.4061568492 -0.5202207204 1.2709294127 1.5612492848 0.470170422 1.5390221915
37 3.2123402142 0.3670664312 -0.8831759122 1.3865659854 1.3258292709 0.0986956805 0.9973196911 0.526040745 0.4520218452 0.9808998515
28 2.6468163883 -0.1070625922 1.5938103927 0.8443353789 1.6632872929 2.2267933607 1.8839698438 1.2217245467 1.919702086 0.2606241814
19 1.8035177495 0.7460582552 0.2361611395 -0.8645567427 -0.8613062 0.4234001189 0.5910061938 1.2484609376 0.5190870451 1.4462120574
10 0.5534111111 1.0456386879 1.7045663273 0.7281759816 1.0807487792 2.2590964696 1.7635098382 2.7220810802 1.1459500541 0.0053369875
911 1.200749626 1.8962364439 2.5117192131 -0.4034737281 -0.9069696484 2.3685654487 0.4403269676 1.7446081537 2.5736655957 2.1280434418
812 0.8079184133 -1.2544936618 1.4398518629 1.6568003266 0.2550498386 2.1994753269 2.7797467522 1.0674041521 2.295064022 0.4173234715
713 1.7688682382 1.4176645502 0.530907764 1.4141481733 1.6630227275 1.8671946375 1.2967008778 1.3215230565 3.2242953581 1.8358482078
614 -0.193302298 1.118805146 1.5580410346 -0.9527104651 2.4960553383 0.2374178113 1.8951776489 0.8173290971 1.929763464 0.5625196402
515 0.895089061 0.3885617561 1.3527646645 -0.1445166108 0.3461682011 3.6770971085 1.1513217164 2.8470372001 1.440743315 1.8773090852
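
Each line above appears to contain an integer point identifier followed by ten coordinate values, separated by whitespace. A small parsing sketch under that assumption (the layout is inferred from the data, not documented in this commit; `parsePoint` is a hypothetical helper):

```scala
import breeze.linalg.{DenseVector => BDV}

// Parse one line: an integer point ID followed by its coordinates
// (layout assumed from the data above, not documented in this commit).
def parsePoint(line: String): (Long, BDV[Double]) = {
  val tokens = line.trim.split("\\s+")
  val id = tokens.head.toLong
  val coords = BDV(tokens.tail.map(_.toDouble))
  (id, coords)
}
```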
30 changes: 30 additions & 0 deletions docs/mllib-pic-clustering.md
@@ -0,0 +1,30 @@
---
layout: global
title: Power Iteration Clustering - MLlib
displayTitle: <a href="mllib-guide.html">MLlib</a> - Power Iteration Clustering
---

* Table of contents
{:toc}


## Power Iteration Clustering

Power iteration clustering is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm:

* computes the Gaussian similarity between all pairs of points and represents these similarities in an affinity matrix
* calculates a normalized affinity matrix
* calculates the principal eigenvalue and eigenvector of the normalized affinity matrix
* clusters each of the input points according to its component in the principal eigenvector

Details of this algorithm can be found in [Power Iteration Clustering, Lin and Cohen](http://www.icml2010.org/papers/387.pdf).
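
To make the first two steps concrete, here is a rough local sketch of building a Gaussian affinity matrix and row-normalizing it with Breeze. It is illustrative only, not the MLlib implementation; the kernel width `sigma` and the helper name `normalizedGaussianAffinity` are assumptions for the example.

```scala
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, sum}

// Build a Gaussian (RBF) affinity matrix from a set of points, then
// row-normalize it so that each row sums to one. Illustrative sketch only;
// `sigma` (the kernel width) is an assumed parameter.
def normalizedGaussianAffinity(points: Array[BDV[Double]], sigma: Double): BDM[Double] = {
  val n = points.length
  val affinity = BDM.zeros[Double](n, n)
  for (i <- 0 until n; j <- 0 until n if i != j) {
    val diff = points(i) - points(j)
    val dist2 = diff dot diff                          // squared Euclidean distance
    affinity(i, j) = math.exp(-dist2 / (2.0 * sigma * sigma))
  }
  for (i <- 0 until n) {                               // degree (row) normalization
    val rowSum = sum(affinity(i, ::).t)
    if (rowSum > 0.0) {
      for (j <- 0 until n) affinity(i, j) /= rowSum
    }
  }
  affinity
}
```

In the distributed implementation these affinities would of course be computed and normalized over an RDD rather than a local matrix.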

Example output from our implementation, for a dataset inspired by the paper but with five clusters instead of three, is shown below:

<p style="text-align: center;">
<img src="img/PIClusteringFiveCirclesInputsAndOutputs.png"
title="Power Iteration Clustering: five-circles example, inputs and outputs"
alt="Power Iteration Clustering: five-circles example, inputs and outputs"
width="50%" />
<!-- Images are downsized intentionally to improve quality on retina displays -->
</p>
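
The heart of the algorithm is the power iteration itself: repeated multiplication of a vector by the normalized affinity matrix until it converges to the principal eigenvector, whose component values are then used to assign points to clusters. A minimal local sketch with Breeze follows; it is illustrative only, and the `maxIterations` and `tol` defaults are assumed values, not those of the MLlib implementation.

```scala
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, norm}

// Power iteration: estimate the principal eigenvalue/eigenvector of a
// square matrix. Illustrative sketch; `maxIterations` and `tol` are
// assumed defaults, not the values used by the MLlib implementation.
def powerIteration(mat: BDM[Double],
                   maxIterations: Int = 100,
                   tol: Double = 1e-8): (Double, BDV[Double]) = {
  require(mat.rows == mat.cols, "matrix must be square")
  var v = BDV.ones[Double](mat.cols) / math.sqrt(mat.cols.toDouble)
  var lambda = 0.0
  var delta = Double.MaxValue
  var iter = 0
  while (iter < maxIterations && delta > tol) {
    val w = mat * v                    // one multiplication by the normalized affinity matrix
    val vNext = w / norm(w)            // re-normalize to unit length
    lambda = vNext dot (mat * vNext)   // Rayleigh quotient estimate of the eigenvalue
    delta = norm(vNext - v)
    v = vNext
    iter += 1
  }
  (lambda, v)
}
```

In practice, the final grouping of the one-dimensional eigenvector components would be done with a simple clustering step such as k-means.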
104 changes: 19 additions & 85 deletions mllib/src/main/scala/org/apache/spark/mllib/clustering/PICLinalg.scala
@@ -22,14 +22,17 @@ import org.apache.spark.mllib.linalg.Vectors
import scala.reflect.ClassTag
import scala.util.Random
import breeze.linalg.{DenseVector => BDV,DenseMatrix => BDM}
import scala.language.existentials
import scala.language.implicitConversions

/**
 * PICLinalg
 *
 * Linear algebra helper routines associated with the PIClustering implementation.
 */

object PICLinalg {

import breeze.linalg.DenseVector._
import breeze.linalg.DenseMatrix._
type DMatrix = BDM[Double]

type LabeledVector = (Long, BDV[Double])
@@ -38,68 +41,17 @@ object PICLinalg {

type Vertices = Seq[LabeledVector]

implicit def bdvToSeq[T](vect: BDV[T])(implicit ct: ClassTag[T]): Seq[T] = vect.toArray.toSeq
implicit def bdvToArray[T](vect: BDV[T])(implicit ct: ClassTag[T]): Array[T] = vect.toArray
// Element-wise vector addition via Breeze
def add(v1: BDV[Double], v2: BDV[Double]) =
v1 + v2

def norm(darr: Array[Double]): Double = {
Math.sqrt(darr.foldLeft(0.0) { case (sum, dval) => sum + Math.pow(dval, 2)})
}

def norm(darr: BDV[Double]): Double = {
darr.norm(2)
}

def mult(v1: BDV[Double], d: Double) = {
v1 * d
}

def mult(v1: BDV[Double], v2: BDV[Double]) = {
v1 * v2
}

def multColByRow(v1: BDV[Double], v2: BDV[Double]) = {
val mat = v1 * v2.t
mat
}

def manhattanNorm(vect: BDV[Double]): Double = {
// Manhattan norm is the L1 norm
Vectors.norm(Vectors.fromBreeze(vect), 1.0)
}

def dot(v1: BDV[Double], v2: BDV[Double]) : Double = {
v1.dot(v2)
}

def onesVector(len: Int): BDV[Double] = {
BDV.ones(len)
}

val calcEigenDiffs = true

def withinTol(d: Double, tol: Double = DefaultTolerance) = Math.abs(d) <= tol

val DefaultTolerance: Double = 1e-8
@@ -112,17 +64,17 @@
}
}

def transpose(mat: DMatrix) = {
mat.t
}

def printMatrix(mat: BDM[Double]): String
= printMatrix(mat, mat.rows, mat.cols)

def printMatrix(mat: BDM[Double], numRows: Int, numCols: Int): String = {
printMatrix(mat.toArray, numRows, numCols)
}

def printMatrix(vectors: Array[BDV[Double]]) : String = {
printMatrix(vectors.map{_.toArray}.flatten, vectors.length, vectors.length)
}

def printMatrix(vect: Array[Double], numRows: Int, numCols: Int): String = {
val darr = vect
val stride = darr.length / numCols
@@ -185,12 +137,13 @@
cnorm = makeNonZero(norm(eigen))
eigen = eigen.map(_ / cnorm)
}
val matDotEigen = mat(::, 0) dot eigen
val signum = Math.signum(matDotEigen)
val lambda = matDotEigen / eigen(0)
eigen = eigen.map(_ * signum)
println(s"lambda=$lambda eigen=${printVector(eigen)}")
if (expLambda.toArray.length > 0) {
val compareVect = eigen.toArray.zip(expdat(::,k)).map { case (a, b) => a / b}
println(s"Ratio to expected: lambda=${lambda / expLambda(k)} " +
s"Vect=${compareVect.mkString("[", ",", "]")}")
}
@@ -216,11 +169,8 @@
}

def deflate(mat: DMatrix, lambda: Double, eigen: BDV[Double]) = {
val eigT = eigen
val projected = (eigen * eigen.t) * lambda
val matOut = mat - projected
println(s"Updated matrix:\n${
printMatrix(mat,
@@ -234,29 +184,12 @@
outMat
}

def scale(mat: DMatrix, d: Double): DMatrix = {
mat * d
}

def matToCols(mat: DMatrix) = {
mat.toArray.grouped(mat.cols)
}

def schurComplement(mat: DMatrix, lambda: Double, eigen: BDV[Double]) = {
@@ -273,9 +206,10 @@
printMatrix(numerat2,
eigen.length, eigen.length)
}")
val denom1 = eig * mat
val denom2 = denom1 * eigen
val denomTmp = denom2.toArray
val denom = denomTmp(0)
println(s"denom is $denom")
val projMat = scale(numerat2, 1.0 / denom)
println(s"Updated matrix:\n${
(Diff truncated; remaining changed files not shown.)
