Commit
Converted from custom Linalg routines to Breeze: added JavaDoc comments; added Markdown documentation
sboeschhuawei committed Jan 28, 2015
1 parent bea48ea commit 90e7fa4
Showing 7 changed files with 312 additions and 333 deletions.
15 changes: 15 additions & 0 deletions data/mllib/pic-15.txt
@@ -0,0 +1,15 @@
91 2.1419053155 1.919407949 0.0501333631 -0.1069902864 1.2809776381 1.6846227956 0.1827785926 -0.3966434027 0.8090554869 2.4862133924
82 1.8023071497 0.8784870753 2.4105062239 0.3597672178 -0.2096444593 1.3537576979 0.5096503508 1.5507215383 -0.203551002 1.3210160806
73 2.5511476389 1.4385302862 1.4815980608 2.519631079 0.7231682708 0.9160610215 2.2558330058 0.6747272061 0.8267096669 -0.8585851446
64 2.4238069456 -0.3637260241 -0.9646660988 0.0814051561 -1.5488873934 -0.6309606578 0.8779952254 2.2891590718 0.7308611443 1.2574914085
55 0.680085624 -0.7684998593 0.5165496871 0.4900095346 2.116673377 0.9590527985 -0.1076715169 2.8623214176 2.1457411377 -0.0586772049
46 2.0725991339 -0.931744152 1.3010252161 1.2475231583 2.4061568492 -0.5202207204 1.2709294127 1.5612492848 0.470170422 1.5390221915
37 3.2123402142 0.3670664312 -0.8831759122 1.3865659854 1.3258292709 0.0986956805 0.9973196911 0.526040745 0.4520218452 0.9808998515
28 2.6468163883 -0.1070625922 1.5938103927 0.8443353789 1.6632872929 2.2267933607 1.8839698438 1.2217245467 1.919702086 0.2606241814
19 1.8035177495 0.7460582552 0.2361611395 -0.8645567427 -0.8613062 0.4234001189 0.5910061938 1.2484609376 0.5190870451 1.4462120574
10 0.5534111111 1.0456386879 1.7045663273 0.7281759816 1.0807487792 2.2590964696 1.7635098382 2.7220810802 1.1459500541 0.0053369875
911 1.200749626 1.8962364439 2.5117192131 -0.4034737281 -0.9069696484 2.3685654487 0.4403269676 1.7446081537 2.5736655957 2.1280434418
812 0.8079184133 -1.2544936618 1.4398518629 1.6568003266 0.2550498386 2.1994753269 2.7797467522 1.0674041521 2.295064022 0.4173234715
713 1.7688682382 1.4176645502 0.530907764 1.4141481733 1.6630227275 1.8671946375 1.2967008778 1.3215230565 3.2242953581 1.8358482078
614 -0.193302298 1.118805146 1.5580410346 -0.9527104651 2.4960553383 0.2374178113 1.8951776489 0.8173290971 1.929763464 0.5625196402
515 0.895089061 0.3885617561 1.3527646645 -0.1445166108 0.3461682011 3.6770971085 1.1513217164 2.8470372001 1.440743315 1.8773090852
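
Each line above appears to contain an integer point identifier followed by ten coordinate values, separated by whitespace. A small parsing sketch under that assumption (the layout is inferred from the data, not documented in this commit; `parsePoint` is a hypothetical helper):

```scala
import breeze.linalg.{DenseVector => BDV}

// Parse one line: an integer point ID followed by its coordinates
// (layout assumed from the data above, not documented in this commit).
def parsePoint(line: String): (Long, BDV[Double]) = {
  val tokens = line.trim.split("\\s+")
  val id = tokens.head.toLong
  val coords = BDV(tokens.tail.map(_.toDouble))
  (id, coords)
}
```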
30 changes: 30 additions & 0 deletions docs/mllib-pic-clustering.md
@@ -0,0 +1,30 @@
---
layout: global
title: Power Iteration Clustering - MLlib
displayTitle: <a href="mllib-guide.html">MLlib</a> - Power Iteration Clustering
---

* Table of contents
{:toc}


## Power Iteration Clustering

Power iteration clustering is a scalable and efficient algorithm for clustering points given pointwise mutual affinity values. Internally the algorithm:

* computes the Gaussian similarity between all pairs of points and represents these similarities in an affinity matrix
* calculates a normalized affinity matrix
* calculates the principal eigenvalue and eigenvector of the normalized affinity matrix
* clusters each of the input points according to its component in the principal eigenvector

Details of this algorithm can be found in [Power Iteration Clustering, Lin and Cohen](http://www.icml2010.org/papers/387.pdf).
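
To make the first two steps concrete, here is a rough local sketch of building a Gaussian affinity matrix and row-normalizing it with Breeze. It is illustrative only, not the MLlib implementation; the kernel width `sigma` and the helper name `normalizedGaussianAffinity` are assumptions for the example.

```scala
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, sum}

// Build a Gaussian (RBF) affinity matrix from a set of points, then
// row-normalize it so that each row sums to one. Illustrative sketch only;
// `sigma` (the kernel width) is an assumed parameter.
def normalizedGaussianAffinity(points: Array[BDV[Double]], sigma: Double): BDM[Double] = {
  val n = points.length
  val affinity = BDM.zeros[Double](n, n)
  for (i <- 0 until n; j <- 0 until n if i != j) {
    val diff = points(i) - points(j)
    val dist2 = diff dot diff                          // squared Euclidean distance
    affinity(i, j) = math.exp(-dist2 / (2.0 * sigma * sigma))
  }
  for (i <- 0 until n) {                               // degree (row) normalization
    val rowSum = sum(affinity(i, ::).t)
    if (rowSum > 0.0) {
      for (j <- 0 until n) affinity(i, j) /= rowSum
    }
  }
  affinity
}
```

In the distributed implementation these affinities would of course be computed and normalized over an RDD rather than a local matrix.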

Example output from our implementation, for a dataset inspired by the paper but with five clusters instead of three, is shown below:

<p style="text-align: center;">
<img src="img/PIClusteringFiveCirclesInputsAndOutputs.png"
title="Power Iteration Clustering: five-circles example, inputs and outputs"
alt="Power Iteration Clustering: five-circles example, inputs and outputs"
width="50%" />
<!-- Images are downsized intentionally to improve quality on retina displays -->
</p>
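
The heart of the algorithm is the power iteration itself: repeated multiplication of a vector by the normalized affinity matrix until it converges to the principal eigenvector, whose component values are then used to assign points to clusters. A minimal local sketch with Breeze follows; it is illustrative only, and the `maxIterations` and `tol` defaults are assumed values, not those of the MLlib implementation.

```scala
import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, norm}

// Power iteration: estimate the principal eigenvalue/eigenvector of a
// square matrix. Illustrative sketch; `maxIterations` and `tol` are
// assumed defaults, not the values used by the MLlib implementation.
def powerIteration(mat: BDM[Double],
                   maxIterations: Int = 100,
                   tol: Double = 1e-8): (Double, BDV[Double]) = {
  require(mat.rows == mat.cols, "matrix must be square")
  var v = BDV.ones[Double](mat.cols) / math.sqrt(mat.cols.toDouble)
  var lambda = 0.0
  var delta = Double.MaxValue
  var iter = 0
  while (iter < maxIterations && delta > tol) {
    val w = mat * v                    // one multiplication by the normalized affinity matrix
    val vNext = w / norm(w)            // re-normalize to unit length
    lambda = vNext dot (mat * vNext)   // Rayleigh quotient estimate of the eigenvalue
    delta = norm(vNext - v)
    v = vNext
    iter += 1
  }
  (lambda, v)
}
```

In practice, the final grouping of the one-dimensional eigenvector components would be done with a simple clustering step such as k-means.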
104 changes: 19 additions & 85 deletions mllib/src/main/scala/org/apache/spark/mllib/clustering/PICLinalg.scala
@@ -22,14 +22,17 @@ import org.apache.spark.mllib.linalg.Vectors
import scala.reflect.ClassTag
import scala.util.Random
import breeze.linalg.{DenseVector => BDV,DenseMatrix => BDM}
import scala.language.existentials
import scala.language.implicitConversions

/**
 * PICLinalg
 *
 * Linear algebra helper routines associated with the PIClustering implementation.
 */

object PICLinalg {

import breeze.linalg.DenseVector._
import breeze.linalg.DenseMatrix._
type DMatrix = BDM[Double]

type LabeledVector = (Long, BDV[Double])
@@ -38,68 +41,17 @@ object PICLinalg {

type Vertices = Seq[LabeledVector]

implicit def bdvToSeq[T](vect: BDV[T])(implicit ct: ClassTag[T]): Seq[T] = vect.toArray.toSeq
implicit def bdvToArray[T](vect: BDV[T])(implicit ct: ClassTag[T]): Array[T] = vect.toArray
// Element-wise vector addition via Breeze
def add(v1: BDV[Double], v2: BDV[Double]) =
v1 + v2

def norm(darr: Array[Double]): Double = {
Math.sqrt(darr.foldLeft(0.0) { case (sum, dval) => sum + Math.pow(dval, 2)})
}

def norm(darr: BDV[Double]): Double = {
darr.norm(2)
}

def mult(v1: BDV[Double], d: Double) = {
v1 * d
}

def mult(v1: BDV[Double], v2: BDV[Double]) = {
v1 * v2
}

def multColByRow(v1: BDV[Double], v2: BDV[Double]) = {
val mat = v1 * v2.t
mat
}

def manhattanNorm(vect: BDV[Double]): Double = {
// Manhattan norm is the L1 norm
Vectors.norm(Vectors.fromBreeze(vect), 1.0)
}

def dot(v1: BDV[Double], v2: BDV[Double]) : Double = {
v1.dot(v2)
}

def onesVector(len: Int): BDV[Double] = {
BDV.ones(len)
}

val calcEigenDiffs = true

def withinTol(d: Double, tol: Double = DefaultTolerance) = Math.abs(d) <= tol

val DefaultTolerance: Double = 1e-8
@@ -112,17 +64,17 @@
}
}

def transpose(mat: DMatrix) = {
mat.t
}

def printMatrix(mat: BDM[Double]): String
= printMatrix(mat, mat.rows, mat.cols)

def printMatrix(mat: BDM[Double], numRows: Int, numCols: Int): String = {
printMatrix(mat.toArray, numRows, numCols)
}

def printMatrix(vectors: Array[BDV[Double]]) : String = {
printMatrix(vectors.map{_.toArray}.flatten, vectors.length, vectors.length)
}

def printMatrix(vect: Array[Double], numRows: Int, numCols: Int): String = {
val darr = vect
val stride = darr.length / numCols
@@ -185,12 +137,13 @@
cnorm = makeNonZero(norm(eigen))
eigen = eigen.map(_ / cnorm)
}
val matDotEigen = mat(::, 0) dot eigen
val signum = Math.signum(matDotEigen)
val lambda = matDotEigen / eigen(0)
eigen = eigen.map(_ * signum)
println(s"lambda=$lambda eigen=${printVector(eigen)}")
if (expLambda.toArray.length > 0) {
val compareVect = eigen.toArray.zip(expdat(::,k)).map { case (a, b) => a / b}
println(s"Ratio to expected: lambda=${lambda / expLambda(k)} " +
s"Vect=${compareVect.mkString("[", ",", "]")}")
}
@@ -216,11 +169,8 @@
}

def deflate(mat: DMatrix, lambda: Double, eigen: BDV[Double]) = {
val eigT = eigen
val projected = (eigen * eigen.t) * lambda
val matOut = mat - projected
println(s"Updated matrix:\n${
printMatrix(mat,
@@ -234,29 +184,12 @@
outMat
}

def scale(mat: DMatrix, d: Double): DMatrix = {
mat * d
}

def matToCols(mat: DMatrix) = {
mat.toArray.grouped(mat.cols)
}

def schurComplement(mat: DMatrix, lambda: Double, eigen: BDV[Double]) = {
@@ -273,9 +206,10 @@
printMatrix(numerat2,
eigen.length, eigen.length)
}")
val denom1 = eig * mat
val denom2 = denom1 * eigen
val denomTmp = denom2.toArray
val denom = denomTmp(0)
println(s"denom is $denom")
val projMat = scale(numerat2, 1.0 / denom)
println(s"Updated matrix:\n${
(Diff truncated; remaining changed files not shown.)
