Skip to content

Commit

Permalink
allow specifying a random seed in ALS
Browse files Browse the repository at this point in the history
  • Loading branch information
mengxr committed Mar 13, 2014
1 parent 200bef0 commit 4c7cde2
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,15 @@ case class Rating(val user: Int, val product: Int, val rating: Double)
* indicated user
* preferences rather than explicit ratings given to items.
*/
class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var lambda: Double,
var implicitPrefs: Boolean, var alpha: Double)
extends Serializable with Logging
{
class ALS private (
var numBlocks: Int,
var rank: Int,
var iterations: Int,
var lambda: Double,
var implicitPrefs: Boolean,
var alpha: Double,
var seed: Long = System.nanoTime()
) extends Serializable with Logging {
def this() = this(-1, 10, 10, 0.01, false, 1.0)

/**
Expand Down Expand Up @@ -132,6 +137,11 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
this
}

def setSeed(seed: Long): ALS = {
this.seed = seed
this
}

/**
* Run ALS with the configured parameters on an input RDD of (user, product, rating) triples.
* Returns a MatrixFactorizationModel with feature vectors for each user and product.
Expand All @@ -155,7 +165,7 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l

// Initialize user and product factors randomly, but use a deterministic seed for each
// partition so that fault recovery works
val seedGen = new Random()
val seedGen = new Random(seed)
val seed1 = seedGen.nextInt()
val seed2 = seedGen.nextInt()
// Hash an integer to propagate random bits at all positions, similar to java.util.HashTable
Expand Down Expand Up @@ -468,6 +478,7 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l
* Top-level methods for calling Alternating Least Squares (ALS) matrix factorization.
*/
object ALS {

/**
* Train a matrix factorization model given an RDD of ratings given by users to some products,
* in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
Expand All @@ -480,15 +491,39 @@ object ALS {
* @param iterations number of iterations of ALS (recommended: 10-20)
* @param lambda regularization factor (recommended: 0.01)
* @param blocks level of parallelism to split computation into
* @param seed random seed
*/
def train(
ratings: RDD[Rating],
rank: Int,
iterations: Int,
lambda: Double,
blocks: Int)
: MatrixFactorizationModel =
{
blocks: Int,
seed: Long
): MatrixFactorizationModel = {
new ALS(blocks, rank, iterations, lambda, false, 1.0, seed).run(ratings)
}

/**
* Train a matrix factorization model given an RDD of ratings given by users to some products,
* in the form of (userID, productID, rating) pairs. We approximate the ratings matrix as the
* product of two lower-rank matrices of a given rank (number of features). To solve for these
* features, we run a given number of iterations of ALS. This is done using a level of
* parallelism given by `blocks`.
*
* @param ratings RDD of (userID, productID, rating) pairs
* @param rank number of features to use
* @param iterations number of iterations of ALS (recommended: 10-20)
* @param lambda regularization factor (recommended: 0.01)
* @param blocks level of parallelism to split computation into
*/
def train(
ratings: RDD[Rating],
rank: Int,
iterations: Int,
lambda: Double,
blocks: Int
): MatrixFactorizationModel = {
new ALS(blocks, rank, iterations, lambda, false, 1.0).run(ratings)
}

Expand All @@ -505,8 +540,7 @@ object ALS {
* @param lambda regularization factor (recommended: 0.01)
*/
def train(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double)
: MatrixFactorizationModel =
{
: MatrixFactorizationModel = {
train(ratings, rank, iterations, lambda, -1)
}

Expand All @@ -522,8 +556,7 @@ object ALS {
* @param iterations number of iterations of ALS (recommended: 10-20)
*/
def train(ratings: RDD[Rating], rank: Int, iterations: Int)
: MatrixFactorizationModel =
{
: MatrixFactorizationModel = {
train(ratings, rank, iterations, 0.01, -1)
}

Expand All @@ -540,16 +573,42 @@ object ALS {
* @param lambda regularization factor (recommended: 0.01)
* @param blocks level of parallelism to split computation into
* @param alpha confidence parameter (only applies when immplicitPrefs = true)
* @param seed random seed
*/
def trainImplicit(
ratings: RDD[Rating],
rank: Int,
iterations: Int,
lambda: Double,
blocks: Int,
alpha: Double)
: MatrixFactorizationModel =
{
alpha: Double,
seed: Long
): MatrixFactorizationModel = {
new ALS(blocks, rank, iterations, lambda, true, alpha, seed).run(ratings)
}

/**
* Train a matrix factorization model given an RDD of 'implicit preferences' given by users
* to some products, in the form of (userID, productID, preference) pairs. We approximate the
* ratings matrix as the product of two lower-rank matrices of a given rank (number of features).
* To solve for these features, we run a given number of iterations of ALS. This is done using
* a level of parallelism given by `blocks`.
*
* @param ratings RDD of (userID, productID, rating) pairs
* @param rank number of features to use
* @param iterations number of iterations of ALS (recommended: 10-20)
* @param lambda regularization factor (recommended: 0.01)
* @param blocks level of parallelism to split computation into
* @param alpha confidence parameter (only applies when immplicitPrefs = true)
*/
def trainImplicit(
ratings: RDD[Rating],
rank: Int,
iterations: Int,
lambda: Double,
blocks: Int,
alpha: Double
): MatrixFactorizationModel = {
new ALS(blocks, rank, iterations, lambda, true, alpha).run(ratings)
}

Expand All @@ -565,8 +624,8 @@ object ALS {
* @param iterations number of iterations of ALS (recommended: 10-20)
* @param lambda regularization factor (recommended: 0.01)
*/
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double,
alpha: Double): MatrixFactorizationModel = {
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int, lambda: Double, alpha: Double)
: MatrixFactorizationModel = {
trainImplicit(ratings, rank, iterations, lambda, -1, alpha)
}

Expand All @@ -583,8 +642,7 @@ object ALS {
* @param iterations number of iterations of ALS (recommended: 10-20)
*/
def trainImplicit(ratings: RDD[Rating], rank: Int, iterations: Int)
: MatrixFactorizationModel =
{
: MatrixFactorizationModel = {
trainImplicit(ratings, rank, iterations, 0.01, -1, 1.0)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import org.scalatest.FunSuite

import org.jblas._

import org.apache.spark.SparkContext._
import org.apache.spark.mllib.util.LocalSparkContext

object ALSSuite {
Expand Down Expand Up @@ -115,6 +116,18 @@ class ALSSuite extends FunSuite with LocalSparkContext {
testALS(100, 200, 2, 15, 0.7, 0.4, true, false, true)
}

test("pseudorandomness") {
val ratings = sc.parallelize(ALSSuite.generateRatings(10, 20, 5, 0.5, false, false)._1, 2)
val model11 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
val model12 = ALS.train(ratings, 5, 1, 1.0, 2, 1)
val u11 = model11.userFeatures.values.flatMap(_.toList).collect().toList
val u12 = model12.userFeatures.values.flatMap(_.toList).collect().toList
val model2 = ALS.train(ratings, 5, 1, 1.0, 2, 2)
val u2 = model2.userFeatures.values.flatMap(_.toList).collect().toList
assert(u11 == u12)
assert(u11 != u2)
}

/**
* Test if we can correctly factorize R = U * P where U and P are of known rank.
*
Expand Down

0 comments on commit 4c7cde2

Please sign in to comment.