Skip to content

Commit

Permalink
Merge pull request apache#168 from sun-rui/SPARKR-153_2
Browse files Browse the repository at this point in the history
[SPARKR-153] phase 2: implement aggregateByKey() and foldByKey().
  • Loading branch information
concretevitamin committed Feb 18, 2015
2 parents fd8f8a9 + 0cda231 commit 0981dff
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
exportClasses("RDD")
exportClasses("Broadcast")
exportMethods(
"aggregateByKey",
"aggregateRDD",
"cache",
"checkpoint",
Expand All @@ -19,6 +20,7 @@ exportMethods(
"flatMap",
"flatMapValues",
"fold",
"foldByKey",
"foreach",
"foreachPartition",
"fullOuterJoin",
Expand Down
82 changes: 82 additions & 0 deletions pkg/R/pairRDD.R
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,88 @@ setMethod("combineByKey",
lapplyPartition(shuffled, mergeAfterShuffle)
})

#' Aggregate a pair RDD by each key.
#'
#' Aggregate the values of each key in an RDD, using given combine functions
#' and a neutral "zero value". This function can return a different result type,
#' U, than the type of the values in this RDD, V. Thus, we need one operation
#' for merging a V into a U and one operation for merging two U's, The former
#' operation is used for merging values within a partition, and the latter is
#' used for merging values between partitions. To avoid memory allocation, both
#' of these functions are allowed to modify and return their first argument
#' instead of creating a new U.
#'
#' @param rdd An RDD.
#' @param zeroValue A neutral "zero value".
#' @param seqOp A function to aggregate the values of each key. It may return
#' a different result type from the type of the values.
#' @param combOp A function to aggregate results of seqOp.
#' @return An RDD containing the aggregation result.
#' @rdname aggregateByKey
#' @seealso foldByKey, combineByKey
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
#' zeroValue <- list(0, 0)
#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
#' aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
#' # list(list(1, list(3, 2)), list(2, list(7, 2)))
#'}
setGeneric("aggregateByKey",
function(rdd, zeroValue, seqOp, combOp, numPartitions) {
standardGeneric("aggregateByKey")
})

#' @rdname aggregateByKey
#' @aliases aggregateByKey,RDD,ANY,ANY,ANY,integer-method
setMethod("aggregateByKey",
signature(rdd = "RDD", zeroValue = "ANY", seqOp = "ANY",
combOp = "ANY", numPartitions = "integer"),
function(rdd, zeroValue, seqOp, combOp, numPartitions) {
createCombiner <- function(v) {
do.call(seqOp, list(zeroValue, v))
}

combineByKey(rdd, createCombiner, seqOp, combOp, numPartitions)
})

#' Fold a pair RDD by each key.
#'
#' Aggregate the values of each key in an RDD, using an associative function "func"
#' and a neutral "zero value" which may be added to the result an arbitrary
#' number of times, and must not change the result (e.g., 0 for addition, or
#' 1 for multiplication.).
#'
#' @param rdd An RDD.
#' @param zeroValue A neutral "zero value".
#' @param func An associative function for folding values of each key.
#' @return An RDD containing the aggregation result.
#' @rdname foldByKey
#' @seealso aggregateByKey, combineByKey
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
#' foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
#'}
setGeneric("foldByKey",
function(rdd, zeroValue, func, numPartitions) {
standardGeneric("foldByKey")
})

#' @rdname foldByKey
#' @aliases foldByKey,RDD,ANY,ANY,integer-method
setMethod("foldByKey",
signature(rdd = "RDD", zeroValue = "ANY",
func = "ANY", numPartitions = "integer"),
function(rdd, zeroValue, func, numPartitions) {
aggregateByKey(rdd, zeroValue, func, func, numPartitions)
})

############ Binary Functions #############

#' Join two RDDs
Expand Down
71 changes: 71 additions & 0 deletions pkg/inst/tests/test_shuffle.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,77 @@ test_that("combineByKey for doubles", {
expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
})

test_that("aggregateByKey", {
# test aggregateByKey for int keys
rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))

zeroValue <- list(0, 0)
seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)

actual <- collect(aggregatedRDD)

expected <- list(list(1, list(3, 2)), list(2, list(7, 2)))
expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))

# test aggregateByKey for string keys
rdd <- parallelize(sc, list(list("a", 1), list("a", 2), list("b", 3), list("b", 4)))

zeroValue <- list(0, 0)
seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)

actual <- collect(aggregatedRDD)

expected <- list(list("a", list(3, 2)), list("b", list(7, 2)))
expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
})

test_that("foldByKey", {
# test foldByKey for int keys
folded <- foldByKey(intRdd, 0, "+", 2L)

actual <- collect(folded)

expected <- list(list(2L, 101), list(1L, 199))
expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))

# test foldByKey for double keys
folded <- foldByKey(doubleRdd, 0, "+", 2L)

actual <- collect(folded)

expected <- list(list(1.5, 199), list(2.5, 101))
expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))

# test foldByKey for string keys
stringKeyPairs <- list(list("a", -1), list("b", 100), list("b", 1), list("a", 200))

stringKeyRDD <- parallelize(sc, stringKeyPairs)
folded <- foldByKey(stringKeyRDD, 0, "+", 2L)

actual <- collect(folded)

expected <- list(list("b", 101), list("a", 199))
expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))

# test foldByKey for empty pair RDD
rdd <- parallelize(sc, list())
folded <- foldByKey(rdd, 0, "+", 2L)
actual <- collect(folded)
expected <- list()
expect_equal(actual, expected)

# test foldByKey for RDD with only 1 pair
rdd <- parallelize(sc, list(list(1, 1)))
folded <- foldByKey(rdd, 0, "+", 2L)
actual <- collect(folded)
expected <- list(list(1, 1))
expect_equal(actual, expected)
})

test_that("partitionBy() partitions data correctly", {
# Partition by magnitude
partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 }
Expand Down
50 changes: 50 additions & 0 deletions pkg/man/aggregateByKey.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
% Generated by roxygen2 (4.0.2): do not edit by hand
\docType{methods}
\name{aggregateByKey}
\alias{aggregateByKey}
\alias{aggregateByKey,RDD,ANY,ANY,ANY,integer-method}
\title{Aggregate a pair RDD by each key.}
\usage{
aggregateByKey(rdd, zeroValue, seqOp, combOp, numPartitions)

\S4method{aggregateByKey}{RDD,ANY,ANY,ANY,integer}(rdd, zeroValue, seqOp,
combOp, numPartitions)
}
\arguments{
\item{rdd}{An RDD.}

\item{zeroValue}{A neutral "zero value".}

\item{seqOp}{A function to aggregate the values of each key. It may return
a different result type from the type of the values.}

\item{combOp}{A function to aggregate results of seqOp.}
}
\value{
An RDD containing the aggregation result.
}
\description{
Aggregate the values of each key in an RDD, using given combine functions
and a neutral "zero value". This function can return a different result type,
U, than the type of the values in this RDD, V. Thus, we need one operation
for merging a V into a U and one operation for merging two U's, The former
operation is used for merging values within a partition, and the latter is
used for merging values between partitions. To avoid memory allocation, both
of these functions are allowed to modify and return their first argument
instead of creating a new U.
}
\examples{
\dontrun{
sc <- sparkR.init()
rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
zeroValue <- list(0, 0)
seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
# list(list(1, list(3, 2)), list(2, list(7, 2)))
}
}
\seealso{
foldByKey, combineByKey
}
38 changes: 38 additions & 0 deletions pkg/man/foldByKey.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
% Generated by roxygen2 (4.0.2): do not edit by hand
\docType{methods}
\name{foldByKey}
\alias{foldByKey}
\alias{foldByKey,RDD,ANY,ANY,integer-method}
\title{Fold a pair RDD by each key.}
\usage{
foldByKey(rdd, zeroValue, func, numPartitions)

\S4method{foldByKey}{RDD,ANY,ANY,integer}(rdd, zeroValue, func, numPartitions)
}
\arguments{
\item{rdd}{An RDD.}

\item{zeroValue}{A neutral "zero value".}

\item{func}{An associative function for folding values of each key.}
}
\value{
An RDD containing the aggregation result.
}
\description{
Aggregate the values of each key in an RDD, using an associative function "func"
and a neutral "zero value" which may be added to the result an arbitrary
number of times, and must not change the result (e.g., 0 for addition, or
1 for multiplication.).
}
\examples{
\dontrun{
sc <- sparkR.init()
rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
}
}
\seealso{
aggregateByKey, combineByKey
}

0 comments on commit 0981dff

Please sign in to comment.