Skip to content

Commit

Permalink
Fixed validation of relativeSD in countApproxDistinct
Browse files Browse the repository at this point in the history
  • Loading branch information
Vinod K C committed May 7, 2015
1 parent 32cdc81 commit 122d378
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
7 changes: 4 additions & 3 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1151,8 +1151,8 @@ abstract class RDD[T: ClassTag](
*/
@Experimental
def countApproxDistinct(p: Int, sp: Int): Long = withScope {
require(p >= 4, s"p ($p) must be at least 4")
require(sp <= 32, s"sp ($sp) cannot be greater than 32")
require(p >= 4, s"p ($p) must be >= 4")
require(sp <= 32, s"sp ($sp) must be <= 32")
require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
val zeroCounter = new HyperLogLogPlus(p, sp)
aggregate(zeroCounter)(
Expand All @@ -1177,8 +1177,9 @@ abstract class RDD[T: ClassTag](
* It must be greater than 0.000017.
*/
def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope {
// Reject accuracies at or below 0.000017 before deriving a precision from them.
require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
// p = ceil(2 * log2(1.054 / relativeSD)); for large relativeSD (e.g. 0.5) this yields p = 3.
val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
// NOTE(review): the next two lines appear to be the removed/added pair of this diff hunk
// (the +/- markers were lost in extraction) — presumably only the clamped call remains; confirm.
countApproxDistinct(p, 0)
// Clamp p to a minimum of 4, since the (p, sp) overload requires p >= 4.
countApproxDistinct(if (p < 4) 4 else p, 0)
}

/**
Expand Down
2 changes: 2 additions & 0 deletions core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
val simpleRdd = sc.makeRDD(uniformDistro, 10)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1)
assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22)
}

test("SparkContext.union") {
Expand Down

0 comments on commit 122d378

Please sign in to comment.