Skip to content

Commit

Permalink
[SPARK-7438] [SPARK CORE] Fixed validation of relativeSD in countApproxDistinct
Browse files Browse the repository at this point in the history

Author: Vinod K C <vinod.kc@huawei.com>

Closes apache#5974 from vinodkc/fix_countApproxDistinct_Validation and squashes the following commits:

3a3d59c [Vinod K C] Reverted removal of validation relativeSD<0.000017
799976e [Vinod K C] Removed testcase to assert IAE when relativeSD>0.37
8ddbfae [Vinod K C] Remove blank line
b1b00a3 [Vinod K C] Removed relativeSD validation from the Python API; RDD.scala will do the validation
122d378 [Vinod K C] Fixed validation of relativeSD in countApproxDistinct
  • Loading branch information
Vinod K C authored and nemccarthy committed Jun 19, 2015
1 parent 18eac0a commit e0b38b7
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 6 deletions.
7 changes: 4 additions & 3 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1161,8 +1161,8 @@ abstract class RDD[T: ClassTag](
*/
@Experimental
def countApproxDistinct(p: Int, sp: Int): Long = withScope {
require(p >= 4, s"p ($p) must be at least 4")
require(sp <= 32, s"sp ($sp) cannot be greater than 32")
require(p >= 4, s"p ($p) must be >= 4")
require(sp <= 32, s"sp ($sp) must be <= 32")
require(sp == 0 || p <= sp, s"p ($p) cannot be greater than sp ($sp)")
val zeroCounter = new HyperLogLogPlus(p, sp)
aggregate(zeroCounter)(
Expand All @@ -1187,8 +1187,9 @@ abstract class RDD[T: ClassTag](
* It must be greater than 0.000017.
*/
def countApproxDistinct(relativeSD: Double = 0.05): Long = withScope {
require(relativeSD > 0.000017, s"accuracy ($relativeSD) must be greater than 0.000017")
val p = math.ceil(2.0 * math.log(1.054 / relativeSD) / math.log(2)).toInt
countApproxDistinct(p, 0)
countApproxDistinct(if (p < 4) 4 else p, 0)
}

/**
Expand Down
2 changes: 2 additions & 0 deletions core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
val simpleRdd = sc.makeRDD(uniformDistro, 10)
assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.2)
assert(error(simpleRdd.countApproxDistinct(12, 0), size) < 0.1)
assert(error(simpleRdd.countApproxDistinct(0.02), size) < 0.1)
assert(error(simpleRdd.countApproxDistinct(0.5), size) < 0.22)
}

test("SparkContext.union") {
Expand Down
2 changes: 0 additions & 2 deletions python/pyspark/rdd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2239,8 +2239,6 @@ def countApproxDistinct(self, relativeSD=0.05):
"""
if relativeSD < 0.000017:
raise ValueError("relativeSD should be greater than 0.000017")
if relativeSD > 0.37:
raise ValueError("relativeSD should be smaller than 0.37")
# the hash space in Java is 2^32
hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)
return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
Expand Down
1 change: 0 additions & 1 deletion python/pyspark/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,6 @@ def test_count_approx_distinct(self):
self.assertTrue(18 < rdd.map(lambda x: (x, -x)).countApproxDistinct() < 22)

self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.00000001))
self.assertRaises(ValueError, lambda: rdd.countApproxDistinct(0.5))

def test_histogram(self):
# empty
Expand Down

0 comments on commit e0b38b7

Please sign in to comment.