Skip to content

Commit

Permalink
Use multiplication instead of sum for default estimates.
Browse files Browse the repository at this point in the history
  • Loading branch information
concretevitamin committed Jul 29, 2014
1 parent 4ef0d26 commit 0ef9e5b
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,24 @@ import org.apache.spark.sql.catalyst.trees
abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
self: Product =>

// TODO: handle overflow?
/**
* Estimates of various statistics. The default estimation logic simply sums up the corresponding
* statistic produced by the children. To override this behavior, override `statistics` and
* assign it a overriden version of `Statistics`.
*/
case class Statistics(
numTuples: Long = childrenStats.map(_.numTuples).sum,
sizeInBytes: Long = childrenStats.map(_.sizeInBytes).sum
/**
* Number of output tuples. For leaf operators this defaults to 1, otherwise it is set to the
* product of children's `numTuples`.
*/
numTuples: Long = childrenStats.map(_.numTuples).product,

/**
* Physical size in bytes. For leaf operators this defaults to 1, otherwise it is set to the
* product of children's `sizeInBytes`.
*/
sizeInBytes: Long = childrenStats.map(_.sizeInBytes).product
)
lazy val statistics: Statistics = new Statistics
lazy val childrenStats = children.map(_.statistics)
Expand Down Expand Up @@ -104,6 +114,8 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] {
abstract class LeafNode extends LogicalPlan with trees.LeafNode[LogicalPlan] {
self: Product =>

override lazy val statistics = Statistics(numTuples = 1L, sizeInBytes = 1L)

// Leaf nodes by definition cannot reference any input attributes.
override def references = Set.empty
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ case class SparkLogicalPlan(alreadyPlanned: SparkPlan)
// If this is wrapping around ExistingRdd and no reasonable estimation logic is implemented,
// return a default value.
sizeInBytes = {
val defaultSum = childrenStats.map(_.sizeInBytes).sum
val naiveVal = childrenStats.map(_.sizeInBytes).product
alreadyPlanned match {
// TODO: Instead of returning a default value here, find a way to return a meaningful
// size estimate for RDDs. See PR 1238 for more discussions.
case e: ExistingRdd if defaultSum == 0 => statsDefaultSizeInBytes
case _ => defaultSum
case e: ExistingRdd if naiveVal == 1L => statsDefaultSizeInBytes
case _ => naiveVal
}
}
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ private[sql] case class ParquetRelation(
sizeInBytes = {
val hdfsPath = new Path(path)
val fs = hdfsPath.getFileSystem(conf.getOrElse(ContextUtil.getConfiguration(new Job())))
fs.getContentSummary(hdfsPath).getLength // TODO: in bytes or system-dependent?
math.max(fs.getContentSummary(hdfsPath).getLength, 1L) // TODO: in bytes or system-dependent?
}
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ private[hive] case class MetastoreRelation
// TODO: check if this estimate is valid for tables after partition pruning.
// Size getters adapted from SizeBasedBigTableSelectorForAutoSMJ.java in Hive (version 0.13).
override val sizeInBytes: Long =
maybeGetSize(hiveConf, hiveQlTable.getProperty("totalSize"), path)
math.max(maybeGetSize(hiveConf, hiveQlTable.getProperty("totalSize"), path), 1L)

private[this] def maybeGetSize(conf: HiveConf, size: String, path: Path): Long = {
val res = try { Some(size.toLong) } catch { case _: Exception => None }
Expand Down

0 comments on commit 0ef9e5b

Please sign in to comment.