Merge pull request #8 from marmbrus/testImprovment
A bunch of small improvements to testing and golden answer generation.
rxin committed Jan 12, 2014
2 parents 0d2388b + 86355a6 commit 8a8b521
Showing 15 changed files with 299 additions and 79 deletions.
58 changes: 32 additions & 26 deletions Makefile
@@ -1,54 +1,60 @@
 all: a b c d e f g h i j k l m n o p q r s t u v w x y z
 
+buildWhiteList:
+	sbt -Dshark.hive.alltests "test-only catalyst.execution.HiveCompatibility"
+
+findBroken:
+	sbt -Dshark.hive.alltests -Dshark.hive.failFast "test-only catalyst.execution.HiveCompatibility"
+
 a:
-	sbt -Dshark.hive.whitelist=a.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=a.* "test-only catalyst.execution.HiveCompatibility"
 b:
-	sbt -Dshark.hive.whitelist=b.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=b.* "test-only catalyst.execution.HiveCompatibility"
 c:
-	sbt -Dshark.hive.whitelist=c.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=c.* "test-only catalyst.execution.HiveCompatibility"
 d:
-	sbt -Dshark.hive.whitelist=d.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=d.* "test-only catalyst.execution.HiveCompatibility"
 e:
-	sbt -Dshark.hive.whitelist=e.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=e.* "test-only catalyst.execution.HiveCompatibility"
 f:
-	sbt -Dshark.hive.whitelist=f.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=f.* "test-only catalyst.execution.HiveCompatibility"
 g:
-	sbt -Dshark.hive.whitelist=g.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=g.* "test-only catalyst.execution.HiveCompatibility"
 h:
-	sbt -Dshark.hive.whitelist=h.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=h.* "test-only catalyst.execution.HiveCompatibility"
 i:
-	sbt -Dshark.hive.whitelist=i.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=i.* "test-only catalyst.execution.HiveCompatibility"
 j:
-	sbt -Dshark.hive.whitelist=j.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=j.* "test-only catalyst.execution.HiveCompatibility"
 k:
-	sbt -Dshark.hive.whitelist=k.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=k.* "test-only catalyst.execution.HiveCompatibility"
 l:
-	sbt -Dshark.hive.whitelist=l.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=l.* "test-only catalyst.execution.HiveCompatibility"
 m:
-	sbt -Dshark.hive.whitelist=m.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=m.* "test-only catalyst.execution.HiveCompatibility"
 n:
-	sbt -Dshark.hive.whitelist=n.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=n.* "test-only catalyst.execution.HiveCompatibility"
 o:
-	sbt -Dshark.hive.whitelist=o.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=o.* "test-only catalyst.execution.HiveCompatibility"
 p:
-	sbt -Dshark.hive.whitelist=p.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=p.* "test-only catalyst.execution.HiveCompatibility"
 q:
-	sbt -Dshark.hive.whitelist=q.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=q.* "test-only catalyst.execution.HiveCompatibility"
 r:
-	sbt -Dshark.hive.whitelist=r.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=r.* "test-only catalyst.execution.HiveCompatibility"
 s:
-	sbt -Dshark.hive.whitelist=s.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=s.* "test-only catalyst.execution.HiveCompatibility"
 t:
-	sbt -Dshark.hive.whitelist=t.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=t.* "test-only catalyst.execution.HiveCompatibility"
 u:
-	sbt -Dshark.hive.whitelist=u.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=u.* "test-only catalyst.execution.HiveCompatibility"
 v:
-	sbt -Dshark.hive.whitelist=v.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=v.* "test-only catalyst.execution.HiveCompatibility"
 w:
-	sbt -Dshark.hive.whitelist=w.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=w.* "test-only catalyst.execution.HiveCompatibility"
 x:
-	sbt -Dshark.hive.whitelist=x.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=x.* "test-only catalyst.execution.HiveCompatibility"
 y:
-	sbt -Dshark.hive.whitelist=y.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=y.* "test-only catalyst.execution.HiveCompatibility"
 z:
-	sbt -Dshark.hive.whitelist=z.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=z.* "test-only catalyst.execution.HiveCompatibility"
2 changes: 1 addition & 1 deletion build.sbt
@@ -14,7 +14,7 @@ resolvers += "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/
 
 libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.0-incubating-SNAPSHOT"
 
-libraryDependencies += "catalyst" % "hive-golden" % "2" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden2.jar"
+libraryDependencies += "catalyst" % "hive-golden" % "3" % "test" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden3.jar"
 
 // Hive 0.10.0 relies on a weird version of jdo that is not published anywhere... Remove when we upgrade to 0.11.0
 libraryDependencies += "javax.jdo" % "jdo2-api" % "2.3-ec" from "http://www.datanucleus.org/downloads/maven2/javax/jdo/jdo2-api/2.3-ec/jdo2-api-2.3-ec.jar"
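
The golden-answer artifact is bumped from 2 to 3 and, notably, is now scoped to the test configuration. A minimal sketch of what the % "test" modifier means in sbt (the artifact below is hypothetical, for illustration only):

// A dependency tagged % "test" is resolved onto the test classpath only, so
// main sources cannot compile against it; golden answers stay a test-only concern.
libraryDependencies += "org.example" % "golden-fixtures" % "1.0" % "test"
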
6 changes: 4 additions & 2 deletions src/main/scala/catalyst/execution/MetastoreCatalog.scala
@@ -5,6 +5,7 @@ import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.metastore.api.{FieldSchema, Partition, Table, StorageDescriptor, SerDeInfo}
 import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
 import org.apache.hadoop.hive.ql.plan.TableDesc
+import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.serde2.AbstractDeserializer
 import org.apache.hadoop.mapred.InputFormat
 
@@ -21,7 +22,7 @@ class HiveMetastoreCatalog(hiveConf: HiveConf) extends Catalog {
 
   def lookupRelation(name: String, alias: Option[String]): BaseRelation = {
     val (databaseName, tableName) = name.split("\\.") match {
-      case Array(tableOnly) => ("default", tableOnly)
+      case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly)
       case Array(db, table) => (db, table)
     }
     val table = client.getTable(databaseName, tableName)
@@ -46,7 +47,7 @@ class HiveMetastoreCatalog(hiveConf: HiveConf) extends Catalog {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     case InsertIntoCreatedTable(name, child) =>
       val (databaseName, tableName) = name.split("\\.") match {
-        case Array(tableOnly) => ("default", tableOnly)
+        case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly)
         case Array(db, table) => (db, table)
       }
 
@@ -81,6 +82,7 @@ object HiveMetatoreTypes {
   def toDataType(metastoreType: String): DataType =
     metastoreType match {
       case "string" => StringType
+      case "float" => FloatType
       case "int" => IntegerType
       case "double" => DoubleType
      case "bigint" => LongType
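
A self-contained sketch of the new name-resolution behavior (resolveName is a hypothetical helper, not part of the patch; assumes an active Hive SessionState):

import org.apache.hadoop.hive.ql.session.SessionState

// Unqualified names now resolve against the session's current database
// (whatever the last USE statement selected) rather than a hardcoded "default".
def resolveName(name: String): (String, String) = name.split("\\.") match {
  case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly)
  case Array(db, table) => (db, table)
}

// After "USE mydb":  resolveName("src")       == ("mydb", "src")
// Qualified names are untouched: resolveName("other.src") == ("other", "src")
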
6 changes: 3 additions & 3 deletions src/main/scala/catalyst/execution/SharkInstance.scala
@@ -36,7 +36,7 @@ abstract class SharkInstance extends Logging {
   def metastorePath: String
 
   /** The SharkContext */
-  lazy val sc = createContext()
+  lazy val sc: SharkContext = createContext()
 
   protected def createContext(): SharkContext = {
     SharkEnv.initWithSharkContext("catalyst.execution", master)
@@ -45,8 +45,8 @@ abstract class SharkInstance extends Logging {
   /** Sets up the system initially or after a RESET command */
   protected def configure() {
     // TODO: refactor this so we can work with other databases.
-    runSqlHive("set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + metastorePath +
-      ";create=true")
+    runSqlHive(
+      s"set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=$metastorePath;create=true")
     runSqlHive("set hive.metastore.warehouse.dir=" + warehousePath)
   }

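The switch to string interpolation is behavior-preserving; with an illustrative value it expands as:

val metastorePath = "/tmp/catalyst-test-metastore"  // illustrative value, not from the patch
s"set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=$metastorePath;create=true"
// == "set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=/tmp/catalyst-test-metastore;create=true"
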
10 changes: 9 additions & 1 deletion src/main/scala/catalyst/execution/TestShark.scala
@@ -85,7 +85,7 @@ object TestShark extends SharkInstance {
    * hive test cases assume the system is set up.
    */
   private def rewritePaths(cmd: String): String =
-    if (cmd.toUpperCase startsWith "LOAD")
+    if (cmd.toUpperCase contains "LOAD DATA")
       cmd.replaceAll("\\.\\.", hiveDevHome.getCanonicalPath)
     else
       cmd
@@ -234,6 +234,8 @@ object TestShark extends SharkInstance {
     // For some reason, RESET does not reset the following variables...
     runSqlHive("set datanucleus.cache.collections=true")
     runSqlHive("set datanucleus.cache.collections.lazy=true")
+    // Lots of tests fail if we do not change the partition whitelist from the default.
+    runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*")
 
     loadedTables.clear()
     catalog.client.getAllTables("default").foreach { t =>
@@ -261,6 +263,12 @@
       configure()
 
       runSqlHive("USE default")
+
+      // Just loading src makes a lot of tests pass. This is because some tests do something like
+      // drop an index on src at the beginning. Since we just pass DDL to hive this bypasses our
+      // Analyzer and thus the test table auto-loading mechanism.
+      // Remove after we handle more DDL operations natively.
+      loadTestTable("src")
     } catch {
       case e: Exception =>
         logger.error(s"FATAL ERROR: Failed to reset TestDB state. $e")
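
A sketch of what the widened check buys (the commands below are illustrative): startsWith "LOAD" missed commands where LOAD DATA appears mid-string.

// Both commands now get ".." rewritten to the Hive dev home; the second was
// previously skipped because it does not start with "LOAD".
rewritePaths("LOAD DATA LOCAL INPATH '../data/files/kv1.txt' INTO TABLE src")
rewritePaths("EXPLAIN LOAD DATA LOCAL INPATH '../data/files/kv1.txt' INTO TABLE src")
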
91 changes: 64 additions & 27 deletions src/main/scala/catalyst/frontend/Hive.scala
@@ -82,6 +82,7 @@ object HiveQl {
     "TOK_ALTERINDEX_REBUILD",
     "TOK_ALTERTABLE_ADDCOLS",
     "TOK_ALTERTABLE_ADDPARTS",
+    "TOK_ALTERTABLE_ALTERPARTS",
     "TOK_ALTERTABLE_ARCHIVE",
     "TOK_ALTERTABLE_CLUSTER_SORT",
     "TOK_ALTERTABLE_DROPPARTS",
@@ -90,6 +91,7 @@
     "TOK_ALTERTABLE_RENAME",
     "TOK_ALTERTABLE_RENAMECOL",
     "TOK_ALTERTABLE_REPLACECOLS",
+    "TOK_ALTERTABLE_SKEWED",
     "TOK_ALTERTABLE_TOUCH",
     "TOK_ALTERTABLE_UNARCHIVE",
     "TOK_ANALYZE",
@@ -103,6 +105,7 @@
 
     // TODO(marmbrus): Figure out how view are expanded by hive, as we might need to handle this.
     "TOK_ALTERVIEW_ADDPARTS",
+    "TOK_ALTERVIEW_AS",
     "TOK_ALTERVIEW_DROPPARTS",
     "TOK_ALTERVIEW_PROPERTIES",
     "TOK_ALTERVIEW_RENAME",
@@ -254,7 +257,7 @@
   /** Extractor for matching Hive's AST Tokens. */
   object Token {
     /** @return matches of the form (tokenName, children). */
-    def unapply(t: Any) = t match {
+    def unapply(t: Any): Option[(String, Seq[ASTNode])] = t match {
       case t: ASTNode =>
         Some((t.getText, Option(t.getChildren).map(_.toList).getOrElse(Nil).asInstanceOf[Seq[ASTNode]]))
       case _ => None
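
For readers unfamiliar with Scala extractors, a minimal sketch of how Token is used throughout this file (describe is a hypothetical helper, not in the patch):

// Any ASTNode can be matched by token name with its children bound in one
// pattern; the new explicit return type documents the extractor's shape.
def describe(node: Any): String = node match {
  case Token("TOK_QUERY", children) => s"query with ${children.size} clause(s)"
  case Token(name, Nil)             => s"leaf token $name"
  case _                            => "not an ASTNode"
}
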
@@ -339,8 +342,8 @@
   }
 
   protected def nodeToPlan(node: Node): LogicalPlan = node match {
-    // Just fake explain on create function...
-    case Token("TOK_EXPLAIN", Token("TOK_CREATEFUNCTION", _) :: Nil) => NoRelation
+    // Just fake explain for any of the native commands.
+    case Token("TOK_EXPLAIN", Token(explainType, _) :: Nil) if nativeCommands contains explainType => NoRelation
     case Token("TOK_EXPLAIN", explainArgs) =>
       // Ignore FORMATTED if present.
       val Some(query) :: _ :: _ :: Nil = getClauses(Seq("TOK_QUERY", "FORMATTED", "EXTENDED"), explainArgs)
Expand All @@ -362,24 +365,29 @@ object HiveQl {

// Return one query for each insert clause.
val queries = insertClauses.map { case Token("TOK_INSERT", singleInsert) =>
val (Some(destClause) ::
Some(selectClause) ::
val (
intoClause ::
destClause ::
selectClause ::
selectDistinctClause ::
whereClause ::
groupByClause ::
orderByClause ::
sortByClause ::
limitClause :: Nil) = getClauses(Seq("TOK_DESTINATION", "TOK_SELECT", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert)
limitClause :: Nil) = getClauses(Seq("TOK_INSERT_INTO", "TOK_DESTINATION", "TOK_SELECT", "TOK_SELECTDI", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert)

val relations = nodeToRelation(fromClause)
val withWhere = whereClause.map { whereNode =>
val Seq(whereExpr) = whereNode.getChildren().toSeq
Filter(nodeToExpr(whereExpr), relations)
}.getOrElse(relations)

val select =
(selectClause orElse selectDistinctClause).getOrElse(sys.error("No select clause."))

// Script transformations are expressed as a select clause with a single expression of type
// TOK_TRANSFORM
val transformation = selectClause.getChildren.head match {
val transformation = select.getChildren.head match {
case Token("TOK_SELEXPR",
Token("TOK_TRANSFORM",
Token("TOK_EXPLIST", inputExprs) ::
Expand All @@ -400,28 +408,39 @@ object HiveQl {
// a script transformation.
val withProject = transformation.getOrElse {
// Not a transformation so must be either project or aggregation.
val selectExpressions = nameExpressions(selectClause.getChildren.flatMap(selExprNodeToExpr))
val selectExpressions = nameExpressions(select.getChildren.flatMap(selExprNodeToExpr))

groupByClause match {
case Some(groupBy) => Aggregate(groupBy.getChildren.map(nodeToExpr), selectExpressions, withWhere)
case None => Project(selectExpressions, withWhere)
}
}

val withDistinct =
if(selectDistinctClause.isDefined)
Distinct(withProject)
else
withProject

require(!(orderByClause.isDefined && sortByClause.isDefined), "Can't have both a sort by and order by.")
// Right now we treat sorting and ordering as identical.
val withSort =
(orderByClause orElse sortByClause)
.map(_.getChildren.map(nodeToSortOrder))
.map(Sort(_, withProject))
.getOrElse(withProject)
.map(Sort(_, withDistinct))
.getOrElse(withDistinct)
val withLimit =
limitClause.map(l => nodeToExpr(l.getChildren.head))
.map(StopAfter(_, withSort))
.getOrElse(withSort)

// There are two tokens for specifying where to sent the result that seem to be used almost
// interchangeably.
val resultDestination =
(intoClause orElse destClause).getOrElse(sys.error("No destination found."))

nodeToDest(
destClause,
resultDestination,
withLimit)
}

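A rough sketch of the contract getClauses appears to follow, inferred from its call sites here (this is not the actual implementation):

// For each requested token name, yield Some(matching child) or None,
// preserving request order so call sites can destructure positionally with ::.
def getClausesSketch(names: Seq[String], children: Seq[ASTNode]): List[Option[ASTNode]] =
  names.map(name => children.find(_.getText == name)).toList
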
Expand All @@ -440,12 +459,34 @@ object HiveQl {
query :: Token(alias, Nil) :: Nil) =>
Subquery(alias, nodeToPlan(query))

/* Table, No Alias */
case Token("TOK_TABREF",
Token("TOK_TABNAME",
tableNameParts) :: Nil) =>
val tableName = tableNameParts.map { case Token(part, Nil) => part }.mkString(".")
UnresolvedRelation(tableName, None)
/* All relations, possibly with aliases or sampling clauses. */
case Token("TOK_TABREF", clauses) =>
// If the last clause is not a token then it's the alias of the table.
val (nonAliasClauses, aliasClause) =
if(clauses.last.getText.startsWith("TOK"))
(clauses, None)
else
(clauses.dropRight(1), Some(clauses.last))

val (Some(tableNameParts) ::
splitSampleClause ::
bucketSampleClause :: Nil) = getClauses(Seq("TOK_TABNAME", "TOK_TABLESPLITSAMPLE", "TOK_TABLEBUCKETSAMPLE"), nonAliasClauses)

val tableName = tableNameParts.getChildren.map { case Token(part, Nil) => cleanIdentifier(part) }.mkString(".")
val alias = aliasClause.map { case Token(a, Nil) => cleanIdentifier(a) }
val relation = UnresolvedRelation(tableName, alias)

// Apply sampling if requested.
(bucketSampleClause orElse splitSampleClause).map {
case Token("TOK_TABLESPLITSAMPLE",
Token("TOK_ROWCOUNT", Nil) ::
Token(count, Nil) :: Nil) =>
StopAfter(Literal(count.toInt), relation)
case Token("TOK_TABLEBUCKETSAMPLE",
Token(numerator, Nil) ::
Token(denominator, Nil) :: Nil) =>
Sample(numerator.toDouble / denominator.toDouble, relation)
}.getOrElse(relation)

case Token("TOK_UNIQUEJOIN", joinArgs) =>
val tableOrdinals =
@@ -492,14 +533,6 @@
       // named output expressions where some aggregate expression has been applied (i.e. First).
       ??? /// Aggregate(groups, Star(None, First(_)) :: Nil, joinedResult)
 
-    /* Table with Alias */
-    case Token("TOK_TABREF",
-           Token("TOK_TABNAME",
-             tableNameParts) ::
-           Token(alias, Nil) :: Nil) =>
-      val tableName = tableNameParts.map { case Token(part, Nil) => part }.mkString(".")
-      UnresolvedRelation(tableName, Some(alias))
-
     case Token(allJoinTokens(joinToken),
            relation1 ::
            relation2 :: other) =>
@@ -510,6 +543,7 @@
         case "TOK_LEFTOUTERJOIN" => LeftOuter
         case "TOK_FULLOUTERJOIN" => FullOuter
       }
+      assert(other.size <= 1, "Unhandled join clauses.")
       Join(nodeToRelation(relation1),
         nodeToRelation(relation2),
         joinType,
@@ -529,13 +563,14 @@
         throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ")
   }
 
+  val destinationToken = "TOK_DESTINATION|TOK_INSERT_INTO".r
   protected def nodeToDest(node: Node, query: LogicalPlan): LogicalPlan = node match {
-    case Token("TOK_DESTINATION",
+    case Token(destinationToken(),
            Token("TOK_DIR",
              Token("TOK_TMP_FILE", Nil) :: Nil) :: Nil) =>
       query
 
-    case Token("TOK_DESTINATION",
+    case Token(destinationToken(),
           Token("TOK_TAB",
             tableArgs) :: Nil) =>
       val Some(nameClause) :: partitionClause :: Nil =
@@ -655,6 +690,8 @@
       /* UDFs - Must be last otherwise will preempt built in functions */
       case Token("TOK_FUNCTION", Token(name, Nil) :: args) =>
         UnresolvedFunction(name, args.map(nodeToExpr))
+      case Token("TOK_FUNCTIONSTAR", Token(name, Nil) :: args) =>
+        UnresolvedFunction(name, Star(None) :: Nil)
 
       /* Literals */
       case Token("TOK_NULL", Nil) => Literal(null, IntegerType) // TODO: What type is null?
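
Since destinationToken is a Regex, the nodeToDest patterns above rely on Scala's regex extractor: Token(destinationToken(), ...) matches a node whose text is either TOK_DESTINATION or TOK_INSERT_INTO. A standalone sketch (isDestination is a hypothetical helper):

val destinationToken = "TOK_DESTINATION|TOK_INSERT_INTO".r

// A Regex used as an extractor succeeds only when the whole string matches,
// letting one case handle both destination token types.
def isDestination(tokenText: String): Boolean = tokenText match {
  case destinationToken() => true
  case _ => false
}

// isDestination("TOK_INSERT_INTO") == true; isDestination("TOK_SELECT") == false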
