Merge pull request #8 from marmbrus/testImprovment
A bunch of small improvements to testing and golden answer generation.
rxin committed Jan 12, 2014
2 parents 0d2388b + 86355a6 commit 8a8b521
Showing 15 changed files with 299 additions and 79 deletions.
58 changes: 32 additions & 26 deletions Makefile
@@ -1,54 +1,60 @@
 all: a b c d e f g h i j k l m n o p q r s t u v w x y z
 
+buildWhiteList:
+	sbt -Dshark.hive.alltests "test-only catalyst.execution.HiveCompatibility"
+
+findBroken:
+	sbt -Dshark.hive.alltests -Dshark.hive.failFast "test-only catalyst.execution.HiveCompatibility"
+
 a:
-	sbt -Dshark.hive.whitelist=a.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=a.* "test-only catalyst.execution.HiveCompatibility"
 b:
-	sbt -Dshark.hive.whitelist=b.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=b.* "test-only catalyst.execution.HiveCompatibility"
 c:
-	sbt -Dshark.hive.whitelist=c.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=c.* "test-only catalyst.execution.HiveCompatibility"
 d:
-	sbt -Dshark.hive.whitelist=d.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=d.* "test-only catalyst.execution.HiveCompatibility"
 e:
-	sbt -Dshark.hive.whitelist=e.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=e.* "test-only catalyst.execution.HiveCompatibility"
 f:
-	sbt -Dshark.hive.whitelist=f.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=f.* "test-only catalyst.execution.HiveCompatibility"
 g:
-	sbt -Dshark.hive.whitelist=g.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=g.* "test-only catalyst.execution.HiveCompatibility"
 h:
-	sbt -Dshark.hive.whitelist=h.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=h.* "test-only catalyst.execution.HiveCompatibility"
 i:
-	sbt -Dshark.hive.whitelist=i.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=i.* "test-only catalyst.execution.HiveCompatibility"
 j:
-	sbt -Dshark.hive.whitelist=j.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=j.* "test-only catalyst.execution.HiveCompatibility"
 k:
-	sbt -Dshark.hive.whitelist=k.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=k.* "test-only catalyst.execution.HiveCompatibility"
 l:
-	sbt -Dshark.hive.whitelist=l.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=l.* "test-only catalyst.execution.HiveCompatibility"
 m:
-	sbt -Dshark.hive.whitelist=m.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=m.* "test-only catalyst.execution.HiveCompatibility"
 n:
-	sbt -Dshark.hive.whitelist=n.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=n.* "test-only catalyst.execution.HiveCompatibility"
 o:
-	sbt -Dshark.hive.whitelist=o.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=o.* "test-only catalyst.execution.HiveCompatibility"
 p:
-	sbt -Dshark.hive.whitelist=p.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=p.* "test-only catalyst.execution.HiveCompatibility"
 q:
-	sbt -Dshark.hive.whitelist=q.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=q.* "test-only catalyst.execution.HiveCompatibility"
 r:
-	sbt -Dshark.hive.whitelist=r.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=r.* "test-only catalyst.execution.HiveCompatibility"
 s:
-	sbt -Dshark.hive.whitelist=s.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=s.* "test-only catalyst.execution.HiveCompatibility"
 t:
-	sbt -Dshark.hive.whitelist=t.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=t.* "test-only catalyst.execution.HiveCompatibility"
 u:
-	sbt -Dshark.hive.whitelist=u.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=u.* "test-only catalyst.execution.HiveCompatibility"
 v:
-	sbt -Dshark.hive.whitelist=v.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=v.* "test-only catalyst.execution.HiveCompatibility"
 w:
-	sbt -Dshark.hive.whitelist=w.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=w.* "test-only catalyst.execution.HiveCompatibility"
 x:
-	sbt -Dshark.hive.whitelist=x.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=x.* "test-only catalyst.execution.HiveCompatibility"
 y:
-	sbt -Dshark.hive.whitelist=y.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=y.* "test-only catalyst.execution.HiveCompatibility"
 z:
-	sbt -Dshark.hive.whitelist=z.* "test-only catalyst.execution.HiveCompatability"
+	sbt -Dshark.hive.whitelist=z.* "test-only catalyst.execution.HiveCompatibility"
2 changes: 1 addition & 1 deletion build.sbt
@@ -14,7 +14,7 @@ resolvers += "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/
 
 libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.0-incubating-SNAPSHOT"
 
-libraryDependencies += "catalyst" % "hive-golden" % "2" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden2.jar"
+libraryDependencies += "catalyst" % "hive-golden" % "3" % "test" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden3.jar"
 
 // Hive 0.10.0 relies on a weird version of jdo that is not published anywhere... Remove when we upgrade to 0.11.0
 libraryDependencies += "javax.jdo" % "jdo2-api" % "2.3-ec" from "http://www.datanucleus.org/downloads/maven2/javax/jdo/jdo2-api/2.3-ec/jdo2-api-2.3-ec.jar"
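
The golden-answer artifact is bumped from 2 to 3 and, notably, is now scoped to the test configuration. A minimal sketch of what the % "test" modifier means in sbt (the artifact below is hypothetical, for illustration only):

// A dependency tagged % "test" is resolved onto the test classpath only, so
// main sources cannot compile against it; golden answers stay a test-only concern.
libraryDependencies += "org.example" % "golden-fixtures" % "1.0" % "test"
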
6 changes: 4 additions & 2 deletions src/main/scala/catalyst/execution/MetastoreCatalog.scala
@@ -5,6 +5,7 @@ import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.metastore.api.{FieldSchema, Partition, Table, StorageDescriptor, SerDeInfo}
 import org.apache.hadoop.hive.metastore.HiveMetaStoreClient
 import org.apache.hadoop.hive.ql.plan.TableDesc
+import org.apache.hadoop.hive.ql.session.SessionState
 import org.apache.hadoop.hive.serde2.AbstractDeserializer
 import org.apache.hadoop.mapred.InputFormat
 
@@ -21,7 +22,7 @@ class HiveMetastoreCatalog(hiveConf: HiveConf) extends Catalog {
 
   def lookupRelation(name: String, alias: Option[String]): BaseRelation = {
     val (databaseName, tableName) = name.split("\\.") match {
-      case Array(tableOnly) => ("default", tableOnly)
+      case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly)
       case Array(db, table) => (db, table)
     }
     val table = client.getTable(databaseName, tableName)
@@ -46,7 +47,7 @@ class HiveMetastoreCatalog(hiveConf: HiveConf) extends Catalog {
   def apply(plan: LogicalPlan): LogicalPlan = plan transform {
     case InsertIntoCreatedTable(name, child) =>
       val (databaseName, tableName) = name.split("\\.") match {
-        case Array(tableOnly) => ("default", tableOnly)
+        case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly)
         case Array(db, table) => (db, table)
       }
 
@@ -81,6 +82,7 @@ object HiveMetatoreTypes {
   def toDataType(metastoreType: String): DataType =
     metastoreType match {
       case "string" => StringType
+      case "float" => FloatType
       case "int" => IntegerType
       case "double" => DoubleType
      case "bigint" => LongType
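
A self-contained sketch of the new name-resolution behavior (resolveName is a hypothetical helper, not part of the patch; assumes an active Hive SessionState):

import org.apache.hadoop.hive.ql.session.SessionState

// Unqualified names now resolve against the session's current database
// (whatever the last USE statement selected) rather than a hardcoded "default".
def resolveName(name: String): (String, String) = name.split("\\.") match {
  case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly)
  case Array(db, table) => (db, table)
}

// After "USE mydb":  resolveName("src")       == ("mydb", "src")
// Qualified names are untouched: resolveName("other.src") == ("other", "src")
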
6 changes: 3 additions & 3 deletions src/main/scala/catalyst/execution/SharkInstance.scala
@@ -36,7 +36,7 @@ abstract class SharkInstance extends Logging {
   def metastorePath: String
 
   /** The SharkContext */
-  lazy val sc = createContext()
+  lazy val sc: SharkContext = createContext()
 
   protected def createContext(): SharkContext = {
     SharkEnv.initWithSharkContext("catalyst.execution", master)
@@ -45,8 +45,8 @@ abstract class SharkInstance extends Logging {
   /** Sets up the system initially or after a RESET command */
   protected def configure() {
     // TODO: refactor this so we can work with other databases.
-    runSqlHive("set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + metastorePath +
-      ";create=true")
+    runSqlHive(
+      s"set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=$metastorePath;create=true")
     runSqlHive("set hive.metastore.warehouse.dir=" + warehousePath)
   }

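The switch to string interpolation is behavior-preserving; with an illustrative value it expands as:

val metastorePath = "/tmp/catalyst-test-metastore"  // illustrative value, not from the patch
s"set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=$metastorePath;create=true"
// == "set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=/tmp/catalyst-test-metastore;create=true"
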
10 changes: 9 additions & 1 deletion src/main/scala/catalyst/execution/TestShark.scala
@@ -85,7 +85,7 @@ object TestShark extends SharkInstance {
    * hive test cases assume the system is set up.
    */
   private def rewritePaths(cmd: String): String =
-    if (cmd.toUpperCase startsWith "LOAD")
+    if (cmd.toUpperCase contains "LOAD DATA")
       cmd.replaceAll("\\.\\.", hiveDevHome.getCanonicalPath)
     else
       cmd
@@ -234,6 +234,8 @@ object TestShark extends SharkInstance {
     // For some reason, RESET does not reset the following variables...
     runSqlHive("set datanucleus.cache.collections=true")
     runSqlHive("set datanucleus.cache.collections.lazy=true")
+    // Lots of tests fail if we do not change the partition whitelist from the default.
+    runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*")
 
     loadedTables.clear()
     catalog.client.getAllTables("default").foreach { t =>
@@ -261,6 +263,12 @@
       configure()
 
       runSqlHive("USE default")
+
+      // Just loading src makes a lot of tests pass. This is because some tests do something like
+      // drop an index on src at the beginning. Since we just pass DDL to hive this bypasses our
+      // Analyzer and thus the test table auto-loading mechanism.
+      // Remove after we handle more DDL operations natively.
+      loadTestTable("src")
     } catch {
       case e: Exception =>
         logger.error(s"FATAL ERROR: Failed to reset TestDB state. $e")
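
A sketch of what the widened check buys (the commands below are illustrative): startsWith "LOAD" missed commands where LOAD DATA appears mid-string.

// Both commands now get ".." rewritten to the Hive dev home; the second was
// previously skipped because it does not start with "LOAD".
rewritePaths("LOAD DATA LOCAL INPATH '../data/files/kv1.txt' INTO TABLE src")
rewritePaths("EXPLAIN LOAD DATA LOCAL INPATH '../data/files/kv1.txt' INTO TABLE src")
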
91 changes: 64 additions & 27 deletions src/main/scala/catalyst/frontend/Hive.scala
@@ -82,6 +82,7 @@ object HiveQl {
     "TOK_ALTERINDEX_REBUILD",
     "TOK_ALTERTABLE_ADDCOLS",
     "TOK_ALTERTABLE_ADDPARTS",
+    "TOK_ALTERTABLE_ALTERPARTS",
     "TOK_ALTERTABLE_ARCHIVE",
     "TOK_ALTERTABLE_CLUSTER_SORT",
     "TOK_ALTERTABLE_DROPPARTS",
@@ -90,6 +91,7 @@
     "TOK_ALTERTABLE_RENAME",
     "TOK_ALTERTABLE_RENAMECOL",
     "TOK_ALTERTABLE_REPLACECOLS",
+    "TOK_ALTERTABLE_SKEWED",
     "TOK_ALTERTABLE_TOUCH",
     "TOK_ALTERTABLE_UNARCHIVE",
     "TOK_ANALYZE",
@@ -103,6 +105,7 @@
 
     // TODO(marmbrus): Figure out how view are expanded by hive, as we might need to handle this.
     "TOK_ALTERVIEW_ADDPARTS",
+    "TOK_ALTERVIEW_AS",
     "TOK_ALTERVIEW_DROPPARTS",
     "TOK_ALTERVIEW_PROPERTIES",
     "TOK_ALTERVIEW_RENAME",
@@ -254,7 +257,7 @@
   /** Extractor for matching Hive's AST Tokens. */
   object Token {
     /** @return matches of the form (tokenName, children). */
-    def unapply(t: Any) = t match {
+    def unapply(t: Any): Option[(String, Seq[ASTNode])] = t match {
       case t: ASTNode =>
         Some((t.getText, Option(t.getChildren).map(_.toList).getOrElse(Nil).asInstanceOf[Seq[ASTNode]]))
       case _ => None
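
For readers unfamiliar with Scala extractors, a minimal sketch of how Token is used throughout this file (describe is a hypothetical helper, not in the patch):

// Any ASTNode can be matched by token name with its children bound in one
// pattern; the new explicit return type documents the extractor's shape.
def describe(node: Any): String = node match {
  case Token("TOK_QUERY", children) => s"query with ${children.size} clause(s)"
  case Token(name, Nil)             => s"leaf token $name"
  case _                            => "not an ASTNode"
}
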
@@ -339,8 +342,8 @@
   }
 
   protected def nodeToPlan(node: Node): LogicalPlan = node match {
-    // Just fake explain on create function...
-    case Token("TOK_EXPLAIN", Token("TOK_CREATEFUNCTION", _) :: Nil) => NoRelation
+    // Just fake explain for any of the native commands.
+    case Token("TOK_EXPLAIN", Token(explainType, _) :: Nil) if nativeCommands contains explainType => NoRelation
     case Token("TOK_EXPLAIN", explainArgs) =>
       // Ignore FORMATTED if present.
       val Some(query) :: _ :: _ :: Nil = getClauses(Seq("TOK_QUERY", "FORMATTED", "EXTENDED"), explainArgs)
Expand All @@ -362,24 +365,29 @@ object HiveQl {

// Return one query for each insert clause.
val queries = insertClauses.map { case Token("TOK_INSERT", singleInsert) =>
val (Some(destClause) ::
Some(selectClause) ::
val (
intoClause ::
destClause ::
selectClause ::
selectDistinctClause ::
whereClause ::
groupByClause ::
orderByClause ::
sortByClause ::
limitClause :: Nil) = getClauses(Seq("TOK_DESTINATION", "TOK_SELECT", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert)
limitClause :: Nil) = getClauses(Seq("TOK_INSERT_INTO", "TOK_DESTINATION", "TOK_SELECT", "TOK_SELECTDI", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert)

val relations = nodeToRelation(fromClause)
val withWhere = whereClause.map { whereNode =>
val Seq(whereExpr) = whereNode.getChildren().toSeq
Filter(nodeToExpr(whereExpr), relations)
}.getOrElse(relations)

val select =
(selectClause orElse selectDistinctClause).getOrElse(sys.error("No select clause."))

// Script transformations are expressed as a select clause with a single expression of type
// TOK_TRANSFORM
val transformation = selectClause.getChildren.head match {
val transformation = select.getChildren.head match {
case Token("TOK_SELEXPR",
Token("TOK_TRANSFORM",
Token("TOK_EXPLIST", inputExprs) ::
Expand All @@ -400,28 +408,39 @@ object HiveQl {
// a script transformation.
val withProject = transformation.getOrElse {
// Not a transformation so must be either project or aggregation.
val selectExpressions = nameExpressions(selectClause.getChildren.flatMap(selExprNodeToExpr))
val selectExpressions = nameExpressions(select.getChildren.flatMap(selExprNodeToExpr))

groupByClause match {
case Some(groupBy) => Aggregate(groupBy.getChildren.map(nodeToExpr), selectExpressions, withWhere)
case None => Project(selectExpressions, withWhere)
}
}

val withDistinct =
if(selectDistinctClause.isDefined)
Distinct(withProject)
else
withProject

require(!(orderByClause.isDefined && sortByClause.isDefined), "Can't have both a sort by and order by.")
// Right now we treat sorting and ordering as identical.
val withSort =
(orderByClause orElse sortByClause)
.map(_.getChildren.map(nodeToSortOrder))
.map(Sort(_, withProject))
.getOrElse(withProject)
.map(Sort(_, withDistinct))
.getOrElse(withDistinct)
val withLimit =
limitClause.map(l => nodeToExpr(l.getChildren.head))
.map(StopAfter(_, withSort))
.getOrElse(withSort)

// There are two tokens for specifying where to sent the result that seem to be used almost
// interchangeably.
val resultDestination =
(intoClause orElse destClause).getOrElse(sys.error("No destination found."))

nodeToDest(
destClause,
resultDestination,
withLimit)
}

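A rough sketch of the contract getClauses appears to follow, inferred from its call sites here (this is not the actual implementation):

// For each requested token name, yield Some(matching child) or None,
// preserving request order so call sites can destructure positionally with ::.
def getClausesSketch(names: Seq[String], children: Seq[ASTNode]): List[Option[ASTNode]] =
  names.map(name => children.find(_.getText == name)).toList
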
Expand All @@ -440,12 +459,34 @@ object HiveQl {
query :: Token(alias, Nil) :: Nil) =>
Subquery(alias, nodeToPlan(query))

/* Table, No Alias */
case Token("TOK_TABREF",
Token("TOK_TABNAME",
tableNameParts) :: Nil) =>
val tableName = tableNameParts.map { case Token(part, Nil) => part }.mkString(".")
UnresolvedRelation(tableName, None)
/* All relations, possibly with aliases or sampling clauses. */
case Token("TOK_TABREF", clauses) =>
// If the last clause is not a token then it's the alias of the table.
val (nonAliasClauses, aliasClause) =
if(clauses.last.getText.startsWith("TOK"))
(clauses, None)
else
(clauses.dropRight(1), Some(clauses.last))

val (Some(tableNameParts) ::
splitSampleClause ::
bucketSampleClause :: Nil) = getClauses(Seq("TOK_TABNAME", "TOK_TABLESPLITSAMPLE", "TOK_TABLEBUCKETSAMPLE"), nonAliasClauses)

val tableName = tableNameParts.getChildren.map { case Token(part, Nil) => cleanIdentifier(part) }.mkString(".")
val alias = aliasClause.map { case Token(a, Nil) => cleanIdentifier(a) }
val relation = UnresolvedRelation(tableName, alias)

// Apply sampling if requested.
(bucketSampleClause orElse splitSampleClause).map {
case Token("TOK_TABLESPLITSAMPLE",
Token("TOK_ROWCOUNT", Nil) ::
Token(count, Nil) :: Nil) =>
StopAfter(Literal(count.toInt), relation)
case Token("TOK_TABLEBUCKETSAMPLE",
Token(numerator, Nil) ::
Token(denominator, Nil) :: Nil) =>
Sample(numerator.toDouble / denominator.toDouble, relation)
}.getOrElse(relation)

case Token("TOK_UNIQUEJOIN", joinArgs) =>
val tableOrdinals =
@@ -492,14 +533,6 @@
       // named output expressions where some aggregate expression has been applied (i.e. First).
       ??? /// Aggregate(groups, Star(None, First(_)) :: Nil, joinedResult)
 
-    /* Table with Alias */
-    case Token("TOK_TABREF",
-           Token("TOK_TABNAME",
-             tableNameParts) ::
-           Token(alias, Nil) :: Nil) =>
-      val tableName = tableNameParts.map { case Token(part, Nil) => part }.mkString(".")
-      UnresolvedRelation(tableName, Some(alias))
-
     case Token(allJoinTokens(joinToken),
            relation1 ::
            relation2 :: other) =>
@@ -510,6 +543,7 @@
         case "TOK_LEFTOUTERJOIN" => LeftOuter
         case "TOK_FULLOUTERJOIN" => FullOuter
       }
+      assert(other.size <= 1, "Unhandled join clauses.")
       Join(nodeToRelation(relation1),
         nodeToRelation(relation2),
         joinType,
@@ -529,13 +563,14 @@
         throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ")
   }
 
+  val destinationToken = "TOK_DESTINATION|TOK_INSERT_INTO".r
   protected def nodeToDest(node: Node, query: LogicalPlan): LogicalPlan = node match {
-    case Token("TOK_DESTINATION",
+    case Token(destinationToken(),
            Token("TOK_DIR",
              Token("TOK_TMP_FILE", Nil) :: Nil) :: Nil) =>
       query
 
-    case Token("TOK_DESTINATION",
+    case Token(destinationToken(),
           Token("TOK_TAB",
             tableArgs) :: Nil) =>
       val Some(nameClause) :: partitionClause :: Nil =
@@ -655,6 +690,8 @@
       /* UDFs - Must be last otherwise will preempt built in functions */
       case Token("TOK_FUNCTION", Token(name, Nil) :: args) =>
         UnresolvedFunction(name, args.map(nodeToExpr))
+      case Token("TOK_FUNCTIONSTAR", Token(name, Nil) :: args) =>
+        UnresolvedFunction(name, Star(None) :: Nil)
 
       /* Literals */
       case Token("TOK_NULL", Nil) => Literal(null, IntegerType) // TODO: What type is null?
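
Since destinationToken is a Regex, the nodeToDest patterns above rely on Scala's regex extractor: Token(destinationToken(), ...) matches a node whose text is either TOK_DESTINATION or TOK_INSERT_INTO. A standalone sketch (isDestination is a hypothetical helper):

val destinationToken = "TOK_DESTINATION|TOK_INSERT_INTO".r

// A Regex used as an extractor succeeds only when the whole string matches,
// letting one case handle both destination token types.
def isDestination(tokenText: String): Boolean = tokenText match {
  case destinationToken() => true
  case _ => false
}

// isDestination("TOK_INSERT_INTO") == true; isDestination("TOK_SELECT") == false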
