From f54c94c36af64ee9c01190dbcf4d5a049c27db4a Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 13:17:09 -0800 Subject: [PATCH 01/27] make golden answers file a test dependency --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index afae82f22288c..401fc04fd3cdd 100644 --- a/build.sbt +++ b/build.sbt @@ -14,7 +14,7 @@ resolvers += "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/ libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.0-incubating-SNAPSHOT" -libraryDependencies += "catalyst" % "hive-golden" % "2" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden2.jar" +libraryDependencies += "catalyst" % "hive-golden" % "2" % "test" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden2.jar" // Hive 0.10.0 relies on a weird version of jdo that is not published anywhere... Remove when we upgrade to 0.11.0 libraryDependencies += "javax.jdo" % "jdo2-api" % "2.3-ec" from "http://www.datanucleus.org/downloads/maven2/javax/jdo/jdo2-api/2.3-ec/jdo2-api-2.3-ec.jar" From eafaeed4d154bd3fec44fb94adf7ccd3eeca7b11 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 13:17:31 -0800 Subject: [PATCH 02/27] add type documentation --- src/main/scala/catalyst/execution/SharkInstance.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/catalyst/execution/SharkInstance.scala b/src/main/scala/catalyst/execution/SharkInstance.scala index dd1ec84d1618f..130578e87fbe2 100644 --- a/src/main/scala/catalyst/execution/SharkInstance.scala +++ b/src/main/scala/catalyst/execution/SharkInstance.scala @@ -36,7 +36,7 @@ abstract class SharkInstance extends Logging { def metastorePath: String /** The SharkContext */ - lazy val sc = createContext() + lazy val sc: SharkContext = createContext() protected def createContext(): SharkContext = { SharkEnv.initWithSharkContext("catalyst.execution", master) 
From 6f64cee0bd69dac4fbd63c987f47bde1931ecb32 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 13:17:46 -0800 Subject: [PATCH 03/27] don't line wrap string literal --- src/main/scala/catalyst/execution/SharkInstance.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/catalyst/execution/SharkInstance.scala b/src/main/scala/catalyst/execution/SharkInstance.scala index 130578e87fbe2..c7d4eba52751f 100644 --- a/src/main/scala/catalyst/execution/SharkInstance.scala +++ b/src/main/scala/catalyst/execution/SharkInstance.scala @@ -45,8 +45,8 @@ abstract class SharkInstance extends Logging { /** Sets up the system initially or after a RESET command */ protected def configure() { // TODO: refactor this so we can work with other databases. - runSqlHive("set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=" + metastorePath + - ";create=true") + runSqlHive( + s"set javax.jdo.option.ConnectionURL=jdbc:derby:;databaseName=$metastorePath;create=true") runSqlHive("set hive.metastore.warehouse.dir=" + warehousePath) } From 9b02b44a38ebf55ff66749c4fc262be7c8c5c655 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 14:33:49 -0800 Subject: [PATCH 04/27] Fix spelling error. Add failFast mode. 
--- Makefile | 55 ++++++++++--------- .../catalyst/execution/BigDataBenchmark.scala | 2 +- ...ionTest.scala => HiveComparisonTest.scala} | 34 +++++++++--- ...tability.scala => HiveCompatibility.scala} | 2 +- .../execution/HiveQueryFileTest.scala | 2 +- .../catalyst/execution/HiveQueryTests.scala | 2 +- .../execution/HiveResolutionSuite.scala | 2 +- .../execution/HiveTypeCoersionSuite.scala | 2 +- 8 files changed, 61 insertions(+), 40 deletions(-) rename src/test/scala/catalyst/execution/{HiveComparisionTest.scala => HiveComparisonTest.scala} (84%) rename src/test/scala/catalyst/execution/{HiveCompatability.scala => HiveCompatibility.scala} (99%) diff --git a/Makefile b/Makefile index e763516432300..bbcf31f353131 100644 --- a/Makefile +++ b/Makefile @@ -1,54 +1,57 @@ all: a b c d e f g h i j k l m n o p q r s t u v w x y" z +findBroken: + sbt -Dshark.hive.alltests -Dshark.hive.failFast "test-only catalyst.execution.HiveCompatibility" + a: - sbt -Dshark.hive.whitelist=a.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=a.* "test-only catalyst.execution.HiveCompatibility" b: - sbt -Dshark.hive.whitelist=b.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=b.* "test-only catalyst.execution.HiveCompatibility" c: - sbt -Dshark.hive.whitelist=c.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=c.* "test-only catalyst.execution.HiveCompatibility" d: - sbt -Dshark.hive.whitelist=d.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=d.* "test-only catalyst.execution.HiveCompatibility" e: - sbt -Dshark.hive.whitelist=e.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=e.* "test-only catalyst.execution.HiveCompatibility" f: - sbt -Dshark.hive.whitelist=f.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=f.* "test-only catalyst.execution.HiveCompatibility" g: - sbt -Dshark.hive.whitelist=g.* "test-only 
catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=g.* "test-only catalyst.execution.HiveCompatibility" h: - sbt -Dshark.hive.whitelist=h.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=h.* "test-only catalyst.execution.HiveCompatibility" i: - sbt -Dshark.hive.whitelist=i.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=i.* "test-only catalyst.execution.HiveCompatibility" j: - sbt -Dshark.hive.whitelist=j.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=j.* "test-only catalyst.execution.HiveCompatibility" k: - sbt -Dshark.hive.whitelist=k.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=k.* "test-only catalyst.execution.HiveCompatibility" l: - sbt -Dshark.hive.whitelist=l.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=l.* "test-only catalyst.execution.HiveCompatibility" m: - sbt -Dshark.hive.whitelist=m.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=m.* "test-only catalyst.execution.HiveCompatibility" n: - sbt -Dshark.hive.whitelist=n.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=n.* "test-only catalyst.execution.HiveCompatibility" o: - sbt -Dshark.hive.whitelist=o.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=o.* "test-only catalyst.execution.HiveCompatibility" p: - sbt -Dshark.hive.whitelist=p.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=p.* "test-only catalyst.execution.HiveCompatibility" q: - sbt -Dshark.hive.whitelist=q.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=q.* "test-only catalyst.execution.HiveCompatibility" r: - sbt -Dshark.hive.whitelist=r.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=r.* "test-only catalyst.execution.HiveCompatibility" s: - sbt -Dshark.hive.whitelist=s.* 
"test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=s.* "test-only catalyst.execution.HiveCompatibility" t: - sbt -Dshark.hive.whitelist=t.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=t.* "test-only catalyst.execution.HiveCompatibility" u: - sbt -Dshark.hive.whitelist=u.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=u.* "test-only catalyst.execution.HiveCompatibility" v: - sbt -Dshark.hive.whitelist=v.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=v.* "test-only catalyst.execution.HiveCompatibility" w: - sbt -Dshark.hive.whitelist=w.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=w.* "test-only catalyst.execution.HiveCompatibility" x: - sbt -Dshark.hive.whitelist=x.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=x.* "test-only catalyst.execution.HiveCompatibility" y: - sbt -Dshark.hive.whitelist=y.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=y.* "test-only catalyst.execution.HiveCompatibility" z: - sbt -Dshark.hive.whitelist=z.* "test-only catalyst.execution.HiveCompatability" + sbt -Dshark.hive.whitelist=z.* "test-only catalyst.execution.HiveCompatibility" diff --git a/src/test/scala/catalyst/execution/BigDataBenchmark.scala b/src/test/scala/catalyst/execution/BigDataBenchmark.scala index 52146d16f0cfe..1ff9cb6491746 100644 --- a/src/test/scala/catalyst/execution/BigDataBenchmark.scala +++ b/src/test/scala/catalyst/execution/BigDataBenchmark.scala @@ -7,7 +7,7 @@ import java.io.File * A set of test cases based on the big-data-benchmark. 
* https://amplab.cs.berkeley.edu/benchmark/ */ -class BigDataBenchmarkTests extends HiveComaparisionTest { +class BigDataBenchmarkTests extends HiveComparisonTest { import TestShark._ val testDataDirectory = new File("target/big-data-benchmark-testdata") diff --git a/src/test/scala/catalyst/execution/HiveComparisionTest.scala b/src/test/scala/catalyst/execution/HiveComparisonTest.scala similarity index 84% rename from src/test/scala/catalyst/execution/HiveComparisionTest.scala rename to src/test/scala/catalyst/execution/HiveComparisonTest.scala index bf73e8a153817..996c643125a26 100644 --- a/src/test/scala/catalyst/execution/HiveComparisionTest.scala +++ b/src/test/scala/catalyst/execution/HiveComparisonTest.scala @@ -11,22 +11,35 @@ import util._ * Allows the creations of tests that execute the same query against both hive * and catalyst, comparing the results. * - * The "golden" results from Hive are cached in [[answerCache]] to speed up testing. + * The "golden" results from Hive are cached in an retrieved both from the classpath and + * [[answerCache]] to speed up testing. */ -// TODO: correct the mispelled name. -abstract class HiveComaparisionTest extends FunSuite with BeforeAndAfterAll with GivenWhenThen with Logging { +abstract class HiveComparisonTest extends FunSuite with BeforeAndAfterAll with GivenWhenThen with Logging { protected val targetDir = new File("target") + + /** The local directory with cached golden answer will be stored */ protected val answerCache = new File(targetDir, "comparison-test-cache") if (!answerCache.exists) answerCache.mkdir() + /** The [[ClassLoader]] that contains test dependencies. Used to look for golden answers. */ + protected val testClassLoader = this.getClass.getClassLoader + + /** A file where all the test cases that pass are written. Can be used to update the whiteList. 
*/ val passedFile = new File(targetDir, s"$suiteName.passed") - val passedList = new PrintWriter(passedFile) + protected val passedList = new PrintWriter(passedFile) override def afterAll() { passedList.close() } + /** + * When `-Dshark.hive.failFast` is set the first test to fail will cause all subsequent tests to + * also fail. + */ + val failFast = System.getProperty("shark.hive.failFast") != null + private var testFailed = false + protected val cacheDigest = java.security.MessageDigest.getInstance("MD5") protected def getMd5(str: String): String = { val digest = java.security.MessageDigest.getInstance("MD5") @@ -36,7 +49,8 @@ abstract class HiveComaparisionTest extends FunSuite with BeforeAndAfterAll with protected def prepareAnswer(sharkQuery: TestShark.type#SharkSqlQuery, answer: Seq[String]): Seq[String] = { val orderedAnswer = sharkQuery.parsed match { - case _: Command => answer.filterNot(nonDeterministicLine) // Clean out nondeterministic time schema info. + // Clean out non-deterministic time schema info. + case _: Command => answer.filterNot(nonDeterministicLine) case _ => val isOrdered = sharkQuery.executedPlan.collect { case s: Sort => s}.nonEmpty // If the query results aren't sorted, then sort them to ensure deterministic answers. @@ -52,7 +66,7 @@ abstract class HiveComaparisionTest extends FunSuite with BeforeAndAfterAll with line.replaceAll("\"lastUpdateTime\":\\d+", "") /** - * Removes non-deterministic paths from [[str]] so cached answers will still pass. + * Removes non-deterministic paths from str` so cached answers will still pass. 
*/ protected def cleanPaths(str: String): String = { str.replaceAll("file:\\/.*\\/", "") @@ -61,6 +75,8 @@ abstract class HiveComaparisionTest extends FunSuite with BeforeAndAfterAll with val installHooksCommand = "(?i)SET.*hooks".r def createQueryTest(testCaseName: String, sql: String) = { test(testCaseName) { + if(failFast && testFailed) sys.error("Failing fast due to previous failure") + testFailed = true logger.error( s""" |============================= @@ -82,11 +98,11 @@ abstract class HiveComaparisionTest extends FunSuite with BeforeAndAfterAll with } val hiveCachedResults = hiveCacheFiles.flatMap { cachedAnswerFile => - logger.debug(s"Looking for cached answer file $cachedAnswerFile.") + logger.warn(s"Looking for cached answer file $cachedAnswerFile.") if (cachedAnswerFile.exists) { Some(fileToString(cachedAnswerFile)) } else if (getClass.getClassLoader.getResourceAsStream(cachedAnswerFile.toString) != null) { - Some(resourceToString(cachedAnswerFile.toString)) + Some(resourceToString(cachedAnswerFile.toString, classLoader = testClassLoader)) } else { logger.debug(s"File $cachedAnswerFile not found") None @@ -123,6 +139,8 @@ abstract class HiveComaparisionTest extends FunSuite with BeforeAndAfterAll with computedResults } + testFailed = false + // Run w/ catalyst val catalystResults = queryList.zip(hiveResults).map { case (queryString, hive) => info(queryString) diff --git a/src/test/scala/catalyst/execution/HiveCompatability.scala b/src/test/scala/catalyst/execution/HiveCompatibility.scala similarity index 99% rename from src/test/scala/catalyst/execution/HiveCompatability.scala rename to src/test/scala/catalyst/execution/HiveCompatibility.scala index 53eecffce4a52..32daa17156b92 100644 --- a/src/test/scala/catalyst/execution/HiveCompatability.scala +++ b/src/test/scala/catalyst/execution/HiveCompatibility.scala @@ -8,7 +8,7 @@ import util._ /** * Runs the test cases that are included in the hive distribution. 
*/ -class HiveCompatability extends HiveQueryFileTest { +class HiveCompatibility extends HiveQueryFileTest { // TODO: bundle in jar files... get from classpath lazy val hiveQueryDir = new File(TestShark.hiveDevHome, "ql/src/test/queries/clientpositive") def testCases = hiveQueryDir.listFiles.map(f => f.getName.stripSuffix(".q") -> f) diff --git a/src/test/scala/catalyst/execution/HiveQueryFileTest.scala b/src/test/scala/catalyst/execution/HiveQueryFileTest.scala index afda9f7b5270b..4edcacc1131c3 100644 --- a/src/test/scala/catalyst/execution/HiveQueryFileTest.scala +++ b/src/test/scala/catalyst/execution/HiveQueryFileTest.scala @@ -11,7 +11,7 @@ import util._ * TestSuites that derive from this class must provide a map of testCaseName -> testCaseFiles that should be included. * Additionally, there is support for whitelisting and blacklisting tests as development progresses. */ -abstract class HiveQueryFileTest extends HiveComaparisionTest { +abstract class HiveQueryFileTest extends HiveComparisonTest { /** A list of tests deemed out of scope and thus completely disregarded */ def blackList: Seq[String] = Nil diff --git a/src/test/scala/catalyst/execution/HiveQueryTests.scala b/src/test/scala/catalyst/execution/HiveQueryTests.scala index 5a39c35640684..a694a1d6194c6 100644 --- a/src/test/scala/catalyst/execution/HiveQueryTests.scala +++ b/src/test/scala/catalyst/execution/HiveQueryTests.scala @@ -4,7 +4,7 @@ package execution /** * A set of test cases expressed in Hive QL that are not covered by the tests included in the hive distribution. 
*/ -class HiveQueryTests extends HiveComaparisionTest { +class HiveQueryTests extends HiveComparisonTest { import TestShark._ createQueryTest("Simple Average", diff --git a/src/test/scala/catalyst/execution/HiveResolutionSuite.scala b/src/test/scala/catalyst/execution/HiveResolutionSuite.scala index 13dfb951cb55a..4ab6c0dc80a1f 100644 --- a/src/test/scala/catalyst/execution/HiveResolutionSuite.scala +++ b/src/test/scala/catalyst/execution/HiveResolutionSuite.scala @@ -4,7 +4,7 @@ package execution /** * A set of test cases expressed in Hive QL that are not covered by the tests included in the hive distribution. */ -class HiveResolutionSuite extends HiveComaparisionTest { +class HiveResolutionSuite extends HiveComparisonTest { import TestShark._ createQueryTest("table.attr", diff --git a/src/test/scala/catalyst/execution/HiveTypeCoersionSuite.scala b/src/test/scala/catalyst/execution/HiveTypeCoersionSuite.scala index 095dce23aade9..f94b9951bacab 100644 --- a/src/test/scala/catalyst/execution/HiveTypeCoersionSuite.scala +++ b/src/test/scala/catalyst/execution/HiveTypeCoersionSuite.scala @@ -4,7 +4,7 @@ package execution /** * A set of tests that validate type promotion rules. */ -class HiveTypeCoersionSuite extends HiveComaparisionTest { +class HiveTypeCoersionSuite extends HiveComparisonTest { import TestShark._ val baseTypes = Seq("1", "1.0", "1L", "1S", "1Y", "'1'") From 962761679ba3d5b7533cc8ca6db8c33aba15fb66 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 18:42:40 -0800 Subject: [PATCH 05/27] Use current database as default database. 
--- src/main/scala/catalyst/execution/MetastoreCatalog.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/catalyst/execution/MetastoreCatalog.scala b/src/main/scala/catalyst/execution/MetastoreCatalog.scala index 590bccfe7c8f4..d83242583f541 100644 --- a/src/main/scala/catalyst/execution/MetastoreCatalog.scala +++ b/src/main/scala/catalyst/execution/MetastoreCatalog.scala @@ -5,6 +5,7 @@ import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.api.{FieldSchema, Partition, Table, StorageDescriptor, SerDeInfo} import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import org.apache.hadoop.hive.ql.plan.TableDesc +import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde2.AbstractDeserializer import org.apache.hadoop.mapred.InputFormat @@ -21,7 +22,7 @@ class HiveMetastoreCatalog(hiveConf: HiveConf) extends Catalog { def lookupRelation(name: String, alias: Option[String]): BaseRelation = { val (databaseName, tableName) = name.split("\\.") match { - case Array(tableOnly) => ("default", tableOnly) + case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly) case Array(db, table) => (db, table) } val table = client.getTable(databaseName, tableName) @@ -46,7 +47,7 @@ class HiveMetastoreCatalog(hiveConf: HiveConf) extends Catalog { def apply(plan: LogicalPlan): LogicalPlan = plan transform { case InsertIntoCreatedTable(name, child) => val (databaseName, tableName) = name.split("\\.") match { - case Array(tableOnly) => ("default", tableOnly) + case Array(tableOnly) => (SessionState.get.getCurrentDatabase(), tableOnly) case Array(db, table) => (db, table) } From 1aafea35c32ebb242d12e9ff7b24f04100050ecb Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 18:42:59 -0800 Subject: [PATCH 06/27] Configure partition whitelist in TestShark reset. 
--- src/main/scala/catalyst/execution/TestShark.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/catalyst/execution/TestShark.scala b/src/main/scala/catalyst/execution/TestShark.scala index a27be94c107c4..e5cba37be9b1b 100644 --- a/src/main/scala/catalyst/execution/TestShark.scala +++ b/src/main/scala/catalyst/execution/TestShark.scala @@ -234,6 +234,8 @@ object TestShark extends SharkInstance { // For some reason, RESET does not reset the following variables... runSqlHive("set datanucleus.cache.collections=true") runSqlHive("set datanucleus.cache.collections.lazy=true") + // Lots of tests fail if we do not change the partition whitelist from the default. + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=[\\-A-Za-z0-9:_]*") loadedTables.clear() catalog.client.getAllTables("default").foreach { t => From ca4ea2636e5666fd0442790b623095247d7e5254 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 18:43:29 -0800 Subject: [PATCH 07/27] Support for parsing UDF(*). --- src/main/scala/catalyst/frontend/Hive.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index 7161690c29cdd..fb6c1d47b78ec 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -655,6 +655,8 @@ object HiveQl { /* UDFs - Must be last otherwise will preempt built in functions */ case Token("TOK_FUNCTION", Token(name, Nil) :: args) => UnresolvedFunction(name, args.map(nodeToExpr)) + case Token("TOK_FUNCTIONSTAR", Token(name, Nil) :: args) => + UnresolvedFunction(name, Star(None) :: Nil) /* Literals */ case Token("TOK_NULL", Nil) => Literal(null, IntegerType) // TODO: What type is null? From 68aa2e6f6e85d6e06424676de53f00794e64b468 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 18:43:52 -0800 Subject: [PATCH 08/27] Stronger type for Token extractor. 
--- src/main/scala/catalyst/frontend/Hive.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index fb6c1d47b78ec..d72b521a3e6b5 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -254,7 +254,7 @@ object HiveQl { /** Extractor for matching Hive's AST Tokens. */ object Token { /** @return matches of the form (tokenName, children). */ - def unapply(t: Any) = t match { + def unapply(t: Any): Option[(String, Seq[ASTNode])] = t match { case t: ASTNode => Some((t.getText, Option(t.getChildren).map(_.toList).getOrElse(Nil).asInstanceOf[Seq[ASTNode]])) case _ => None From 516481ca197807e91309780c81a719866cef336f Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 18:44:21 -0800 Subject: [PATCH 09/27] Ignore requests to explain native commands. --- src/main/scala/catalyst/frontend/Hive.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index d72b521a3e6b5..031bd48739952 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -339,8 +339,8 @@ object HiveQl { } protected def nodeToPlan(node: Node): LogicalPlan = node match { - // Just fake explain on create function... - case Token("TOK_EXPLAIN", Token("TOK_CREATEFUNCTION", _) :: Nil) => NoRelation + // Just fake explain for any of the native commands. + case Token("TOK_EXPLAIN", Token(explainType, _) :: Nil) if nativeCommands contains explainType => NoRelation case Token("TOK_EXPLAIN", explainArgs) => // Ignore FORMATTED if present. 
val Some(query) :: _ :: _ :: Nil = getClauses(Seq("TOK_QUERY", "FORMATTED", "EXTENDED"), explainArgs) From 4b6fed8a47e00675171faafac0e582df39f3ff58 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 9 Jan 2014 18:44:54 -0800 Subject: [PATCH 10/27] support for parsing both DESTINATION and INSERT_INTO. --- src/main/scala/catalyst/frontend/Hive.scala | 52 ++++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index 031bd48739952..9b2b9997d2d69 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -362,13 +362,15 @@ object HiveQl { // Return one query for each insert clause. val queries = insertClauses.map { case Token("TOK_INSERT", singleInsert) => - val (Some(destClause) :: + val ( + intoClause :: + destClause :: Some(selectClause) :: whereClause :: groupByClause :: orderByClause :: sortByClause :: - limitClause :: Nil) = getClauses(Seq("TOK_DESTINATION", "TOK_SELECT", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert) + limitClause :: Nil) = getClauses(Seq("TOK_INSERT_INTO", "TOK_DESTINATION", "TOK_SELECT", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert) val relations = nodeToRelation(fromClause) val withWhere = whereClause.map { whereNode => @@ -420,8 +422,13 @@ object HiveQl { .map(StopAfter(_, withSort)) .getOrElse(withSort) + // There are two tokens for specifying where to send the result that seem to be used almost + // interchangeably. 
+ val resultDestination = + (intoClause orElse destClause).getOrElse(sys.error("No destination found.")) + nodeToDest( - destClause, + resultDestination, withLimit) } @@ -441,11 +448,27 @@ object HiveQl { Subquery(alias, nodeToPlan(query)) /* Table, No Alias */ - case Token("TOK_TABREF", - Token("TOK_TABNAME", - tableNameParts) :: Nil) => - val tableName = tableNameParts.map { case Token(part, Nil) => part }.mkString(".") - UnresolvedRelation(tableName, None) + case Token("TOK_TABREF", clauses) => + // If the last clause is not a token then it's the alias of the table. + val (nonAliasClauses, aliasClause) = + if(clauses.last.getText.startsWith("TOK")) + (clauses, None) + else + (clauses.dropRight(1), Some(clauses.last)) + + val (Some(tableNameParts) :: + sampleClause :: Nil) = getClauses(Seq("TOK_TABNAME", "TOK_TABLESPLITSAMPLE"), nonAliasClauses) + + val tableName = tableNameParts.getChildren.map { case Token(part, Nil) => part }.mkString(".") + val alias = aliasClause.map { case Token(a, Nil) => a } + val relation = UnresolvedRelation(tableName, alias) + // Apply sampling if requested. + sampleClause.map { + case Token("TOK_TABLESPLITSAMPLE", + Token("TOK_ROWCOUNT", Nil) :: + Token(count, Nil) :: Nil) => + StopAfter(Literal(count.toInt), relation) + }.getOrElse(relation) case Token("TOK_UNIQUEJOIN", joinArgs) => val tableOrdinals = @@ -492,14 +515,6 @@ object HiveQl { // named output expressions where some aggregate expression has been applied (i.e. First). ??? 
/// Aggregate(groups, Star(None, First(_)) :: Nil, joinedResult) - /* Table with Alias */ - case Token("TOK_TABREF", - Token("TOK_TABNAME", - tableNameParts) :: - Token(alias, Nil) :: Nil) => - val tableName = tableNameParts.map { case Token(part, Nil) => part }.mkString(".") - UnresolvedRelation(tableName, Some(alias)) - case Token(allJoinTokens(joinToken), relation1 :: relation2 :: other) => @@ -529,13 +544,14 @@ object HiveQl { throw new NotImplementedError(s"No parse rules for:\n ${dumpTree(a).toString} ") } + val destinationToken = "TOK_DESTINATION|TOK_INSERT_INTO".r protected def nodeToDest(node: Node, query: LogicalPlan): LogicalPlan = node match { - case Token("TOK_DESTINATION", + case Token(destinationToken(), Token("TOK_DIR", Token("TOK_TMP_FILE", Nil) :: Nil) :: Nil) => query - case Token("TOK_DESTINATION", + case Token(destinationToken(), Token("TOK_TAB", tableArgs) :: Nil) => val Some(nameClause) :: partitionClause :: Nil = From 4c5fb0f4d804e8767ced60e30dcb01db9a86117c Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:01:40 -0800 Subject: [PATCH 11/27] makefile target for building new whitelist. --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index bbcf31f353131..8c66f1833f588 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,8 @@ all: a b c d e f g h i j k l m n o p q r s t u v w x y" z +buildWhiteList: + sbt -Dshark.hive.alltests "test-only catalyst.execution.HiveCompatibility" + findBroken: sbt -Dshark.hive.alltests -Dshark.hive.failFast "test-only catalyst.execution.HiveCompatibility" From 4c6b454116025439dc6a5c3562aa203d30a1ffe3 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:02:31 -0800 Subject: [PATCH 12/27] add option for recomputing the cached golden answer when tests fail. 
--- .../execution/HiveComparisonTest.scala | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/test/scala/catalyst/execution/HiveComparisonTest.scala b/src/test/scala/catalyst/execution/HiveComparisonTest.scala index 996c643125a26..a6a3c3b172f28 100644 --- a/src/test/scala/catalyst/execution/HiveComparisonTest.scala +++ b/src/test/scala/catalyst/execution/HiveComparisonTest.scala @@ -13,11 +13,13 @@ import util._ * * The "golden" results from Hive are cached in an retrieved both from the classpath and * [[answerCache]] to speed up testing. + * + * TODO(marmbrus): Document system properties. */ abstract class HiveComparisonTest extends FunSuite with BeforeAndAfterAll with GivenWhenThen with Logging { protected val targetDir = new File("target") - /** The local directory with cached golden answer will be stored */ + /** The local directory with cached golden answer will be stored. */ protected val answerCache = new File(targetDir, "comparison-test-cache") if (!answerCache.exists) answerCache.mkdir() @@ -40,6 +42,12 @@ abstract class HiveComparisonTest extends FunSuite with BeforeAndAfterAll with G val failFast = System.getProperty("shark.hive.failFast") != null private var testFailed = false + /** + * Delete any cache files that result in test failures. Used when the test harness has been + * updated thus requiring new golden answers to be computed for some tests. + */ + val recomputeCache = System.getProperty("shark.hive.recomputeCache") != null + protected val cacheDigest = java.security.MessageDigest.getInstance("MD5") protected def getMd5(str: String): String = { val digest = java.security.MessageDigest.getInstance("MD5") @@ -66,7 +74,7 @@ abstract class HiveComparisonTest extends FunSuite with BeforeAndAfterAll with G line.replaceAll("\"lastUpdateTime\":\\d+", "") /** - * Removes non-deterministic paths from str` so cached answers will still pass. 
+ * Removes non-deterministic paths from `str` so cached answers will still pass. */ protected def cleanPaths(str: String): String = { str.replaceAll("file:\\/.*\\/", "") @@ -98,7 +106,7 @@ abstract class HiveComparisonTest extends FunSuite with BeforeAndAfterAll with G } val hiveCachedResults = hiveCacheFiles.flatMap { cachedAnswerFile => - logger.warn(s"Looking for cached answer file $cachedAnswerFile.") + logger.debug(s"Looking for cached answer file $cachedAnswerFile.") if (cachedAnswerFile.exists) { Some(fileToString(cachedAnswerFile)) } else if (getClass.getClassLoader.getResourceAsStream(cachedAnswerFile.toString) != null) { @@ -173,13 +181,18 @@ abstract class HiveComparisonTest extends FunSuite with BeforeAndAfterAll with G val hivePrintOut = s"== HIVE - ${hive.size} row(s) ==" +: preparedHive val catalystPrintOut = s"== CATALYST - ${catalyst.size} row(s) ==" +: catalyst - val resultComparision = sideBySide(hivePrintOut, catalystPrintOut).mkString("\n") + val resultComparison = sideBySide(hivePrintOut, catalystPrintOut).mkString("\n") + + if(recomputeCache) { + logger.warn(s"Clearing cache files for failed test $testCaseName") + hiveCacheFiles.foreach(_.delete()) + } fail( s""" |Results do not match for query: |$sharkQuery\n${sharkQuery.analyzed.output.map(_.name).mkString("\t")} - |$resultComparision + |$resultComparison """.stripMargin) } } From b01468dfdd3d99c2159534fbff152d9ff46576fb Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:03:09 -0800 Subject: [PATCH 13/27] support path rewrites when the query begins with a comment. 
--- src/main/scala/catalyst/execution/TestShark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/catalyst/execution/TestShark.scala b/src/main/scala/catalyst/execution/TestShark.scala index e5cba37be9b1b..722baa3d8d594 100644 --- a/src/main/scala/catalyst/execution/TestShark.scala +++ b/src/main/scala/catalyst/execution/TestShark.scala @@ -85,7 +85,7 @@ object TestShark extends SharkInstance { * hive test cases assume the system is set up. */ private def rewritePaths(cmd: String): String = - if (cmd.toUpperCase startsWith "LOAD") + if (cmd.toUpperCase contains "LOAD DATA") cmd.replaceAll("\\.\\.", hiveDevHome.getCanonicalPath) else cmd From 8364ec2cc2a34461f15a7e9b19e426d7d9ce79b7 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:03:28 -0800 Subject: [PATCH 14/27] whitelist all possible partition values. --- src/main/scala/catalyst/execution/TestShark.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/catalyst/execution/TestShark.scala b/src/main/scala/catalyst/execution/TestShark.scala index 722baa3d8d594..506f43d8e312c 100644 --- a/src/main/scala/catalyst/execution/TestShark.scala +++ b/src/main/scala/catalyst/execution/TestShark.scala @@ -235,7 +235,7 @@ object TestShark extends SharkInstance { runSqlHive("set datanucleus.cache.collections=true") runSqlHive("set datanucleus.cache.collections.lazy=true") // Lots of tests fail if we do not change the partition whitelist from the default. - runSqlHive("set hive.metastore.partition.name.whitelist.pattern=[\\-A-Za-z0-9:_]*") + runSqlHive("set hive.metastore.partition.name.whitelist.pattern=.*") loadedTables.clear() catalog.client.getAllTables("default").foreach { t => From 78d730d657d0904b5a3c28b49737594aa40ce9cb Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:06:01 -0800 Subject: [PATCH 15/27] Load src test table on RESET. 
--- src/main/scala/catalyst/execution/TestShark.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/scala/catalyst/execution/TestShark.scala b/src/main/scala/catalyst/execution/TestShark.scala index 506f43d8e312c..736166326656b 100644 --- a/src/main/scala/catalyst/execution/TestShark.scala +++ b/src/main/scala/catalyst/execution/TestShark.scala @@ -263,6 +263,12 @@ object TestShark extends SharkInstance { configure() runSqlHive("USE default") + + // Just loading src makes a lot of tests pass. This is because some tests do something like + // drop an index on src at the beginning. Since we just pass DDL to hive this bypasses our + // Analyzer and thus the test table auto-loading mechanism. + // Remove after we handle more DDL operations natively. + loadTestTable("src") } catch { case e: Exception => logger.error(s"FATAL ERROR: Failed to reset TestDB state. $e") From 0d9d56aca705b856191a636e27723c57b74d7d93 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:07:20 -0800 Subject: [PATCH 16/27] add more native commands to parser --- src/main/scala/catalyst/frontend/Hive.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index 9b2b9997d2d69..c0a4de505aef9 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -82,6 +82,7 @@ object HiveQl { "TOK_ALTERINDEX_REBUILD", "TOK_ALTERTABLE_ADDCOLS", "TOK_ALTERTABLE_ADDPARTS", + "TOK_ALTERTABLE_ALTERPARTS", "TOK_ALTERTABLE_ARCHIVE", "TOK_ALTERTABLE_CLUSTER_SORT", "TOK_ALTERTABLE_DROPPARTS", @@ -90,6 +91,7 @@ object HiveQl { "TOK_ALTERTABLE_RENAME", "TOK_ALTERTABLE_RENAMECOL", "TOK_ALTERTABLE_REPLACECOLS", + "TOK_ALTERTABLE_SKEWED", "TOK_ALTERTABLE_TOUCH", "TOK_ALTERTABLE_UNARCHIVE", "TOK_ANALYZE", From 4cfc11a6c9dfc16cb216f2a18da4e4dd09bd246e Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 12:07:36 -0800 Subject: 
[PATCH 17/27] more test coverage. --- build.sbt | 2 +- .../execution/HiveCompatibility.scala | 117 +++++++++++++++++- 2 files changed, 116 insertions(+), 3 deletions(-) diff --git a/build.sbt b/build.sbt index 401fc04fd3cdd..e660fb5bbf584 100644 --- a/build.sbt +++ b/build.sbt @@ -14,7 +14,7 @@ resolvers += "Local Maven Repository" at "file://"+Path.userHome.absolutePath+"/ libraryDependencies += "org.apache.spark" %% "spark-core" % "0.9.0-incubating-SNAPSHOT" -libraryDependencies += "catalyst" % "hive-golden" % "2" % "test" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden2.jar" +libraryDependencies += "catalyst" % "hive-golden" % "3" % "test" from "http://repository-databricks.forge.cloudbees.com/snapshot/catalystGolden3.jar" // Hive 0.10.0 relies on a weird version of jdo that is not published anywhere... Remove when we upgrade to 0.11.0 libraryDependencies += "javax.jdo" % "jdo2-api" % "2.3-ec" from "http://www.datanucleus.org/downloads/maven2/javax/jdo/jdo2-api/2.3-ec/jdo2-api-2.3-ec.jar" diff --git a/src/test/scala/catalyst/execution/HiveCompatibility.scala b/src/test/scala/catalyst/execution/HiveCompatibility.scala index 32daa17156b92..5c1dcf78ba6fd 100644 --- a/src/test/scala/catalyst/execution/HiveCompatibility.scala +++ b/src/test/scala/catalyst/execution/HiveCompatibility.scala @@ -31,6 +31,7 @@ class HiveCompatibility extends HiveQueryFileTest { "authorization_5", "keyword_1", "misc_json", + "create_like_tbl_props", // Timezone specific test answers. "udf_unix_timestamp", @@ -52,6 +53,11 @@ class HiveCompatibility extends HiveQueryFileTest { // These tests fail and and exit the JVM. "auto_join18_multi_distinct", "join18_multi_distinct", + "input44", + "input42", + "input_dfs", + "metadata_export_drop", + "repair", // Uses a serde that isn't on the classpath... breaks other tests. 
"bucketizedhiveinputformat", @@ -64,7 +70,23 @@ class HiveCompatibility extends HiveQueryFileTest { "uniquejoin", // Hive seems to get the wrong answer on some outer joins. MySQL agrees with catalyst. - "auto_join29" + "auto_join29", + + // No support for multi-alias i.e. udf as (e1, e2, e3). + "allcolref_in_udf", + + // No support for TestSerDe (not published afaik) + "alter1", + "input16", + + // Shark does not support buckets. + ".*bucket.*", + + // No window support yet + ".* window.*", + + // Fails in hive with authorization errors. + "alter_rename_partition_authorization" ) /** @@ -77,7 +99,10 @@ class HiveCompatibility extends HiveQueryFileTest { "add_partition_with_whitelist", "alias_casted_column", "alter4", + "alter_index", + "alter_partition_format_loc", "alter_partition_with_whitelist", + "alter_table_serde", "ambiguous_col", "authorization_3", "authorization_5", @@ -87,10 +112,18 @@ class HiveCompatibility extends HiveQueryFileTest { "auto_join26", "auto_join28", "auto_join_nulls", + "auto_sortmerge_join_1", "auto_sortmerge_join_10", + "auto_sortmerge_join_11", + "auto_sortmerge_join_12", "auto_sortmerge_join_15", + "auto_sortmerge_join_2", + "auto_sortmerge_join_3", + "auto_sortmerge_join_4", "auto_sortmerge_join_5", "auto_sortmerge_join_6", + "auto_sortmerge_join_7", + "auto_sortmerge_join_8", "auto_sortmerge_join_9", "binarysortable_1", "bucket1", @@ -106,19 +139,34 @@ class HiveCompatibility extends HiveQueryFileTest { "correlationoptimizer6", "correlationoptimizer7", "count", + "create_like2", + "create_like_tbl_props", + "create_view_translate", "ct_case_insensitive", "database_properties", + "default_partition_name", "delimiter", "desc_non_existent_tbl", "describe_database_json", "describe_table_json", + "describe_formatted_view_partitioned", + "describe_formatted_view_partitioned_json", + "describe_pretty", + "describe_syntax", + "diff_part_input_formats", "disable_file_format_check", "drop_function", "drop_index", + "drop_partitions_filter", + 
"drop_partitions_filter2", + "drop_partitions_filter3", + "drop_partitions_ignore_protection", "drop_table", + "drop_table2", "drop_view", "escape_orderby1", "escape_sortby1", + "filter_join_breaktask", "groupby1", "groupby1_map", "groupby1_map_nomap", @@ -130,34 +178,65 @@ class HiveCompatibility extends HiveQueryFileTest { "groupby5_map", "groupby5_map_skew", "groupby5_noskew", + "groupby7", + "groupby7_map", + "groupby7_map_multi_single_reducer", + "groupby7_map_skew", + "groupby7_noskew", + "groupby8_map", + "groupby8_map_skew", + "groupby8_noskew", "groupby_multi_single_reducer2", "groupby_mutli_insert_common_distinct", + "groupby_sort_6", + "groupby_sort_8", "groupby_sort_test_1", "implicit_cast1", "index_auto_self_join", "index_auto_update", "index_stale", + "index_auth", + "index_auto_file_format", + "index_auto_mult_tables", + "index_auto_mult_tables_compact", + "index_auto_multiple", + "index_bitmap_compression", + "index_compression", "innerjoin", "inoutdriver", + "input", "input0", "input11", "input11_limit", "input1_limit", "input22", + "input23", + "input24", + "input25", + "input28", + "input2_limit", + "input41", "input4_cb_delim", "input4_limit", "input6", "input7", "input8", "input9", + "input_limit", + "input_part1", + "input_part2", "inputddl4", + "inputddl7", "inputddl8", + "insert_compressed", "join0", "join1", "join10", "join11", "join12", "join13", + "join14", + "join14_hadoop20", "join15", "join16", "join17", @@ -170,11 +249,15 @@ class HiveCompatibility extends HiveQueryFileTest { "join23", "join24", "join25", + "join26", "join27", + "join28", "join29", "join3", "join30", "join31", + "join32", + "join33", "join34", "join35", "join36", @@ -188,6 +271,7 @@ class HiveCompatibility extends HiveQueryFileTest { "join6", "join7", "join8", + "join9", "join_casesensitive", "join_empty", "join_hive_626", @@ -203,8 +287,14 @@ class HiveCompatibility extends HiveQueryFileTest { "literal_ints", "literal_string", "load_file_with_space_in_the_name", + 
"louter_join_ppr", + "mapjoin_mapjoin", + "mapjoin_subquery", "mapjoin_subquery2", + "mapjoin_test_outer", "mapreduce3", + "merge1", + "merge2", "mergejoins", "mergejoins_mixed", "misc_json", @@ -216,12 +306,22 @@ class HiveCompatibility extends HiveQueryFileTest { "notable_alias2", "nullgroup", "nullgroup2", + "nullgroup3", + "nullgroup5", "nullinput", "nullinput2", "nullscript", "optional_outer", + "order", + "order2", + "outer_join_ppr", + "part_inherit_tbl_props", + "part_inherit_tbl_props_empty", + "partition_schema1", + "partitions_json", "plan_json", "ppd1", + "ppd_constant_where", "ppd_gby", "ppd_gby_join", "ppd_join", @@ -237,28 +337,40 @@ class HiveCompatibility extends HiveQueryFileTest { "ppd_udf_col", "ppd_union", "progress_1", + "protectmode", + "push_or", "query_with_semi", "quote2", "rename_column", + "router_join_ppr", "select_as_omitted", + "select_unquote_and", + "select_unquote_not", + "select_unquote_or", + "serde_reported_schema", "set_variable_sub", "show_describe_func_quotes", "show_functions", "skewjoinopt13", "skewjoinopt18", "skewjoinopt9", + "smb_mapjoin_10", "smb_mapjoin_13", "smb_mapjoin_14", "smb_mapjoin_15", "smb_mapjoin_16", "smb_mapjoin_17", + "smb_mapjoin_21", + "sort", "sort_merge_join_desc_1", "sort_merge_join_desc_2", "sort_merge_join_desc_3", "sort_merge_join_desc_4", + "sort_merge_join_desc_5", + "sort_merge_join_desc_6", + "sort_merge_join_desc_7", "subq2", "tablename_with_select", - "type_cast_1", "udf2", "udf9", "udf_10_trims", @@ -288,6 +400,7 @@ class HiveCompatibility extends HiveQueryFileTest { "udf_datediff", "udf_day", "udf_dayofmonth", + "udf_div", "udf_double", "udf_exp", "udf_field", From 9ae740a589fb24aaf7635f3533abb570ca877281 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 14:11:09 -0800 Subject: [PATCH 18/27] blacklist more tests that require MR. 
--- src/test/scala/catalyst/execution/HiveCompatibility.scala | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/test/scala/catalyst/execution/HiveCompatibility.scala b/src/test/scala/catalyst/execution/HiveCompatibility.scala index 5c1dcf78ba6fd..ebd10286b4bd5 100644 --- a/src/test/scala/catalyst/execution/HiveCompatibility.scala +++ b/src/test/scala/catalyst/execution/HiveCompatibility.scala @@ -42,6 +42,13 @@ class HiveCompatibility extends HiveQueryFileTest { "index_auto_self_join", "index_stale", "type_cast_1", + "index_compression", + "index_bitmap_compression", + "index_auto_multiple", + "index_auto_mult_tables_compact", + "index_auto_mult_tables", + "index_auto_file_format", + "index_auth", // Hive seems to think 1.0 > NaN = true && 1.0 < NaN = false... which is wrong. // http://stackoverflow.com/a/1573715 From 755b2292e980e285d5f7e5bdd4b66f65aa8ee211 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 14:35:07 -0800 Subject: [PATCH 19/27] blacklist some ddl tests. --- src/test/scala/catalyst/execution/HiveCompatibility.scala | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/test/scala/catalyst/execution/HiveCompatibility.scala b/src/test/scala/catalyst/execution/HiveCompatibility.scala index ebd10286b4bd5..fd1595e86d102 100644 --- a/src/test/scala/catalyst/execution/HiveCompatibility.scala +++ b/src/test/scala/catalyst/execution/HiveCompatibility.scala @@ -33,6 +33,11 @@ class HiveCompatibility extends HiveQueryFileTest { "misc_json", "create_like_tbl_props", + // Weird DDL differences result in failures on jenkins. + "create_like2", + "create_view_translate", + "partitions_json", + // Timezone specific test answers. "udf_unix_timestamp", "udf_to_unix_timestamp", From e9f45889903455fc2e8f298af4f8aefec40d10d8 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Fri, 10 Jan 2014 14:55:17 -0800 Subject: [PATCH 20/27] fix > 100 char. 
--- src/test/scala/catalyst/execution/HiveCompatibility.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/scala/catalyst/execution/HiveCompatibility.scala b/src/test/scala/catalyst/execution/HiveCompatibility.scala index fd1595e86d102..f2eeebd6c886a 100644 --- a/src/test/scala/catalyst/execution/HiveCompatibility.scala +++ b/src/test/scala/catalyst/execution/HiveCompatibility.scala @@ -15,7 +15,8 @@ class HiveCompatibility extends HiveQueryFileTest { /** A list of tests deemed out of scope currently and thus completely disregarded */ override def blackList = Seq( - "hook_order", // These tests use hooks that are not on the classpath and thus break all subsequent SQL execution. + // These tests use hooks that are not on the classpath and thus break all subsequent execution. + "hook_order", "hook_context", "mapjoin_hook", "multi_sahooks", From ef7b9435dfa3a17ac66e8f54a6fe0d648610c07e Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:26:22 -0800 Subject: [PATCH 21/27] add metastore support for float --- src/main/scala/catalyst/execution/MetastoreCatalog.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/catalyst/execution/MetastoreCatalog.scala b/src/main/scala/catalyst/execution/MetastoreCatalog.scala index d83242583f541..12b06996bd66e 100644 --- a/src/main/scala/catalyst/execution/MetastoreCatalog.scala +++ b/src/main/scala/catalyst/execution/MetastoreCatalog.scala @@ -82,6 +82,7 @@ object HiveMetatoreTypes { def toDataType(metastoreType: String): DataType = metastoreType match { case "string" => StringType + case "float" => FloatType case "int" => IntegerType case "double" => DoubleType case "bigint" => LongType From f0faa264b84e670e0139bd1496aef895b6f705ee Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:31:02 -0800 Subject: [PATCH 22/27] add sample and distinct operators. 
--- .../scala/catalyst/plans/logical/basicOperators.scala | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/main/scala/catalyst/plans/logical/basicOperators.scala b/src/main/scala/catalyst/plans/logical/basicOperators.scala index 224e44d857051..0e34e7629a16a 100644 --- a/src/main/scala/catalyst/plans/logical/basicOperators.scala +++ b/src/main/scala/catalyst/plans/logical/basicOperators.scala @@ -68,6 +68,16 @@ case class Subquery(alias: String, child: LogicalPlan) extends UnaryNode { def references = Set.empty } +case class Sample(percentage: Double, child: LogicalPlan) extends UnaryNode { + def output = child.output + def references = Set.empty +} + +case class Distinct(child: LogicalPlan) extends UnaryNode { + def output = child.output + def references = child.outputSet +} + case object NoRelation extends LeafNode { def output = Nil } From f58d5a56ca93367658a4c34930c49225c7323952 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:27:02 -0800 Subject: [PATCH 23/27] support for parsing SELECT DISTINCT --- src/main/scala/catalyst/frontend/Hive.scala | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index c0a4de505aef9..0692c83543c1d 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -367,12 +367,13 @@ object HiveQl { val ( intoClause :: destClause :: - Some(selectClause) :: + selectClause :: + selectDistinctClause :: whereClause :: groupByClause :: orderByClause :: sortByClause :: - limitClause :: Nil) = getClauses(Seq("TOK_INSERT_INTO", "TOK_DESTINATION", "TOK_SELECT", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert) + limitClause :: Nil) = getClauses(Seq("TOK_INSERT_INTO", "TOK_DESTINATION", "TOK_SELECT", "TOK_SELECTDI", "TOK_WHERE", "TOK_GROUPBY", "TOK_ORDERBY", "TOK_SORTBY", "TOK_LIMIT"), singleInsert) 
val relations = nodeToRelation(fromClause) val withWhere = whereClause.map { whereNode => @@ -380,10 +381,12 @@ object HiveQl { Filter(nodeToExpr(whereExpr), relations) }.getOrElse(relations) + val select = + (selectClause orElse selectDistinctClause).getOrElse(sys.error("No select clause.")) // Script transformations are expressed as a select clause with a single expression of type // TOK_TRANSFORM - val transformation = selectClause.getChildren.head match { + val transformation = select.getChildren.head match { case Token("TOK_SELEXPR", Token("TOK_TRANSFORM", Token("TOK_EXPLIST", inputExprs) :: @@ -404,7 +407,7 @@ object HiveQl { // a script transformation. val withProject = transformation.getOrElse { // Not a transformation so must be either project or aggregation. - val selectExpressions = nameExpressions(selectClause.getChildren.flatMap(selExprNodeToExpr)) + val selectExpressions = nameExpressions(select.getChildren.flatMap(selExprNodeToExpr)) groupByClause match { case Some(groupBy) => Aggregate(groupBy.getChildren.map(nodeToExpr), selectExpressions, withWhere) @@ -412,13 +415,19 @@ object HiveQl { } } + val withDistinct = + if(selectDistinctClause.isDefined) + Distinct(withProject) + else + withProject + require(!(orderByClause.isDefined && sortByClause.isDefined), "Can't have both a sort by and order by.") // Right now we treat sorting and ordering as identical. 
val withSort = (orderByClause orElse sortByClause) .map(_.getChildren.map(nodeToSortOrder)) - .map(Sort(_, withProject)) - .getOrElse(withProject) + .map(Sort(_, withDistinct)) + .getOrElse(withDistinct) val withLimit = limitClause.map(l => nodeToExpr(l.getChildren.head)) .map(StopAfter(_, withSort)) From a92919d8ee1c1d3ce10082d0023b28dd63efd98b Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:27:15 -0800 Subject: [PATCH 24/27] add alter view as to native commands --- src/main/scala/catalyst/frontend/Hive.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index 0692c83543c1d..1f203288543f2 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -105,6 +105,7 @@ object HiveQl { // TODO(marmbrus): Figure out how view are expanded by hive, as we might need to handle this. "TOK_ALTERVIEW_ADDPARTS", + "TOK_ALTERVIEW_AS", "TOK_ALTERVIEW_DROPPARTS", "TOK_ALTERVIEW_PROPERTIES", "TOK_ALTERVIEW_RENAME", From 0e975eafb5c8e1a1b9fd1c5155beb51716f2668e Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:29:18 -0800 Subject: [PATCH 25/27] parse bucket sampling as percentage sampling --- src/main/scala/catalyst/frontend/Hive.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index 1f203288543f2..e094a921bbe40 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -459,7 +459,7 @@ object HiveQl { query :: Token(alias, Nil) :: Nil) => Subquery(alias, nodeToPlan(query)) - /* Table, No Alias */ + /* All relations, possibly with aliases or sampling clauses. */ case Token("TOK_TABREF", clauses) => // If the last clause is not a token then it's the alias of the table. 
val (nonAliasClauses, aliasClause) = @@ -469,17 +469,23 @@ object HiveQl { (clauses.dropRight(1), Some(clauses.last)) val (Some(tableNameParts) :: - sampleClause :: Nil) = getClauses(Seq("TOK_TABNAME", "TOK_TABLESPLITSAMPLE"), nonAliasClauses) + splitSampleClause :: + bucketSampleClause :: Nil) = getClauses(Seq("TOK_TABNAME", "TOK_TABLESPLITSAMPLE", "TOK_TABLEBUCKETSAMPLE"), nonAliasClauses) - val tableName = tableNameParts.getChildren.map { case Token(part, Nil) => part }.mkString(".") - val alias = aliasClause.map { case Token(a, Nil) => a } + val tableName = tableNameParts.getChildren.map { case Token(part, Nil) => cleanIdentifier(part) }.mkString(".") + val alias = aliasClause.map { case Token(a, Nil) => cleanIdentifier(a) } val relation = UnresolvedRelation(tableName, alias) + // Apply sampling if requested. - sampleClause.map { + (bucketSampleClause orElse splitSampleClause).map { case Token("TOK_TABLESPLITSAMPLE", Token("TOK_ROWCOUNT", Nil) :: Token(count, Nil) :: Nil) => StopAfter(Literal(count.toInt), relation) + case Token("TOK_TABLEBUCKETSAMPLE", + Token(numerator, Nil) :: + Token(denominator, Nil) :: Nil) => + Sample(numerator.toDouble / denominator.toDouble, relation) }.getOrElse(relation) case Token("TOK_UNIQUEJOIN", joinArgs) => From c5842d274355ebe859e5cf4cd4c61ca0e4e409af Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:30:44 -0800 Subject: [PATCH 26/27] don't throw an error when a select clause outputs multiple copies of the same attribute. 
--- src/main/scala/catalyst/plans/logical/LogicalPlan.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/catalyst/plans/logical/LogicalPlan.scala b/src/main/scala/catalyst/plans/logical/LogicalPlan.scala index 27b96b5219085..c3eaecde3b221 100644 --- a/src/main/scala/catalyst/plans/logical/LogicalPlan.scala +++ b/src/main/scala/catalyst/plans/logical/LogicalPlan.scala @@ -41,7 +41,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] { option.name == remainingParts.head } - options match { + options.distinct match { case a :: Nil => Some(a) // One match, use it. case Nil => None // No matches. case ambiguousReferences => From 86355a65bf5036dbf05b0c9d0c75d7dc188d09c8 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Sat, 11 Jan 2014 14:31:19 -0800 Subject: [PATCH 27/27] throw error if there are unexpected join clauses. --- src/main/scala/catalyst/frontend/Hive.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/catalyst/frontend/Hive.scala b/src/main/scala/catalyst/frontend/Hive.scala index e094a921bbe40..ddd0af49a97f6 100644 --- a/src/main/scala/catalyst/frontend/Hive.scala +++ b/src/main/scala/catalyst/frontend/Hive.scala @@ -543,6 +543,7 @@ object HiveQl { case "TOK_LEFTOUTERJOIN" => LeftOuter case "TOK_FULLOUTERJOIN" => FullOuter } + assert(other.size <= 1, "Unhandled join clauses.") Join(nodeToRelation(relation1), nodeToRelation(relation2), joinType,