diff --git a/docs/sql-ref-syntax-ddl-alter-table.md b/docs/sql-ref-syntax-ddl-alter-table.md index 31eaf659b5c7a..adcfa8db06f12 100644 --- a/docs/sql-ref-syntax-ddl-alter-table.md +++ b/docs/sql-ref-syntax-ddl-alter-table.md @@ -233,6 +233,32 @@ ALTER TABLE table_identifier DROP [ IF EXISTS ] partition_spec [PURGE] Partition to be dropped. Note that one can use a typed literal (e.g., date'2019-01-02') in the partition spec. **Syntax:** `PARTITION ( partition_col_name = partition_col_val [ , ... ] )` + +#### CLUSTER BY + +The `ALTER TABLE CLUSTER BY` command can also be used for changing or removing the clustering columns for existing tables. + +##### Syntax + +```sql +-- Changing Clustering Columns +ALTER TABLE table_identifier CLUSTER BY ( col_name [ , ... ] ) + +-- Removing Clustering Columns +ALTER TABLE table_identifier CLUSTER BY NONE +``` + +##### Parameters + +* **table_identifier** + + Specifies a table name, which may be optionally qualified with a database name. + + **Syntax:** `[ database_name. ] table_name` + +* **col_name** + + Specifies the name of the column. 
### SET AND UNSET @@ -596,6 +622,51 @@ SHOW PARTITIONS StudentInfo; | age=20| +---------+ +-- CLUSTER BY +DESC Teacher; ++------------------------+---------+-------+ +| col_name|data_type|comment| ++------------------------+---------+-------+ +| name| string| NULL| +| gender| string| NULL| +| country| string| NULL| +| age| int| NULL| +|# Clustering Information| | | +| # col_name|data_type|comment| +| gender| string| NULL| ++------------------------+---------+-------+ + +ALTER TABLE Teacher CLUSTER BY (gender, country); + +-- After changing clustering columns +DESC Teacher; ++------------------------+---------+-------+ +| col_name|data_type|comment| ++------------------------+---------+-------+ +| name| string| NULL| +| gender| string| NULL| +| country| string| NULL| +| age| int| NULL| +|# Clustering Information| | | +| # col_name|data_type|comment| +| gender| string| NULL| +| country| string| NULL| ++------------------------+---------+-------+ + +ALTER TABLE Teacher CLUSTER BY NONE; + +-- After removing clustering columns +DESC Teacher; ++------------------------+---------+-------+ +| col_name|data_type|comment| ++------------------------+---------+-------+ +| name| string| NULL| +| gender| string| NULL| +| country| string| NULL| +| age| int| NULL| +|# Clustering Information| | | ++------------------------+---------+-------+ + -- Change the fileformat ALTER TABLE loc_orc SET fileformat orc; diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala index 70cfa93254190..f88c516f0019d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/DDLParserSuite.scala @@ -21,7 +21,6 @@ import java.util.Locale import org.apache.spark.SparkThrowable import org.apache.spark.sql.catalyst.analysis._ -import 
org.apache.spark.sql.catalyst.catalog.ClusterBySpec import org.apache.spark.sql.catalyst.expressions.{EqualTo, Hex, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition.{after, first} @@ -236,23 +235,6 @@ class DDLParserSuite extends AnalysisTest { } } - test("alter table cluster by") { - comparePlans( - parsePlan("ALTER TABLE table_name CLUSTER BY (`a.b`, c.d, none)"), - AlterTableClusterBy( - UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"), - Some(ClusterBySpec(Seq( - FieldReference(Seq("a.b")), - FieldReference(Seq("c", "d")), - FieldReference(Seq("none"))))))) - - comparePlans( - parsePlan("ALTER TABLE table_name CLUSTER BY NONE"), - AlterTableClusterBy( - UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"), - None)) - } - test("create/replace table - with comment") { val createSql = "CREATE TABLE my_tab(a INT, b STRING) USING parquet COMMENT 'abc'" val replaceSql = "REPLACE TABLE my_tab(a INT, b STRING) USING parquet COMMENT 'abc'" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala index d8fa48a72cf81..1f9419c41b746 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -103,13 +103,10 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager) builder.build()) AlterTableChangeColumnCommand(table.catalogTable.identifier, colName, newColumn) - case AlterTableClusterBy(ResolvedTable(catalog, ident, table: V1Table, _), clusterBySpecOpt) + case AlterTableClusterBy(ResolvedTable(catalog, _, table: V1Table, _), clusterBySpecOpt) if isSessionCatalog(catalog) => - val prop = clusterBySpecOpt.map { clusterBySpec => - Map(ClusterBySpec.toProperty(table.schema, 
clusterBySpec, conf.resolver)) - }.getOrElse { - Map(ClusterBySpec.toProperty(table.schema, ClusterBySpec(Nil), conf.resolver)) - } + val prop = Map(ClusterBySpec.toProperty(table.schema, + clusterBySpecOpt.getOrElse(ClusterBySpec(Nil)), conf.resolver)) AlterTableSetPropertiesCommand(table.catalogTable.identifier, prop, isView = false) case RenameColumn(ResolvedV1TableIdentifier(ident), _, _) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterByParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterByParserSuite.scala new file mode 100644 index 0000000000000..1bbb3eb35fedc --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterByParserSuite.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution.command + +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedTable} +import org.apache.spark.sql.catalyst.catalog.ClusterBySpec +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan +import org.apache.spark.sql.catalyst.plans.logical.AlterTableClusterBy +import org.apache.spark.sql.connector.expressions.FieldReference +import org.apache.spark.sql.test.SharedSparkSession + +class AlterTableClusterByParserSuite extends AnalysisTest with SharedSparkSession { + + test("alter table cluster by") { + comparePlans( + parsePlan("ALTER TABLE table_name CLUSTER BY (`a.b`, c.d, none)"), + AlterTableClusterBy( + UnresolvedTable(Seq("table_name"), "ALTER TABLE ... CLUSTER BY"), + Some(ClusterBySpec(Seq( + FieldReference(Seq("a.b")), + FieldReference(Seq("c", "d")), + FieldReference(Seq("none"))))))) + + comparePlans( + parsePlan("ALTER TABLE table_name CLUSTER BY NONE"), + AlterTableClusterBy( + UnresolvedTable(Seq("table_name"), "ALTER TABLE ... 
CLUSTER BY"), + None)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala index 8961019f3f8d1..73a80cd910698 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/AlterTableClusterBySuiteBase.scala @@ -45,34 +45,33 @@ trait AlterTableClusterBySuiteBase extends QueryTest with DDLCommandTestUtils { test("test basic ALTER TABLE with clustering columns") { withNamespaceAndTable("ns", "table") { tbl => - spark.sql(s"CREATE TABLE $tbl (id INT, data STRING) $defaultUsing CLUSTER BY (id, data)") + sql(s"CREATE TABLE $tbl (id INT, data STRING) $defaultUsing CLUSTER BY (id, data)") validateClusterBy(tbl, Seq("id", "data")) - spark.sql(s"ALTER TABLE $tbl CLUSTER BY (data, id)") + sql(s"ALTER TABLE $tbl CLUSTER BY (data, id)") validateClusterBy(tbl, Seq("data", "id")) - spark.sql(s"ALTER TABLE $tbl CLUSTER BY NONE") + sql(s"ALTER TABLE $tbl CLUSTER BY NONE") validateClusterBy(tbl, Seq.empty) - spark.sql(s"ALTER TABLE $tbl CLUSTER BY (id)") + sql(s"ALTER TABLE $tbl CLUSTER BY (id)") validateClusterBy(tbl, Seq("id")) } } test("test clustering columns with comma") { withNamespaceAndTable("ns", "table") { tbl => - spark.sql(s"CREATE TABLE $tbl (`i,d` INT, data STRING) $defaultUsing " + - "CLUSTER BY (`i,d`, data)") + sql(s"CREATE TABLE $tbl (`i,d` INT, data STRING) $defaultUsing CLUSTER BY (`i,d`, data)") validateClusterBy(tbl, Seq("`i,d`", "data")) - spark.sql(s"ALTER TABLE $tbl CLUSTER BY (data, `i,d`)") + sql(s"ALTER TABLE $tbl CLUSTER BY (data, `i,d`)") validateClusterBy(tbl, Seq("data", "`i,d`")) } } test("test nested clustering columns") { withNamespaceAndTable("ns", "table") { tbl => - spark.sql(s"CREATE TABLE $tbl " + + sql(s"CREATE TABLE $tbl " + s"($nestedColumnSchema) " + 
s"$defaultUsing CLUSTER BY (${nestedClusteringColumns.mkString(",")})") validateClusterBy(tbl, nestedClusteringColumns) - spark.sql(s"ALTER TABLE $tbl CLUSTER BY (${nestedClusteringColumnsNew.mkString(",")})") + sql(s"ALTER TABLE $tbl CLUSTER BY (${nestedClusteringColumnsNew.mkString(",")})") validateClusterBy(tbl, nestedClusteringColumnsNew) } } @@ -80,10 +79,18 @@ trait AlterTableClusterBySuiteBase extends QueryTest with DDLCommandTestUtils { test("clustering columns not defined in schema") { withNamespaceAndTable("ns", "table") { tbl => sql(s"CREATE TABLE $tbl (id bigint, data string) $defaultUsing CLUSTER BY (id)") - val err = intercept[AnalysisException] { - sql(s"ALTER TABLE $tbl CLUSTER BY (unknown)") - } - assert(err.message.contains("Couldn't find column unknown in:")) + checkError( + exception = intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl CLUSTER BY (unknown)") + }, + errorClass = "_LEGACY_ERROR_TEMP_3060", + parameters = Map("i" -> "unknown", + "schema" -> + """root + | |-- id: long (nullable = true) + | |-- data: string (nullable = true) + |""".stripMargin) + ) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala index 385ef961127a2..f5f2b7517b484 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v1/AlterTableClusterBySuite.scala @@ -36,7 +36,7 @@ trait AlterTableClusterBySuiteBase extends command.AlterTableClusterBySuiteBase override def validateClusterBy(tableName: String, clusteringColumns: Seq[String]): Unit = { val catalog = spark.sessionState.catalog val (_, db, t) = parseTableName(tableName) - val table = catalog.getTableMetadata(TableIdentifier.apply(t, Some(db))) + val table = catalog.getTableMetadata(TableIdentifier(t, Some(db))) 
assert(table.clusterBySpec === Some(ClusterBySpec(clusteringColumns.map(FieldReference(_))))) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala index bbbe6cd758756..8a74d9c3572bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableClusterBySuite.scala @@ -40,13 +40,13 @@ class AlterTableClusterBySuite extends command.AlterTableClusterBySuiteBase test("test REPLACE TABLE with clustering columns") { withNamespaceAndTable("ns", "table") { tbl => - spark.sql(s"CREATE TABLE $tbl (id INT) $defaultUsing CLUSTER BY (id)") + sql(s"CREATE TABLE $tbl (id INT) $defaultUsing CLUSTER BY (id)") validateClusterBy(tbl, Seq("id")) - spark.sql(s"REPLACE TABLE $tbl (id INT, id2 INT) $defaultUsing CLUSTER BY (id2)") + sql(s"REPLACE TABLE $tbl (id INT, id2 INT) $defaultUsing CLUSTER BY (id2)") validateClusterBy(tbl, Seq("id2")) - spark.sql(s"ALTER TABLE $tbl CLUSTER BY (id)") + sql(s"ALTER TABLE $tbl CLUSTER BY (id)") validateClusterBy(tbl, Seq("id")) } }