Skip to content

Commit

Permalink
Lowercase bucketing and sort column names
Browse files Browse the repository at this point in the history
In the metastore, the bucketing and sorting column names can differ
in case from their corresponding table column names.
This change ensures that, even when the metastore delivers a table
with such inconsistencies, Trino lowercases the bucketing and sort
column names so that they match the data column names.
  • Loading branch information
  • Loading branch information
findinpath authored and findepi committed Apr 3, 2023
1 parent 8286075 commit f61f5e5
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static java.util.Locale.ENGLISH;
import static java.util.Objects.requireNonNull;

public class HiveBucketProperty
Expand Down Expand Up @@ -70,7 +71,11 @@ public static Optional<HiveBucketProperty> fromStorageDescriptor(Map<String, Str
.collect(toImmutableList());
}
BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(tableParameters);
return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy));
List<String> bucketColumnNames = storageDescriptor.getBucketCols().stream()
// Ensure that the names used for the bucket columns are specified in lower case to match the names of the table columns
.map(name -> name.toLowerCase(ENGLISH))
.collect(toImmutableList());
return Optional.of(new HiveBucketProperty(bucketColumnNames, bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy));
}

@JsonProperty
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import javax.annotation.concurrent.Immutable;

import java.util.Locale;
import java.util.Objects;

import static com.google.common.base.MoreObjects.toStringHelper;
Expand Down Expand Up @@ -92,7 +93,9 @@ public Order getOrder()

public static SortingColumn fromMetastoreApiOrder(io.trino.hive.thrift.metastore.Order order, String tablePartitionName)
{
    // The metastore may report the sort column name with different casing than the
    // corresponding table column; normalize to lower case so the names match the data columns.
    String columnName = order.getCol().toLowerCase(Locale.ENGLISH);
    Order sortOrder = Order.fromMetastoreApiOrder(order.getOrder(), tablePartitionName);
    return new SortingColumn(columnName, sortOrder);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,33 @@ public void testSparkParquetTimestampCompatibility(String sparkTimestampFormat,
onSpark().executeQuery("DROP TABLE " + sparkTableName);
}

@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testSparkClusteringCaseSensitiveCompatibility()
{
    // Clustering/sorting column is declared as `SEGMENT_ID` while the data column is
    // `segment_id`; Trino must still be able to read the Spark bucketed table.
    String sparkTableNameWithClusteringDifferentCase = "test_spark_clustering_case_sensitive_" + randomNameSuffix();
    onSpark().executeQuery(
            String.format("CREATE TABLE %s (row_id int, `segment_id` int, value long) ", sparkTableNameWithClusteringDifferentCase) +
            "USING PARQUET " +
            "PARTITIONED BY (`part` string) " +
            "CLUSTERED BY (`SEGMENT_ID`) " +
            " SORTED BY (`SEGMENT_ID`) " +
            " INTO 10 BUCKETS");

    onSpark().executeQuery(format("INSERT INTO %s ", sparkTableNameWithClusteringDifferentCase) +
            "VALUES " +
            " (1, 1, 100, 'part1')," +
            " (100, 1, 123, 'part2')," +
            " (101, 2, 202, 'part2')");

    // Ensure that Trino can successfully read from the Spark bucketed table even though the clustering
    // column `SEGMENT_ID` is in a different case than the data column `segment_id`
    assertThat(onTrino().executeQuery(format("SELECT * FROM %s.default.%s", TRINO_CATALOG, sparkTableNameWithClusteringDifferentCase)))
            .containsOnly(List.of(
                    row(1, 1, 100, "part1"),
                    row(100, 1, 123, "part2"),
                    row(101, 2, 202, "part2")));

    // Clean up the test table, matching the sibling tests in this class (otherwise it leaks
    // into subsequent runs against the same warehouse)
    onSpark().executeQuery("DROP TABLE " + sparkTableNameWithClusteringDifferentCase);
}

@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testSparkParquetBloomFilterCompatibility()
{
Expand Down

0 comments on commit f61f5e5

Please sign in to comment.