Skip to content

Commit

Permalink
Lowercase bucketing and sort column names
Browse files Browse the repository at this point in the history
In the metastore, the bucketing and sorting column names can differ
in case from their corresponding table column names.
This change ensures that, even when the metastore delivers a table
with such inconsistencies, Trino lowercases the bucketing and sort
column names so that they match the data column names.
  • Loading branch information
  • Loading branch information
findinpath authored and findepi committed Apr 3, 2023
1 parent 8286075 commit f61f5e5
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_INVALID_METADATA;
import static java.util.Locale.ENGLISH;
import static java.util.Objects.requireNonNull;

public class HiveBucketProperty
Expand Down Expand Up @@ -70,7 +71,11 @@ public static Optional<HiveBucketProperty> fromStorageDescriptor(Map<String, Str
.collect(toImmutableList());
}
BucketingVersion bucketingVersion = HiveBucketing.getBucketingVersion(tableParameters);
return Optional.of(new HiveBucketProperty(storageDescriptor.getBucketCols(), bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy));
List<String> bucketColumnNames = storageDescriptor.getBucketCols().stream()
// Ensure that the names used for the bucket columns are specified in lower case to match the names of the table columns
.map(name -> name.toLowerCase(ENGLISH))
.collect(toImmutableList());
return Optional.of(new HiveBucketProperty(bucketColumnNames, bucketingVersion, storageDescriptor.getNumBuckets(), sortedBy));
}

@JsonProperty
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import javax.annotation.concurrent.Immutable;

import java.util.Locale;
import java.util.Objects;

import static com.google.common.base.MoreObjects.toStringHelper;
Expand Down Expand Up @@ -92,7 +93,9 @@ public Order getOrder()

public static SortingColumn fromMetastoreApiOrder(io.trino.hive.thrift.metastore.Order order, String tablePartitionName)
{
    // The metastore may report the sort column name with different casing than the
    // corresponding table column; normalize to lower case so the names match the data columns.
    String columnName = order.getCol().toLowerCase(Locale.ENGLISH);
    Order sortOrder = Order.fromMetastoreApiOrder(order.getOrder(), tablePartitionName);
    return new SortingColumn(columnName, sortOrder);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,33 @@ public void testSparkParquetTimestampCompatibility(String sparkTimestampFormat,
onSpark().executeQuery("DROP TABLE " + sparkTableName);
}

@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testSparkClusteringCaseSensitiveCompatibility()
{
    // Clustering/sorting column is declared as `SEGMENT_ID` while the data column is
    // `segment_id`; Trino must still be able to read the Spark bucketed table.
    String sparkTableNameWithClusteringDifferentCase = "test_spark_clustering_case_sensitive_" + randomNameSuffix();
    onSpark().executeQuery(
            String.format("CREATE TABLE %s (row_id int, `segment_id` int, value long) ", sparkTableNameWithClusteringDifferentCase) +
            "USING PARQUET " +
            "PARTITIONED BY (`part` string) " +
            "CLUSTERED BY (`SEGMENT_ID`) " +
            " SORTED BY (`SEGMENT_ID`) " +
            " INTO 10 BUCKETS");

    onSpark().executeQuery(format("INSERT INTO %s ", sparkTableNameWithClusteringDifferentCase) +
            "VALUES " +
            " (1, 1, 100, 'part1')," +
            " (100, 1, 123, 'part2')," +
            " (101, 2, 202, 'part2')");

    // Ensure that Trino can successfully read from the Spark bucketed table even though the clustering
    // column `SEGMENT_ID` is in a different case than the data column `segment_id`
    assertThat(onTrino().executeQuery(format("SELECT * FROM %s.default.%s", TRINO_CATALOG, sparkTableNameWithClusteringDifferentCase)))
            .containsOnly(List.of(
                    row(1, 1, 100, "part1"),
                    row(100, 1, 123, "part2"),
                    row(101, 2, 202, "part2")));

    // Clean up the test table, matching the sibling tests in this class (otherwise it leaks
    // into subsequent runs against the same warehouse)
    onSpark().executeQuery("DROP TABLE " + sparkTableNameWithClusteringDifferentCase);
}

@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testSparkParquetBloomFilterCompatibility()
{
Expand Down

0 comments on commit f61f5e5

Please sign in to comment.