Skip to content

Commit

Permalink
Core: Support case-insensitivity for column names in PartitionSpec (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
sl255051 committed Aug 20, 2024
1 parent 24afc1f commit 40d5204
Show file tree
Hide file tree
Showing 6 changed files with 1,093 additions and 19 deletions.
85 changes: 67 additions & 18 deletions api/src/main/java/org/apache/iceberg/PartitionSpec.java
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,7 @@ public static class Builder {
new AtomicInteger(unpartitionedLastAssignedId());
// check if there are conflicts between partition and schema field name
private boolean checkConflicts = true;
private boolean caseSensitive = true;

private Builder(Schema schema) {
this.schema = schema;
Expand All @@ -390,7 +391,8 @@ Builder checkConflicts(boolean check) {
}

private void checkAndAddPartitionName(String name, Integer sourceColumnId) {
Types.NestedField schemaField = schema.findField(name);
Types.NestedField schemaField =
this.caseSensitive ? schema.findField(name) : schema.caseInsensitiveFindField(name);
if (checkConflicts) {
if (sourceColumnId != null) {
// for identity transform case we allow conflicts between partition and schema field name
Expand Down Expand Up @@ -427,20 +429,31 @@ private void checkForRedundantPartitions(PartitionField field) {
dedupFields.put(dedupKey, field);
}

public Builder caseSensitive(boolean sensitive) {
this.caseSensitive = sensitive;
return this;
}

public Builder withSpecId(int newSpecId) {
this.specId = newSpecId;
return this;
}

private Types.NestedField findSourceColumn(String sourceName) {
Types.NestedField sourceColumn = schema.findField(sourceName);
Types.NestedField sourceColumn =
this.caseSensitive
? schema.findField(sourceName)
: schema.caseInsensitiveFindField(sourceName);
Preconditions.checkArgument(
sourceColumn != null, "Cannot find source column: %s", sourceName);
return sourceColumn;
}

Builder identity(String sourceName, String targetName) {
Types.NestedField sourceColumn = findSourceColumn(sourceName);
return identity(findSourceColumn(sourceName), targetName);
}

private Builder identity(Types.NestedField sourceColumn, String targetName) {
checkAndAddPartitionName(targetName, sourceColumn.fieldId());
PartitionField field =
new PartitionField(
Expand All @@ -451,12 +464,16 @@ Builder identity(String sourceName, String targetName) {
}

public Builder identity(String sourceName) {
return identity(sourceName, sourceName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
return identity(sourceColumn, schema.findColumnName(sourceColumn.fieldId()));
}

public Builder year(String sourceName, String targetName) {
return year(findSourceColumn(sourceName), targetName);
}

private Builder year(Types.NestedField sourceColumn, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
PartitionField field =
new PartitionField(sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.year());
checkForRedundantPartitions(field);
Expand All @@ -465,12 +482,17 @@ public Builder year(String sourceName, String targetName) {
}

public Builder year(String sourceName) {
return year(sourceName, sourceName + "_year");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return year(sourceColumn, columnName + "_year");
}

public Builder month(String sourceName, String targetName) {
return month(findSourceColumn(sourceName), targetName);
}

private Builder month(Types.NestedField sourceColumn, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
PartitionField field =
new PartitionField(sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.month());
checkForRedundantPartitions(field);
Expand All @@ -479,12 +501,17 @@ public Builder month(String sourceName, String targetName) {
}

public Builder month(String sourceName) {
return month(sourceName, sourceName + "_month");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return month(sourceColumn, columnName + "_month");
}

public Builder day(String sourceName, String targetName) {
return day(findSourceColumn(sourceName), targetName);
}

private Builder day(Types.NestedField sourceColumn, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
PartitionField field =
new PartitionField(sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.day());
checkForRedundantPartitions(field);
Expand All @@ -493,12 +520,17 @@ public Builder day(String sourceName, String targetName) {
}

public Builder day(String sourceName) {
return day(sourceName, sourceName + "_day");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return day(sourceColumn, columnName + "_day");
}

public Builder hour(String sourceName, String targetName) {
return hour(findSourceColumn(sourceName), targetName);
}

private Builder hour(Types.NestedField sourceColumn, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
PartitionField field =
new PartitionField(sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.hour());
checkForRedundantPartitions(field);
Expand All @@ -507,37 +539,52 @@ public Builder hour(String sourceName, String targetName) {
}

public Builder hour(String sourceName) {
return hour(sourceName, sourceName + "_hour");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return hour(sourceColumn, columnName + "_hour");
}

public Builder bucket(String sourceName, int numBuckets, String targetName) {
return bucket(findSourceColumn(sourceName), numBuckets, targetName);
}

private Builder bucket(Types.NestedField sourceColumn, int numBuckets, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
fields.add(
new PartitionField(
sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.bucket(numBuckets)));
return this;
}

public Builder bucket(String sourceName, int numBuckets) {
return bucket(sourceName, numBuckets, sourceName + "_bucket");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return bucket(sourceColumn, numBuckets, columnName + "_bucket");
}

public Builder truncate(String sourceName, int width, String targetName) {
return truncate(findSourceColumn(sourceName), width, targetName);
}

private Builder truncate(Types.NestedField sourceColumn, int width, String targetName) {
checkAndAddPartitionName(targetName);
Types.NestedField sourceColumn = findSourceColumn(sourceName);
fields.add(
new PartitionField(
sourceColumn.fieldId(), nextFieldId(), targetName, Transforms.truncate(width)));
return this;
}

public Builder truncate(String sourceName, int width) {
return truncate(sourceName, width, sourceName + "_trunc");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return truncate(sourceColumn, width, columnName + "_trunc");
}

public Builder alwaysNull(String sourceName, String targetName) {
Types.NestedField sourceColumn = findSourceColumn(sourceName);
return alwaysNull(findSourceColumn(sourceName), targetName);
}

private Builder alwaysNull(Types.NestedField sourceColumn, String targetName) {
checkAndAddPartitionName(
targetName, sourceColumn.fieldId()); // can duplicate a source column name
fields.add(
Expand All @@ -547,7 +594,9 @@ public Builder alwaysNull(String sourceName, String targetName) {
}

public Builder alwaysNull(String sourceName) {
return alwaysNull(sourceName, sourceName + "_null");
Types.NestedField sourceColumn = findSourceColumn(sourceName);
String columnName = schema.findColumnName(sourceColumn.fieldId());
return alwaysNull(sourceColumn, columnName + "_null");
}

// add a partition field with an auto-increment partition field id starting from
Expand Down
28 changes: 27 additions & 1 deletion api/src/main/java/org/apache/iceberg/types/TypeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,37 @@ public static Map<Integer, String> indexQuotedNameById(
return indexer.byId();
}

/**
* Creates a mapping from lower-case field names to their corresponding field IDs.
*
* <p>This method iterates over the fields of the provided struct and maps each field's name
* (converted to lower-case) to its ID. If two fields have the same lower-case name, an
* `IllegalArgumentException` is thrown.
*
* @param struct the struct type whose fields are to be indexed
* @return a map where the keys are lower-case field names and the values are field IDs
* @throws IllegalArgumentException if two fields have the same lower-case name
*/
public static Map<String, Integer> indexByLowerCaseName(Types.StructType struct) {
Map<String, Integer> indexByLowerCaseName = Maps.newHashMap();

IndexByName indexer = new IndexByName();
visit(struct, indexer);
Map<String, Integer> byName = indexer.byName();
Map<Integer, String> byId = indexer.byId();

indexByName(struct)
.forEach(
(name, integer) -> indexByLowerCaseName.put(name.toLowerCase(Locale.ROOT), integer));
(name, fieldId) -> {
String key = name.toLowerCase(Locale.ROOT);
Integer existingId = indexByLowerCaseName.put(key, fieldId);
Preconditions.checkArgument(
existingId == null || existingId.equals(fieldId),
"Cannot build lower case index: %s and %s collide",
byId.get(existingId),
byId.get(fieldId));
indexByLowerCaseName.put(key, fieldId);
});
return indexByLowerCaseName;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;

import static org.apache.iceberg.types.Types.NestedField.required;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;

import org.apache.iceberg.types.Types;
import org.junit.jupiter.api.Test;

public class TestSchemaCaseSensitivity {

@Test
public void testCaseInsensitiveFieldCollision() {
Schema schema =
new Schema(
required(1, "id", Types.LongType.get()),
required(2, "data", Types.StringType.get()),
required(3, "DATA", Types.StringType.get()));
assertThatIllegalArgumentException()
.isThrownBy(() -> schema.caseInsensitiveFindField("DATA"))
.withMessage("Cannot build lower case index: data and DATA collide");
}

@Test
public void testCaseSensitiveFindField() {
Schema schema =
new Schema(
required(1, "id", Types.LongType.get()),
required(2, "data", Types.StringType.get()),
required(3, "DATA", Types.StringType.get()));

Types.NestedField actual1 = schema.findField("data");
assertThat(actual1).isEqualTo(Types.NestedField.required(2, "data", Types.StringType.get()));
Types.NestedField actual2 = schema.findField("DATA");
assertThat(actual2).isEqualTo(Types.NestedField.required(3, "DATA", Types.StringType.get()));
}

@Test
public void testCaseInsensitiveField() {
Schema schema =
new Schema(
required(1, "id", Types.LongType.get()), required(2, "data", Types.StringType.get()));

Types.NestedField actual1 = schema.caseInsensitiveFindField("DATA");
assertThat(actual1).isEqualTo(Types.NestedField.required(2, "data", Types.StringType.get()));
}
}
Loading

0 comments on commit 40d5204

Please sign in to comment.