forked from apache/hudi
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[HUDI-5342] Add new bulk insert sort modes repartitioning data by par…
…tition path (apache#7402) This PR adds two new bulk insert sort modes, PARTITION_PATH_REPARTITION and PARTITION_PATH_REPARTITION_AND_SORT, which do the following: (1) For a physically partitioned table, they repartition the input records based on the partition path, limiting the shuffle parallelism to the specified outputSparkPartitions. For PARTITION_PATH_REPARTITION_AND_SORT, an additional step sorts the records by partition path within each Spark partition. (2) For a physically non-partitioned table, they simply coalesce the input rows to outputSparkPartitions. New unit tests are added to verify the added functionality.
- Loading branch information
Showing
17 changed files
with
444 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
52 changes: 52 additions & 0 deletions
52
...lient/src/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRDDPartitioner.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.hudi.execution.bulkinsert; | ||
|
||
import org.apache.hudi.common.function.SerializableFunctionUnchecked; | ||
|
||
import org.apache.spark.Partitioner; | ||
|
||
import java.io.Serializable; | ||
import java.util.Objects; | ||
|
||
/** | ||
* A Spark RDD partitioner implementation that determines the Spark partition | ||
* based on the table partition path, generating targeted number of Spark partitions. | ||
*/ | ||
public class PartitionPathRDDPartitioner extends Partitioner implements Serializable { | ||
private final SerializableFunctionUnchecked<Object, String> partitionPathExtractor; | ||
private final int numPartitions; | ||
|
||
PartitionPathRDDPartitioner(SerializableFunctionUnchecked<Object, String> partitionPathExtractor, int numPartitions) { | ||
this.partitionPathExtractor = partitionPathExtractor; | ||
this.numPartitions = numPartitions; | ||
} | ||
|
||
@Override | ||
public int numPartitions() { | ||
return numPartitions; | ||
} | ||
|
||
@SuppressWarnings("unchecked") | ||
@Override | ||
public int getPartition(Object o) { | ||
return Math.abs(Objects.hash(partitionPathExtractor.apply(o))) % numPartitions; | ||
} | ||
} |
71 changes: 71 additions & 0 deletions
71
...java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitioner.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.hudi.execution.bulkinsert; | ||
|
||
import org.apache.hudi.common.model.HoodieRecord; | ||
import org.apache.hudi.common.model.HoodieRecordPayload; | ||
import org.apache.hudi.table.BulkInsertPartitioner; | ||
|
||
import org.apache.spark.api.java.JavaRDD; | ||
|
||
import scala.Tuple2; | ||
|
||
/** | ||
* A built-in partitioner that does the following for input records for bulk insert operation | ||
* <p> | ||
* - For physically partitioned table, repartition the input records based on the partition path, | ||
* and sort records within Spark partitions, limiting the shuffle parallelism to specified | ||
* `outputSparkPartitions` | ||
* <p> | ||
* - For physically non-partitioned table, simply does coalesce for the input records with | ||
* `outputSparkPartitions` | ||
* <p> | ||
* Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT} mode. | ||
* | ||
* @param <T> HoodieRecordPayload type | ||
*/ | ||
public class PartitionPathRepartitionAndSortPartitioner<T extends HoodieRecordPayload> | ||
implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> { | ||
|
||
private final boolean isTablePartitioned; | ||
|
||
public PartitionPathRepartitionAndSortPartitioner(boolean isTablePartitioned) { | ||
this.isTablePartitioned = isTablePartitioned; | ||
} | ||
|
||
@Override | ||
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, | ||
int outputSparkPartitions) { | ||
if (isTablePartitioned) { | ||
PartitionPathRDDPartitioner partitioner = new PartitionPathRDDPartitioner( | ||
(partitionPath) -> (String) partitionPath, outputSparkPartitions); | ||
return records | ||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record)) | ||
.repartitionAndSortWithinPartitions(partitioner) | ||
.values(); | ||
} | ||
return records.coalesce(outputSparkPartitions); | ||
} | ||
|
||
@Override | ||
public boolean arePartitionRecordsSorted() { | ||
return isTablePartitioned; | ||
} | ||
} |
62 changes: 62 additions & 0 deletions
62
.../apache/hudi/execution/bulkinsert/PartitionPathRepartitionAndSortPartitionerWithRows.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.hudi.execution.bulkinsert; | ||
|
||
import org.apache.hudi.common.model.HoodieRecord; | ||
import org.apache.hudi.table.BulkInsertPartitioner; | ||
|
||
import org.apache.spark.sql.Column; | ||
import org.apache.spark.sql.Dataset; | ||
import org.apache.spark.sql.Row; | ||
|
||
/** | ||
* A built-in partitioner that does the following for input rows for bulk insert operation | ||
* <p> | ||
* - For physically partitioned table, repartition the input rows based on the partition path, | ||
* and sort rows within Spark partitions, limiting the shuffle parallelism to specified | ||
* `outputSparkPartitions` | ||
* <p> | ||
* - For physically non-partitioned table, simply does coalesce for the input rows with | ||
* `outputSparkPartitions` | ||
* <p> | ||
* Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION_AND_SORT} mode. | ||
*/ | ||
public class PartitionPathRepartitionAndSortPartitionerWithRows implements BulkInsertPartitioner<Dataset<Row>> { | ||
|
||
private final boolean isTablePartitioned; | ||
|
||
public PartitionPathRepartitionAndSortPartitionerWithRows(boolean isTablePartitioned) { | ||
this.isTablePartitioned = isTablePartitioned; | ||
} | ||
|
||
@Override | ||
public Dataset<Row> repartitionRecords(Dataset<Row> rows, int outputSparkPartitions) { | ||
if (isTablePartitioned) { | ||
return rows.repartition(outputSparkPartitions, new Column(HoodieRecord.PARTITION_PATH_METADATA_FIELD)) | ||
.sortWithinPartitions(new Column(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); | ||
} | ||
return rows.coalesce(outputSparkPartitions); | ||
} | ||
|
||
@Override | ||
public boolean arePartitionRecordsSorted() { | ||
return isTablePartitioned; | ||
} | ||
} |
70 changes: 70 additions & 0 deletions
70
...c/main/java/org/apache/hudi/execution/bulkinsert/PartitionPathRepartitionPartitioner.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.apache.hudi.execution.bulkinsert; | ||
|
||
import org.apache.hudi.common.model.HoodieRecord; | ||
import org.apache.hudi.common.model.HoodieRecordPayload; | ||
import org.apache.hudi.table.BulkInsertPartitioner; | ||
|
||
import org.apache.spark.api.java.JavaRDD; | ||
|
||
import scala.Tuple2; | ||
|
||
/** | ||
* A built-in partitioner that does the following for input records for bulk insert operation | ||
* <p> | ||
* - For physically partitioned table, repartition the input records based on the partition path, | ||
* limiting the shuffle parallelism to specified `outputSparkPartitions` | ||
* <p> | ||
* - For physically non-partitioned table, simply does coalesce for the input records with | ||
* `outputSparkPartitions` | ||
* <p> | ||
* Corresponding to the {@code BulkInsertSortMode.PARTITION_PATH_REPARTITION} mode. | ||
* | ||
* @param <T> HoodieRecordPayload type | ||
*/ | ||
public class PartitionPathRepartitionPartitioner<T extends HoodieRecordPayload> | ||
implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> { | ||
|
||
private final boolean isTablePartitioned; | ||
|
||
public PartitionPathRepartitionPartitioner(boolean isTablePartitioned) { | ||
this.isTablePartitioned = isTablePartitioned; | ||
} | ||
|
||
@Override | ||
public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, | ||
int outputSparkPartitions) { | ||
if (isTablePartitioned) { | ||
PartitionPathRDDPartitioner partitioner = new PartitionPathRDDPartitioner( | ||
(partitionPath) -> (String) partitionPath, outputSparkPartitions); | ||
return records | ||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record)) | ||
.partitionBy(partitioner) | ||
.values(); | ||
} | ||
return records.coalesce(outputSparkPartitions); | ||
} | ||
|
||
@Override | ||
public boolean arePartitionRecordsSorted() { | ||
return false; | ||
} | ||
} |
Oops, something went wrong.