From 679fd111f43f4eea205f4eee81972f1132a5ae1e Mon Sep 17 00:00:00 2001 From: Sagar Sumit Date: Mon, 8 Aug 2022 23:04:30 +0530 Subject: [PATCH 1/2] [HUDI-4571] Fix partition extractor infer function when partition field mismatch --- .../hudi/sync/common/HoodieSyncConfig.java | 34 +++++++---- .../sync/common/TestHoodieSyncConfig.java | 60 +++++++++++++++---- 2 files changed, 69 insertions(+), 25 deletions(-) diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java index b2df64133e96..7283fa4f2ce2 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java @@ -23,10 +23,8 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; -import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; -import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.sync.common.util.ConfigUtils; import com.beust.jcommander.Parameter; @@ -41,9 +39,14 @@ import java.util.stream.Collectors; import static org.apache.hudi.common.config.HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS; +import static org.apache.hudi.common.table.HoodieTableConfig.BASE_FILE_FORMAT; import static org.apache.hudi.common.table.HoodieTableConfig.DATABASE_NAME; +import static org.apache.hudi.common.table.HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE; import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_TABLE_NAME_KEY; import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY; +import static org.apache.hudi.common.table.HoodieTableConfig.PARTITION_FIELDS; +import static org.apache.hudi.common.table.HoodieTableConfig.URL_ENCODE_PARTITIONING; +import static org.apache.hudi.keygen.constant.KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME; /** * Configs needed to sync data into external meta stores, catalogs, etc. @@ -72,31 +75,38 @@ public class HoodieSyncConfig extends HoodieConfig { public static final ConfigProperty META_SYNC_TABLE_NAME = ConfigProperty .key("hoodie.datasource.hive_sync.table") .defaultValue("unknown") - .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HOODIE_WRITE_TABLE_NAME_KEY)) - .or(() -> Option.ofNullable(cfg.getString(HOODIE_TABLE_NAME_KEY)))) + .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HOODIE_TABLE_NAME_KEY)) + .or(() -> Option.ofNullable(cfg.getString(HOODIE_WRITE_TABLE_NAME_KEY)))) .withDocumentation("The name of the destination table that we should sync the hudi table to."); public static final ConfigProperty META_SYNC_BASE_FILE_FORMAT = ConfigProperty .key("hoodie.datasource.hive_sync.base_file_format") .defaultValue("PARQUET") - .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HoodieTableConfig.BASE_FILE_FORMAT))) + .withInferFunction(cfg -> Option.ofNullable(cfg.getString(BASE_FILE_FORMAT))) .withDocumentation("Base file format for the sync."); public static final ConfigProperty META_SYNC_PARTITION_FIELDS = ConfigProperty .key("hoodie.datasource.hive_sync.partition_fields") .defaultValue("") - .withInferFunction(cfg -> Option.ofNullable(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME))) + .withInferFunction(cfg -> Option.ofNullable(cfg.getString(PARTITION_FIELDS)) + .or(() -> Option.ofNullable(cfg.getString(PARTITIONPATH_FIELD_NAME)))) .withDocumentation("Field in the table to use for determining hive partition columns."); public static final ConfigProperty META_SYNC_PARTITION_EXTRACTOR_CLASS = ConfigProperty .key("hoodie.datasource.hive_sync.partition_extractor_class") .defaultValue("org.apache.hudi.hive.MultiPartKeysValueExtractor") .withInferFunction(cfg -> { - if (StringUtils.nonEmpty(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME))) { - int numOfPartFields = cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME).split(",").length; + Option partitionFieldsOpt = Option.ofNullable(cfg.getString(PARTITION_FIELDS)) + .or(() -> Option.ofNullable(cfg.getString(PARTITIONPATH_FIELD_NAME))); + if (!partitionFieldsOpt.isPresent()) { + return Option.empty(); + } + String partitionFields = partitionFieldsOpt.get(); + if (StringUtils.nonEmpty(partitionFields)) { + int numOfPartFields = partitionFields.split(",").length; if (numOfPartFields == 1 - && cfg.contains(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE) - && cfg.getString(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE).equals("true")) { + && cfg.contains(HIVE_STYLE_PARTITIONING_ENABLE) + && cfg.getString(HIVE_STYLE_PARTITIONING_ENABLE).equals("true")) { return Option.of("org.apache.hudi.hive.HiveStylePartitionValueExtractor"); } else { return Option.of("org.apache.hudi.hive.MultiPartKeysValueExtractor"); @@ -106,7 +116,7 @@ public class HoodieSyncConfig extends HoodieConfig { } }) .withDocumentation("Class which implements PartitionValueExtractor to extract the partition values, " - + "default 'SlashEncodedDayPartitionValueExtractor'."); + + "default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'."); public static final ConfigProperty META_SYNC_ASSUME_DATE_PARTITION = ConfigProperty .key("hoodie.datasource.hive_sync.assume_date_partitioning") @@ -117,7 +127,7 @@ public class HoodieSyncConfig extends HoodieConfig { public static final ConfigProperty META_SYNC_DECODE_PARTITION = ConfigProperty .key("hoodie.meta.sync.decode_partition") .defaultValue(false) - .withInferFunction(cfg -> Option.ofNullable(cfg.getBoolean(HoodieTableConfig.URL_ENCODE_PARTITIONING))) + .withInferFunction(cfg -> Option.ofNullable(cfg.getBoolean(URL_ENCODE_PARTITIONING))) .withDocumentation("If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false."); public static final ConfigProperty META_SYNC_USE_FILE_LISTING_FROM_METADATA = ConfigProperty diff --git a/hudi-sync/hudi-sync-common/src/test/java/org/apache/hudi/sync/common/TestHoodieSyncConfig.java b/hudi-sync/hudi-sync-common/src/test/java/org/apache/hudi/sync/common/TestHoodieSyncConfig.java index ddf07b836e5f..f8e4eff30a5a 100644 --- a/hudi-sync/hudi-sync-common/src/test/java/org/apache/hudi/sync/common/TestHoodieSyncConfig.java +++ b/hudi-sync/hudi-sync-common/src/test/java/org/apache/hudi/sync/common/TestHoodieSyncConfig.java @@ -76,35 +76,69 @@ void testInferBaseFileFormat() { @Test void testInferPartitionFields() { + Properties props0 = new Properties(); + HoodieSyncConfig config0 = new HoodieSyncConfig(props0, new Configuration()); + assertEquals("", config0.getStringOrDefault(META_SYNC_PARTITION_FIELDS), + String.format("should get default value due to absence of both %s and %s", + HoodieTableConfig.PARTITION_FIELDS.key(), KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); + Properties props1 = new Properties(); - props1.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar"); + props1.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo,bar,baz"); HoodieSyncConfig config1 = new HoodieSyncConfig(props1, new Configuration()); - assertEquals("foo,bar", config1.getStringOrDefault(META_SYNC_PARTITION_FIELDS)); + assertEquals("foo,bar,baz", config1.getStringOrDefault(META_SYNC_PARTITION_FIELDS), + String.format("should infer from %s", HoodieTableConfig.PARTITION_FIELDS.key())); + + Properties props2 = new Properties(); + props2.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar"); + HoodieSyncConfig config2 = new HoodieSyncConfig(props2, new Configuration()); + assertEquals("foo,bar", config2.getStringOrDefault(META_SYNC_PARTITION_FIELDS), + String.format("should infer from %s", KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); + + Properties props3 = new Properties(); + props3.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo,bar,baz"); + props3.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar"); + HoodieSyncConfig config3 = new HoodieSyncConfig(props3, new Configuration()); + assertEquals("foo,bar,baz", config3.getStringOrDefault(META_SYNC_PARTITION_FIELDS), + String.format("should infer from %s, which has higher precedence.", HoodieTableConfig.PARTITION_FIELDS.key())); + } @Test void testInferPartitonExtractorClass() { + Properties props0 = new Properties(); + HoodieSyncConfig config0 = new HoodieSyncConfig(props0, new Configuration()); + assertEquals("org.apache.hudi.hive.MultiPartKeysValueExtractor", + config0.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), + String.format("should get default value due to absence of both %s and %s", + HoodieTableConfig.PARTITION_FIELDS.key(), KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); + Properties props1 = new Properties(); - props1.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar"); + props1.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), ""); HoodieSyncConfig config1 = new HoodieSyncConfig(props1, new Configuration()); - assertEquals("org.apache.hudi.hive.MultiPartKeysValueExtractor", - config1.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)); + assertEquals("org.apache.hudi.hive.NonPartitionedExtractor", + config1.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), + String.format("should infer from %s", HoodieTableConfig.PARTITION_FIELDS.key())); Properties props2 = new Properties(); - props2.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo"); - props2.setProperty(KeyGeneratorOptions.HIVE_STYLE_PARTITIONING_ENABLE.key(), "true"); + props2.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar"); HoodieSyncConfig config2 = new HoodieSyncConfig(props2, new Configuration()); - assertEquals("org.apache.hudi.hive.HiveStylePartitionValueExtractor", - config2.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)); + assertEquals("org.apache.hudi.hive.MultiPartKeysValueExtractor", + config2.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), + String.format("should infer from %s", KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())); - HoodieSyncConfig config3 = new HoodieSyncConfig(new Properties(), new Configuration()); + Properties props3 = new Properties(); + props3.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), ""); + props3.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "foo,bar"); + HoodieSyncConfig config3 = new HoodieSyncConfig(props3, new Configuration()); assertEquals("org.apache.hudi.hive.NonPartitionedExtractor", - config3.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)); + config3.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS), + String.format("should infer from %s, which has higher precedence.", HoodieTableConfig.PARTITION_FIELDS.key())); Properties props4 = new Properties(); - props4.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), ""); + props4.setProperty(HoodieTableConfig.PARTITION_FIELDS.key(), "foo"); + props4.setProperty(HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE.key(), "true"); HoodieSyncConfig config4 = new HoodieSyncConfig(props4, new Configuration()); - assertEquals("org.apache.hudi.hive.NonPartitionedExtractor", + assertEquals("org.apache.hudi.hive.HiveStylePartitionValueExtractor", config4.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)); } From e37527b7c57836656d2255967d5781226c5b76c6 Mon Sep 17 00:00:00 2001 From: Raymond Xu <2701446+xushiyan@users.noreply.github.com> Date: Mon, 8 Aug 2022 19:50:37 -0500 Subject: [PATCH 2/2] fix UT --- .../replication/TestHiveSyncGlobalCommitTool.java | 3 +++ .../apache/hudi/sync/common/HoodieSyncConfig.java | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/replication/TestHiveSyncGlobalCommitTool.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/replication/TestHiveSyncGlobalCommitTool.java index 045760956f1b..b4058b3051a8 100644 --- a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/replication/TestHiveSyncGlobalCommitTool.java +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/replication/TestHiveSyncGlobalCommitTool.java @@ -19,6 +19,7 @@ package org.apache.hudi.hive.replication; +import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor; import org.apache.hudi.hive.testutils.HiveTestCluster; import org.apache.hadoop.fs.Path; @@ -41,6 +42,7 @@ import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_ASSUME_DATE_PARTITION; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_BASE_PATH; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_DATABASE_NAME; +import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_EXTRACTOR_CLASS; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_PARTITION_FIELDS; import static org.apache.hudi.sync.common.HoodieSyncConfig.META_SYNC_TABLE_NAME; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -75,6 +77,7 @@ private HiveSyncGlobalCommitParams getGlobalCommitConfig(String commitTime) thro params.loadedProps.setProperty(META_SYNC_ASSUME_DATE_PARTITION.key(), "true"); params.loadedProps.setProperty(HIVE_USE_PRE_APACHE_INPUT_FORMAT.key(), "false"); params.loadedProps.setProperty(META_SYNC_PARTITION_FIELDS.key(), "datestr"); + params.loadedProps.setProperty(META_SYNC_PARTITION_EXTRACTOR_CLASS.key(), SlashEncodedDayPartitionValueExtractor.class.getName()); return params; } diff --git a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java index 7283fa4f2ce2..43502f612f92 100644 --- a/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java +++ b/hudi-sync/hudi-sync-common/src/main/java/org/apache/hudi/sync/common/HoodieSyncConfig.java @@ -23,8 +23,10 @@ import org.apache.hudi.common.config.HoodieMetadataConfig; import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableConfig; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.sync.common.util.ConfigUtils; import com.beust.jcommander.Parameter; @@ -44,9 +46,7 @@ import static org.apache.hudi.common.table.HoodieTableConfig.HIVE_STYLE_PARTITIONING_ENABLE; import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_TABLE_NAME_KEY; import static org.apache.hudi.common.table.HoodieTableConfig.HOODIE_WRITE_TABLE_NAME_KEY; -import static org.apache.hudi.common.table.HoodieTableConfig.PARTITION_FIELDS; import static org.apache.hudi.common.table.HoodieTableConfig.URL_ENCODE_PARTITIONING; -import static org.apache.hudi.keygen.constant.KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME; /** * Configs needed to sync data into external meta stores, catalogs, etc. @@ -88,16 +88,16 @@ public class HoodieSyncConfig extends HoodieConfig { public static final ConfigProperty META_SYNC_PARTITION_FIELDS = ConfigProperty .key("hoodie.datasource.hive_sync.partition_fields") .defaultValue("") - .withInferFunction(cfg -> Option.ofNullable(cfg.getString(PARTITION_FIELDS)) - .or(() -> Option.ofNullable(cfg.getString(PARTITIONPATH_FIELD_NAME)))) + .withInferFunction(cfg -> Option.ofNullable(cfg.getString(HoodieTableConfig.PARTITION_FIELDS)) + .or(() -> Option.ofNullable(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME)))) .withDocumentation("Field in the table to use for determining hive partition columns."); public static final ConfigProperty META_SYNC_PARTITION_EXTRACTOR_CLASS = ConfigProperty .key("hoodie.datasource.hive_sync.partition_extractor_class") .defaultValue("org.apache.hudi.hive.MultiPartKeysValueExtractor") .withInferFunction(cfg -> { - Option partitionFieldsOpt = Option.ofNullable(cfg.getString(PARTITION_FIELDS)) - .or(() -> Option.ofNullable(cfg.getString(PARTITIONPATH_FIELD_NAME))); + Option partitionFieldsOpt = Option.ofNullable(cfg.getString(HoodieTableConfig.PARTITION_FIELDS)) + .or(() -> Option.ofNullable(cfg.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME))); if (!partitionFieldsOpt.isPresent()) { return Option.empty(); }