Fixing schemas used for bootstrap reader
Alexey Kudinkin committed Dec 14, 2022
1 parent 292630b commit ee8c9df
Showing 3 changed files with 24 additions and 4 deletions.
@@ -77,7 +77,6 @@ public static HoodieMergeHelper newInstance() {
public void runMerge(HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
HoodieMergeHandle<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> mergeHandle) throws IOException {
final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
- Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();

Configuration hadoopConf = new Configuration(table.getHadoopConf());
@@ -134,9 +133,16 @@ public void runMerge(HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<Hood
Path bootstrapFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath());
Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf());
bootstrapFileReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, bootstrapFilePath);
+ // NOTE: It's important for us to rely on the writer's schema here
+ // - Otherwise, the schema would be decoded from the Parquet file itself by taking its
+ // Parquet schema and converting it to Avro. That is problematic for schema validation
+ // of the records, since Avro schemas also validate the fully-qualified names of the
+ // structs, which cannot be reconstructed when converting from Parquet to Avro
+ // (because Parquet doesn't carry them)
+ Schema bootstrapSchema = externalSchemaTransformation ? bootstrapFileReader.getSchema() : mergeHandle.getWriterSchema();
readerIterator = new MergingIterator<>(
baseFileReader.getRecordIterator(readSchema),
- bootstrapFileReader.getRecordIterator(),
+ bootstrapFileReader.getRecordIterator(bootstrapSchema),
(inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetaFields()));
} else {
if (needToReWriteRecord) {
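The comment added above is the crux of this change: an Avro schema recovered by converting a Parquet file's own schema back to Avro loses the fully-qualified names of nested structs, so validating records against the writer's Avro schema fails on name mismatches. As a rough, self-contained illustration (not Hudi code; the record and field names below are made up), Avro's own compatibility check flags two structurally identical schemas as incompatible once the struct names stop lining up:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.SchemaCompatibility;

public class StructNameMismatchSketch {
  public static void main(String[] args) {
    // Writer's schema: the nested struct carries its fully-qualified name.
    Schema location = SchemaBuilder.record("Location").namespace("hoodie.example")
        .fields().requiredDouble("lat").requiredDouble("lon").endRecord();
    Schema writerSchema = SchemaBuilder.record("TripRecord").namespace("hoodie.example")
        .fields().requiredString("uuid").name("location").type(location).noDefault().endRecord();

    // Schema as it might look after a Parquet -> Avro conversion: the field layout is the
    // same, but the nested struct's name is synthesized from the field name and the
    // namespace is gone, because Parquet doesn't store Avro record names.
    Schema convertedLocation = SchemaBuilder.record("location")
        .fields().requiredDouble("lat").requiredDouble("lon").endRecord();
    Schema fileSchema = SchemaBuilder.record("TripRecord")
        .fields().requiredString("uuid").name("location").type(convertedLocation).noDefault().endRecord();

    // Avro treats the mismatching struct names as an incompatibility even though every
    // field matches, which is why validating records decoded with the converted schema
    // against the writer's schema breaks.
    SchemaCompatibility.SchemaPairCompatibility result =
        SchemaCompatibility.checkReaderWriterCompatibility(writerSchema, fileSchema);
    System.out.println(result.getType()); // INCOMPATIBLE
  }
}

Deriving bootstrapSchema from mergeHandle.getWriterSchema() sidesteps this by never relying on the name-less schema recovered from the bootstrap Parquet file.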
@@ -93,10 +93,17 @@ public void runMerge(HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List
if (baseFile.getBootstrapBaseFile().isPresent()) {
Path bootstrapFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath());
Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf());
bootstrapFileReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, bootstrapFilePath);
+ // NOTE: It's important for us to rely on the writer's schema here
+ // - Otherwise, the schema would be decoded from the Parquet file itself by taking its
+ // Parquet schema and converting it to Avro. That is problematic for schema validation
+ // of the records, since Avro schemas also validate the fully-qualified names of the
+ // structs, which cannot be reconstructed when converting from Parquet to Avro
+ // (because Parquet doesn't carry them)
+ Schema bootstrapSchema = externalSchemaTransformation ? bootstrapFileReader.getSchema() : mergeHandle.getWriterSchema();
readerIterator = new MergingIterator<>(
baseFileReader.getRecordIterator(readSchema),
- bootstrapFileReader.getRecordIterator(),
+ bootstrapFileReader.getRecordIterator(bootstrapSchema),
(inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetaFields()));
} else {
readerIterator = baseFileReader.getRecordIterator(readSchema);
@@ -93,9 +93,16 @@ public void runMerge(HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List
Path bootstrapFilePath = new Path(baseFile.getBootstrapBaseFile().get().getPath());
Configuration bootstrapFileConfig = new Configuration(table.getHadoopConf());
bootstrapFileReader = HoodieFileReaderFactory.getFileReader(bootstrapFileConfig, bootstrapFilePath);
+ // NOTE: It's important for us to rely on the writer's schema here
+ // - Otherwise, the schema would be decoded from the Parquet file itself by taking its
+ // Parquet schema and converting it to Avro. That is problematic for schema validation
+ // of the records, since Avro schemas also validate the fully-qualified names of the
+ // structs, which cannot be reconstructed when converting from Parquet to Avro
+ // (because Parquet doesn't carry them)
+ Schema bootstrapSchema = externalSchemaTransformation ? bootstrapFileReader.getSchema() : mergeHandle.getWriterSchema();
readerIterator = new MergingIterator<>(
baseFileReader.getRecordIterator(readSchema),
- bootstrapFileReader.getRecordIterator(),
+ bootstrapFileReader.getRecordIterator(bootstrapSchema),
(inputRecordPair) -> HoodieAvroUtils.stitchRecords(inputRecordPair.getLeft(), inputRecordPair.getRight(), mergeHandle.getWriterSchemaWithMetaFields()));
} else {
readerIterator = baseFileReader.getRecordIterator(readSchema);
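Passing bootstrapSchema into getRecordIterator(...) above means the bootstrap file is decoded against a caller-supplied Avro schema rather than one reverse-engineered from the Parquet file. A minimal sketch of that mechanism using plain parquet-avro follows; it is not Hudi's reader implementation, and the class and method names are illustrative only:

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

public class ForcedReadSchemaSketch {
  // Reads a Parquet file while forcing the supplied Avro schema onto the reader, so the
  // materialized records keep the caller's fully-qualified struct names instead of names
  // recovered from the Parquet schema.
  public static void readWithSchema(Path parquetFile, Schema readSchema) throws IOException {
    Configuration conf = new Configuration();
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    try (ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(parquetFile).withConf(conf).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        // Each record conforms to readSchema at this point.
        System.out.println(record);
      }
    }
  }
}

Here AvroReadSupport.setAvroReadSchema is the standard parquet-avro hook for imposing a read schema on the Avro record materialization.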
