From 9c8fde0c7a8bacaba28c9e0a953aba0a83b25fdd Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 31 May 2024 09:39:23 +0800 Subject: [PATCH] PARQUET-2468: ParquetMetadata must convert to json (#1349) (#1360) Co-authored-by: Michel Davit --- parquet-hadoop/pom.xml | 10 +++++ .../hadoop/metadata/ColumnChunkMetaData.java | 4 ++ .../metadata/ColumnChunkProperties.java | 2 + .../parquet/hadoop/metadata/FileMetaData.java | 2 + .../hadoop/metadata/ParquetMetadata.java | 35 +++++++++++----- .../TestParquetMetadataConverter.java | 41 +++++++++++++++++-- parquet-jackson/pom.xml | 18 ++++++++ pom.xml | 1 + 8 files changed, 100 insertions(+), 13 deletions(-) diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index 7d02ac54ad..465d7c95fd 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -118,11 +118,21 @@ jackson-core ${jackson.version} + + ${jackson.groupId} + jackson-annotations + ${jackson.version} + ${jackson.groupId} jackson-databind ${jackson-databind.version} + + ${jackson.datatype.groupId} + jackson-datatype-jdk8 + ${jackson-modules-java8.version} + org.xerial.snappy snappy-java diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 3dac15ba7c..14a949b0e0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -22,6 +22,7 @@ import static org.apache.parquet.column.Encoding.RLE_DICTIONARY; import static org.apache.parquet.format.Util.readColumnMetaData; +import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Set; @@ -338,6 +339,7 @@ public ColumnPath getPath() { * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
*/ @Deprecated + @JsonIgnore public PrimitiveTypeName getType() { decryptIfNeeded(); return properties.getType(); @@ -380,6 +382,7 @@ public PrimitiveType getPrimitiveType() { /** * @return the stats for this column */ + @JsonIgnore public abstract Statistics getStatistics(); /** @@ -387,6 +390,7 @@ public PrimitiveType getPrimitiveType() { * * @return the size stats for this column */ + @JsonIgnore public SizeStatistics getSizeStatistics() { throw new UnsupportedOperationException("SizeStatistics is not implemented"); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java index 3b0a33b144..026e37a1c0 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java @@ -18,6 +18,7 @@ */ package org.apache.parquet.hadoop.metadata; +import com.fasterxml.jackson.annotation.JsonIgnore; import java.util.Arrays; import java.util.Set; import org.apache.parquet.column.Encoding; @@ -76,6 +77,7 @@ public ColumnPath getPath() { * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
*/ @Deprecated + @JsonIgnore public PrimitiveTypeName getType() { return type.getPrimitiveTypeName(); } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/FileMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/FileMetaData.java index c608cd405f..4143dd805a 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/FileMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/FileMetaData.java @@ -20,6 +20,7 @@ import static java.util.Collections.unmodifiableMap; +import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; import java.util.Map; import java.util.Objects; @@ -109,6 +110,7 @@ public String getCreatedBy() { return createdBy; } + @JsonIgnore public InternalFileDecryptor getFileDecryptor() { return fileDecryptor; } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java index e30e872a6a..640ecfba1b 100755 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ParquetMetadata.java @@ -19,6 +19,9 @@ package org.apache.parquet.hadoop.metadata; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; @@ -32,6 +35,14 @@ public class ParquetMetadata { private static final ObjectMapper objectMapper = new ObjectMapper(); + static { + // Disable FAIL_ON_EMPTY_BEANS on the ObjectMapper. With this feature left enabled, parquet-cascading tests fail, + // because LogicalTypeAnnotation implementations are classes without any property. 
+ objectMapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS); + // Add support for Java 8 Optional + objectMapper.registerModule(new Jdk8Module()); + } + /** * @param parquetMetaData an instance of parquet metadata to convert * @return the json representation @@ -50,19 +61,23 @@ public static String toPrettyJSON(ParquetMetadata parquetMetaData) { private static String toJSON(ParquetMetadata parquetMetaData, boolean isPrettyPrint) { try (StringWriter stringWriter = new StringWriter()) { + Object objectToPrint; + if (parquetMetaData.getFileMetaData() == null + || parquetMetaData.getFileMetaData().getEncryptionType() + == FileMetaData.EncryptionType.UNENCRYPTED) { + objectToPrint = parquetMetaData; + } else { + objectToPrint = parquetMetaData.getFileMetaData(); + } + + ObjectWriter writer; if (isPrettyPrint) { - Object objectToPrint; - if (parquetMetaData.getFileMetaData() == null - || parquetMetaData.getFileMetaData().getEncryptionType() - == FileMetaData.EncryptionType.UNENCRYPTED) { - objectToPrint = parquetMetaData; - } else { - objectToPrint = parquetMetaData.getFileMetaData(); - } - objectMapper.writerWithDefaultPrettyPrinter().writeValue(stringWriter, objectToPrint); + writer = objectMapper.writerWithDefaultPrettyPrinter(); } else { - objectMapper.writeValue(stringWriter, parquetMetaData); + writer = objectMapper.writer(); } + + writer.writeValue(stringWriter, objectToPrint); return stringWriter.toString(); } catch (IOException e) { throw new RuntimeException(e); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 4dcede624f..2cffb51860 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -87,6 +87,10 @@ import 
org.apache.parquet.column.statistics.LongStatistics; import org.apache.parquet.column.statistics.SizeStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.crypto.DecryptionPropertiesFactory; +import org.apache.parquet.crypto.EncryptionPropertiesFactory; +import org.apache.parquet.crypto.FileDecryptionProperties; +import org.apache.parquet.crypto.InternalFileDecryptor; import org.apache.parquet.example.Paper; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroup; @@ -635,11 +639,16 @@ public void randomTestFilterMetaData() { } @Test - public void testNullFieldMetadataDebugLogging() { + public void testFieldMetadataDebugLogging() { MessageType schema = parseMessageType("message test { optional binary some_null_field; }"); org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = - new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap(), null); - List blockMetaDataList = new ArrayList(); + new org.apache.parquet.hadoop.metadata.FileMetaData( + schema, + new HashMap<>(), + null, + org.apache.parquet.hadoop.metadata.FileMetaData.EncryptionType.UNENCRYPTED, + null); + List blockMetaDataList = new ArrayList<>(); BlockMetaData blockMetaData = new BlockMetaData(); blockMetaData.addColumn(createColumnChunkMetaData()); blockMetaDataList.add(blockMetaData); @@ -647,6 +656,32 @@ public void testNullFieldMetadataDebugLogging() { ParquetMetadata.toJSON(metadata); } + @Test + public void testEncryptedFieldMetadataDebugLogging() { + Configuration conf = new Configuration(); + conf.set( + EncryptionPropertiesFactory.CRYPTO_FACTORY_CLASS_PROPERTY_NAME, + "org.apache.parquet.crypto.SampleDecryptionPropertiesFactory"); + DecryptionPropertiesFactory decryptionPropertiesFactory = DecryptionPropertiesFactory.loadFactory(conf); + FileDecryptionProperties decryptionProperties = + decryptionPropertiesFactory.getFileDecryptionProperties(conf, null); + + MessageType schema = 
parseMessageType("message test { optional binary some_null_field; }"); + + org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = + new org.apache.parquet.hadoop.metadata.FileMetaData( + schema, + new HashMap<>(), + null, + org.apache.parquet.hadoop.metadata.FileMetaData.EncryptionType.ENCRYPTED_FOOTER, + new InternalFileDecryptor(decryptionProperties)); + + List blockMetaDataList = new ArrayList<>(); + ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList); + ParquetMetadata.toJSON(metadata); + System.out.println(ParquetMetadata.toPrettyJSON(metadata)); + } + @Test public void testMetadataToJson() { ParquetMetadata metadata = new ParquetMetadata(null, null); diff --git a/parquet-jackson/pom.xml b/parquet-jackson/pom.xml index 6bd860fd8f..22453aae1c 100644 --- a/parquet-jackson/pom.xml +++ b/parquet-jackson/pom.xml @@ -38,11 +38,22 @@ jackson-core ${jackson.version} + + ${jackson.groupId} + jackson-annotations + ${jackson.version} + com.fasterxml.jackson.core jackson-databind ${jackson-databind.version} + + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + ${jackson-modules-java8.version} + @@ -70,6 +81,7 @@ ${jackson.groupId}:* + ${jackson.datatype.groupId}:* @@ -79,6 +91,12 @@ ** + + ${jackson.datatype.groupId}:* + + ** + + diff --git a/pom.xml b/pom.xml index 59ad34d94c..73d89ebb56 100644 --- a/pom.xml +++ b/pom.xml @@ -68,6 +68,7 @@ com.fasterxml.jackson 2.17.0 2.17.0 + 2.17.0 0.21.0 1.3.2 2.30.0