From 7fba7d878f023fd77651b84f4b0c911203ddef6b Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Tue, 21 May 2024 09:11:21 -0700 Subject: [PATCH] feat: Supports UUID column (#395) * fix uuid * address comments --------- Co-authored-by: Huaxin Gao (cherry picked from commit 7b0a7e07ad36b5f4e0b19bc8231deaa637ac03c7) --- .../org/apache/comet/parquet/ColumnReader.java | 14 +++++++++++--- .../apache/comet/vector/CometDecodedVector.java | 7 +++++++ .../apache/comet/vector/CometDictionaryVector.java | 9 +++++---- .../org/apache/comet/vector/CometPlainVector.java | 13 ++++++++----- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/common/src/main/java/org/apache/comet/parquet/ColumnReader.java b/common/src/main/java/org/apache/comet/parquet/ColumnReader.java index 7e45f4f9a2..46fd87f6b8 100644 --- a/common/src/main/java/org/apache/comet/parquet/ColumnReader.java +++ b/common/src/main/java/org/apache/comet/parquet/ColumnReader.java @@ -41,6 +41,7 @@ import org.apache.parquet.column.page.DataPageV2; import org.apache.parquet.column.page.DictionaryPage; import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.spark.sql.types.DataType; import org.apache.comet.CometConf; @@ -199,6 +200,11 @@ public CometDecodedVector loadVector() { currentVector.close(); } + LogicalTypeAnnotation logicalTypeAnnotation = + descriptor.getPrimitiveType().getLogicalTypeAnnotation(); + boolean isUuid = + logicalTypeAnnotation instanceof LogicalTypeAnnotation.UUIDLogicalTypeAnnotation; + long[] addresses = Native.currentBatch(nativeHandle); try (ArrowArray array = ArrowArray.wrap(addresses[0]); @@ -206,7 +212,7 @@ public CometDecodedVector loadVector() { FieldVector vector = Data.importVector(ALLOCATOR, array, schema, dictionaryProvider); DictionaryEncoding dictionaryEncoding = vector.getField().getDictionary(); - CometPlainVector cometVector = new CometPlainVector(vector, useDecimal128); + CometPlainVector cometVector = new CometPlainVector(vector, useDecimal128, isUuid); // Update whether the current vector contains any null values. This is used in the following // batch(s) to determine whether we can skip loading the native vector. @@ -229,12 +235,14 @@ public CometDecodedVector loadVector() { // initialized yet. Dictionary arrowDictionary = dictionaryProvider.lookup(dictionaryEncoding.getId()); CometPlainVector dictionaryVector = - new CometPlainVector(arrowDictionary.getVector(), useDecimal128); + new CometPlainVector(arrowDictionary.getVector(), useDecimal128, isUuid); dictionary = new CometDictionary(dictionaryVector); } currentVector = - new CometDictionaryVector(cometVector, dictionary, dictionaryProvider, useDecimal128); + new CometDictionaryVector( + cometVector, dictionary, dictionaryProvider, useDecimal128, false, isUuid); + return currentVector; } } diff --git a/common/src/main/java/org/apache/comet/vector/CometDecodedVector.java b/common/src/main/java/org/apache/comet/vector/CometDecodedVector.java index 5ebe6923a7..f699134f85 100644 --- a/common/src/main/java/org/apache/comet/vector/CometDecodedVector.java +++ b/common/src/main/java/org/apache/comet/vector/CometDecodedVector.java @@ -38,13 +38,20 @@ public abstract class CometDecodedVector extends CometVector { private int numValues; private int validityByteCacheIndex = -1; private byte validityByteCache; + protected boolean isUuid; protected CometDecodedVector(ValueVector vector, Field valueField, boolean useDecimal128) { + this(vector, valueField, useDecimal128, false); + } + + protected CometDecodedVector( + ValueVector vector, Field valueField, boolean useDecimal128, boolean isUuid) { super(Utils.fromArrowField(valueField), useDecimal128); this.valueVector = vector; this.numNulls = valueVector.getNullCount(); this.numValues = valueVector.getValueCount(); this.hasNull = numNulls != 0; + this.isUuid = isUuid; } @Override diff --git a/common/src/main/java/org/apache/comet/vector/CometDictionaryVector.java b/common/src/main/java/org/apache/comet/vector/CometDictionaryVector.java index 2cd9c5d18e..a74f4ff6b6 100644 --- a/common/src/main/java/org/apache/comet/vector/CometDictionaryVector.java +++ b/common/src/main/java/org/apache/comet/vector/CometDictionaryVector.java @@ -39,7 +39,7 @@ public CometDictionaryVector( CometDictionary values, DictionaryProvider provider, boolean useDecimal128) { - this(indices, values, provider, useDecimal128, false); + this(indices, values, provider, useDecimal128, false, false); } public CometDictionaryVector( @@ -47,8 +47,9 @@ public CometDictionaryVector( CometDictionary values, DictionaryProvider provider, boolean useDecimal128, - boolean isAlias) { - super(indices.valueVector, values.getValueVector().getField(), useDecimal128); + boolean isAlias, + boolean isUuid) { + super(indices.valueVector, values.getValueVector().getField(), useDecimal128, isUuid); Preconditions.checkArgument( indices.valueVector instanceof IntVector, "'indices' should be a IntVector"); this.values = values; @@ -130,6 +131,6 @@ public CometVector slice(int offset, int length) { // Set the alias flag to true so that the sliced vector will not close the dictionary vector. // Otherwise, if the dictionary is closed, the sliced vector will not be able to access the // dictionary. - return new CometDictionaryVector(sliced, values, provider, useDecimal128, true); + return new CometDictionaryVector(sliced, values, provider, useDecimal128, true, isUuid); } } diff --git a/common/src/main/java/org/apache/comet/vector/CometPlainVector.java b/common/src/main/java/org/apache/comet/vector/CometPlainVector.java index 55df3c2580..6d5ace1da8 100644 --- a/common/src/main/java/org/apache/comet/vector/CometPlainVector.java +++ b/common/src/main/java/org/apache/comet/vector/CometPlainVector.java @@ -39,7 +39,11 @@ public class CometPlainVector extends CometDecodedVector { private int booleanByteCacheIndex = -1; public CometPlainVector(ValueVector vector, boolean useDecimal128) { - super(vector, vector.getField(), useDecimal128); + this(vector, useDecimal128, false); + } + + public CometPlainVector(ValueVector vector, boolean useDecimal128, boolean isUuid) { + super(vector, vector.getField(), useDecimal128, isUuid); // NullType doesn't have data buffer. if (vector instanceof NullVector) { this.valueBufferAddress = -1; @@ -111,11 +115,10 @@ public UTF8String getUTF8String(int rowId) { byte[] result = new byte[length]; Platform.copyMemory( null, valueBufferAddress + offset, result, Platform.BYTE_ARRAY_OFFSET, length); - // FIXME Replace with Iceberg support when it is available - if (length == 16) { - return UTF8String.fromString(convertToUuid(result).toString()); - } else { + if (!isUuid) { return UTF8String.fromBytes(result); + } else { + return UTF8String.fromString(convertToUuid(result).toString()); } } }