From e82b95de1fcf7e67470888e25d3082ec07f04371 Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Mon, 22 Jul 2024 14:52:36 -0700 Subject: [PATCH 1/4] fix: dictionary decimal vector optimization --- .../apache/comet/vector/CometDictionary.java | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/common/src/main/java/org/apache/comet/vector/CometDictionary.java b/common/src/main/java/org/apache/comet/vector/CometDictionary.java index 24a6d6d8c..97a033431 100644 --- a/common/src/main/java/org/apache/comet/vector/CometDictionary.java +++ b/common/src/main/java/org/apache/comet/vector/CometDictionary.java @@ -19,7 +19,9 @@ package org.apache.comet.vector; +import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.ValueVector; +import org.apache.spark.sql.types.Decimal; import org.apache.spark.unsafe.types.UTF8String; /** A dictionary which maps indices (integers) to values. */ @@ -100,17 +102,21 @@ public void close() { } private void initialize() { - switch (values.getValueVector().getMinorType()) { + ValueVector vector = values.getValueVector(); + switch (vector.getMinorType()) { case DECIMAL: - // We only need to copy values for decimal type as random access - // to the dictionary is not efficient for decimal (it needs to copy - // the value to a new byte array everytime). - binaries = new ByteArrayWrapper[numValues]; - for (int i = 0; i < numValues; i++) { - // Need copying here since we re-use byte array for decimal - byte[] bytes = new byte[DECIMAL_BYTE_WIDTH]; - bytes = values.copyBinaryDecimal(i, bytes); - binaries[i] = new ByteArrayWrapper(bytes); + if (values.useDecimal128 + || ((DecimalVector) vector).getPrecision() > Decimal.MAX_INT_DIGITS()) { + // We only need to copy values for decimal 128 type as random access + // to the dictionary is not efficient for decimal (it needs to copy + // the value to a new byte array everytime). + binaries = new ByteArrayWrapper[numValues]; + for (int i = 0; i < numValues; i++) { + // Need copying here since we re-use byte array for decimal + byte[] bytes = new byte[DECIMAL_BYTE_WIDTH]; + bytes = values.copyBinaryDecimal(i, bytes); + binaries[i] = new ByteArrayWrapper(bytes); + } } break; } From 031ae48dd8f256bb61b9976dd808cc99146dd858 Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Mon, 22 Jul 2024 17:27:23 -0700 Subject: [PATCH 2/4] address review comments --- .../src/main/java/org/apache/comet/vector/CometDictionary.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/main/java/org/apache/comet/vector/CometDictionary.java b/common/src/main/java/org/apache/comet/vector/CometDictionary.java index 97a033431..a1a3893a0 100644 --- a/common/src/main/java/org/apache/comet/vector/CometDictionary.java +++ b/common/src/main/java/org/apache/comet/vector/CometDictionary.java @@ -106,7 +106,7 @@ private void initialize() { switch (vector.getMinorType()) { case DECIMAL: if (values.useDecimal128 - || ((DecimalVector) vector).getPrecision() > Decimal.MAX_INT_DIGITS()) { + || ((DecimalVector) vector).getPrecision() > Decimal.MAX_LONG_DIGITS()) { // We only need to copy values for decimal 128 type as random access // to the dictionary is not efficient for decimal (it needs to copy // the value to a new byte array everytime). From f3240225dda7777ebcc17c174301b714efd7ec80 Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Tue, 23 Jul 2024 11:15:53 -0700 Subject: [PATCH 3/4] address review comments --- .../apache/comet/vector/CometDictionary.java | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/common/src/main/java/org/apache/comet/vector/CometDictionary.java b/common/src/main/java/org/apache/comet/vector/CometDictionary.java index a1a3893a0..88c9a5439 100644 --- a/common/src/main/java/org/apache/comet/vector/CometDictionary.java +++ b/common/src/main/java/org/apache/comet/vector/CometDictionary.java @@ -19,9 +19,7 @@ package org.apache.comet.vector; -import org.apache.arrow.vector.DecimalVector; import org.apache.arrow.vector.ValueVector; -import org.apache.spark.sql.types.Decimal; import org.apache.spark.unsafe.types.UTF8String; /** A dictionary which maps indices (integers) to values. */ @@ -37,7 +35,6 @@ public class CometDictionary implements AutoCloseable { public CometDictionary(CometPlainVector values) { this.values = values; this.numValues = values.numValues(); - initialize(); } public void setDictionaryVector(CometPlainVector values) { @@ -85,6 +82,18 @@ public byte[] decodeToBinary(int index) { case FIXEDSIZEBINARY: return values.getBinary(index); case DECIMAL: + if (binaries == null) { + // We only need to copy values for decimal 128 type as random access + // to the dictionary is not efficient for decimal (it needs to copy + // the value to a new byte array everytime). + binaries = new ByteArrayWrapper[numValues]; + for (int i = 0; i < numValues; i++) { + // Need copying here since we re-use byte array for decimal + byte[] bytes = new byte[DECIMAL_BYTE_WIDTH]; + bytes = values.copyBinaryDecimal(i, bytes); + binaries[i] = new ByteArrayWrapper(bytes); + } + } return binaries[index].bytes; default: throw new IllegalArgumentException( @@ -101,27 +110,6 @@ public void close() { values.close(); } - private void initialize() { - ValueVector vector = values.getValueVector(); - switch (vector.getMinorType()) { - case DECIMAL: - if (values.useDecimal128 - || ((DecimalVector) vector).getPrecision() > Decimal.MAX_LONG_DIGITS()) { - // We only need to copy values for decimal 128 type as random access - // to the dictionary is not efficient for decimal (it needs to copy - // the value to a new byte array everytime). - binaries = new ByteArrayWrapper[numValues]; - for (int i = 0; i < numValues; i++) { - // Need copying here since we re-use byte array for decimal - byte[] bytes = new byte[DECIMAL_BYTE_WIDTH]; - bytes = values.copyBinaryDecimal(i, bytes); - binaries[i] = new ByteArrayWrapper(bytes); - } - } - break; - } - } - private static class ByteArrayWrapper { private final byte[] bytes; From 6d85a7c2a3fd87ebc89cca835f284ea94f377a7f Mon Sep 17 00:00:00 2001 From: Kazuyuki Tanimura Date: Tue, 23 Jul 2024 16:27:50 -0700 Subject: [PATCH 4/4] address review comments --- .../src/main/java/org/apache/comet/vector/CometDictionary.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/src/main/java/org/apache/comet/vector/CometDictionary.java b/common/src/main/java/org/apache/comet/vector/CometDictionary.java index 88c9a5439..fd5c2410b 100644 --- a/common/src/main/java/org/apache/comet/vector/CometDictionary.java +++ b/common/src/main/java/org/apache/comet/vector/CometDictionary.java @@ -86,13 +86,14 @@ public byte[] decodeToBinary(int index) { // We only need to copy values for decimal 128 type as random access // to the dictionary is not efficient for decimal (it needs to copy // the value to a new byte array everytime). - binaries = new ByteArrayWrapper[numValues]; + ByteArrayWrapper[] binaries = new ByteArrayWrapper[numValues]; for (int i = 0; i < numValues; i++) { // Need copying here since we re-use byte array for decimal byte[] bytes = new byte[DECIMAL_BYTE_WIDTH]; bytes = values.copyBinaryDecimal(i, bytes); binaries[i] = new ByteArrayWrapper(bytes); } + this.binaries = binaries; } return binaries[index].bytes; default: