From 6f85f1f5477eb23ef87edb01dbcc422ca2f84f37 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:46:40 +0200
Subject: [PATCH 1/3] Initial commit

---
 .../org/apache/spark/unsafe/types/UTF8String.java | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 49d3088f8a2f0..4156287541252 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -23,6 +23,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Function;
 import java.util.Map;
 import java.util.regex.Pattern;
@@ -58,6 +59,7 @@ public final class UTF8String implements Comparable, Externalizable,
   private Object base;
   private long offset;
   private int numBytes;
+  private AtomicInteger numChars = new AtomicInteger(0);
 
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
@@ -253,6 +255,16 @@ public int numBytes() {
    * Returns the number of code points in it.
    */
   public int numChars() {
+    numChars.compareAndSet(0, getNumChars());
+    return numChars.get();
+  }
+
+  /**
+   * Private helper method to calculate the number of code points in the UTF-8 string. Counting
+   * the code points is a linear time operation, as we need to scan the entire UTF-8 string.
+   * Hence, this method should generally only be called once for non-empty UTF-8 strings.
+   */
+  private int getNumChars() {
     int len = 0;
     for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByte(i))) {
       len += 1;

From 9fd254b839c522873c6e383adc801e35489f3850 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:52:17 +0200
Subject: [PATCH 2/3] Use -1 initially

---
 .../main/java/org/apache/spark/unsafe/types/UTF8String.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 4156287541252..ec59a3248cb17 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -59,7 +59,7 @@ public final class UTF8String implements Comparable, Externalizable,
   private Object base;
   private long offset;
   private int numBytes;
-  private AtomicInteger numChars = new AtomicInteger(0);
+  private AtomicInteger numChars = new AtomicInteger(-1);
 
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
@@ -255,7 +255,7 @@ public int numBytes() {
    * Returns the number of code points in it.
    */
   public int numChars() {
-    numChars.compareAndSet(0, getNumChars());
+    numChars.compareAndSet(-1, getNumChars());
     return numChars.get();
   }
 

From 913b79a558a14e765f961e217fb4226954713142 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 28 Jun 2024 14:10:25 +0200
Subject: [PATCH 3/3] Use volatile

---
 .../java/org/apache/spark/unsafe/types/UTF8String.java | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index ec59a3248cb17..b580f5fb4b4f6 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -23,7 +23,6 @@
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Function;
 import java.util.Map;
 import java.util.regex.Pattern;
@@ -59,7 +58,7 @@ public final class UTF8String implements Comparable, Externalizable,
   private Object base;
   private long offset;
   private int numBytes;
-  private AtomicInteger numChars = new AtomicInteger(-1);
+  private volatile int numChars = -1;
 
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
@@ -255,8 +254,8 @@ public int numBytes() {
    * Returns the number of code points in it.
    */
   public int numChars() {
-    numChars.compareAndSet(-1, getNumChars());
-    return numChars.get();
+    if (numChars == -1) numChars = getNumChars();
+    return numChars;
   }
 
   /**