From 0487d7857ecf818f86d1a3cb34506c0307707b9a Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 1 Jul 2024 10:53:11 +0800
Subject: [PATCH] [SPARK-48748][SQL] Cache numChars in UTF8String

### What changes were proposed in this pull request?
Cache `numChars` value in a thread-safe way.

### Why are the changes needed?
Faster access to `numChars()` method, which currently requires entire UTF8String scan every time.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #47142 from uros-db/cache-numchars.

Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Signed-off-by: Kent Yao
---
 .../org/apache/spark/unsafe/types/UTF8String.java | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 38b9b803acbe4..e4c9c6c8e8687 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -59,6 +59,7 @@ public final class UTF8String implements Comparable, Externalizable,
   private Object base;
   private long offset;
   private int numBytes;
+  private volatile int numChars = -1;
 
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
@@ -254,6 +255,16 @@ public int numBytes() {
    * Returns the number of code points in it.
    */
   public int numChars() {
+    int cached = numChars;  // single volatile read; -1 means "not yet computed"
+    return cached != -1 ? cached : (numChars = getNumChars());
+  }
+
+  /**
+   * Private helper method to calculate the number of code points in the UTF-8 string. Counting
+   * the code points is a linear time operation, as we need to scan the entire UTF-8 string.
+   * Hence, this method should generally only be called once for non-empty UTF-8 strings.
+   */
+  private int getNumChars() {
     int len = 0;
     for (int i = 0; i < numBytes; i += numBytesForFirstByte(getByte(i))) {
       len += 1;