Skip to content

Commit

Permalink
[SPARK-48947][SQL] Use lowercased charset name to decrease cache miss…
Browse files Browse the repository at this point in the history
…ing in Charset.forName

### What changes were proposed in this pull request?

Since charset names can be non-literal values, they might be case-inconsistent. In this pull request, we convert the charset name to lowercase before calling Charset.forName. This allows values like 'ISO-8859-1' and 'Iso-8859-1' to hit the two-level Charset cache. By using lowercase instead of uppercase charset names, we align with the way `Charset.forName` performs further lookup after a cache miss.

### Why are the changes needed?

Performance improvement.

- L1 lookup

```java
    private static Charset lookup(String charsetName) {
        if (charsetName == null)
            throw new IllegalArgumentException("Null charset name");
        Object[] a;
        if ((a = cache1) != null && charsetName.equals(a[0]))
            return (Charset)a[1];
        // We expect most programs to use one Charset repeatedly.
        // We convey a hint to this effect to the VM by putting the
        // level 1 cache miss code in a separate method.
        return lookup2(charsetName);
    }
```

- L2 lookup

```java
   private static Charset lookup2(String charsetName) {
        Object[] a;
        if ((a = cache2) != null && charsetName.equals(a[0])) {
            cache2 = cache1;
            cache1 = a;
            return (Charset)a[1];
        }
        Charset cs;
        if ((cs = standardProvider.charsetForName(charsetName)) != null ||
            (cs = lookupExtendedCharset(charsetName))           != null ||
            (cs = lookupViaProviders(charsetName))              != null)
        {
            cache(charsetName, cs);
            return cs;
        }

        /* Only need to check the name if we didn't find a charset for it */
        checkName(charsetName);
        return null;
    }
```
- After a cache miss

```java
    private Map<String,Charset> cache() {
        Map<String,Charset> map = cache;
        if (map == null) {
            map = new Cache();
            map.put("utf-8", UTF_8.INSTANCE);
            map.put("iso-8859-1", ISO_8859_1.INSTANCE);
            map.put("us-ascii", US_ASCII.INSTANCE);
            map.put("utf-16", java.nio.charset.StandardCharsets.UTF_16);
            map.put("utf-16be", java.nio.charset.StandardCharsets.UTF_16BE);
            map.put("utf-16le", java.nio.charset.StandardCharsets.UTF_16LE);
            cache = map;
        }
        return map;
    }
```

### Does this PR introduce _any_ user-facing change?

no
### How was this patch tested?

existing tests

### Was this patch authored or co-authored using generative AI tooling?

no

Closes apache#47420 from yaooqinn/SPARK-48947.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Kent Yao <yao@apache.org>
  • Loading branch information
yaooqinn authored and attilapiros committed Oct 4, 2024
1 parent 886f18a commit 7cf7e7f
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@
private[sql] object CharsetProvider {

final lazy val VALID_CHARSETS =
Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")
Set("us-ascii", "iso-8859-1", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32")

def forName(
charset: String,
legacyCharsets: Boolean,
caller: String = ""): Charset = {
if (legacyCharsets || VALID_CHARSETS.contains(charset.toUpperCase(Locale.ROOT))) {
val lowercasedCharset = charset.toLowerCase(Locale.ROOT)
if (legacyCharsets || VALID_CHARSETS.contains(lowercasedCharset)) {
try {
Charset.forName(charset)
Charset.forName(lowercasedCharset)
} catch {
case _: IllegalCharsetNameException |
_: UnsupportedCharsetException |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -864,7 +864,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -882,7 +882,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -900,7 +900,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1144,7 +1144,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1162,7 +1162,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1212,7 +1212,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1230,7 +1230,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -796,7 +796,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -814,7 +814,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -832,7 +832,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1076,7 +1076,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1094,7 +1094,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1144,7 +1144,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1162,7 +1162,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down

0 comments on commit 7cf7e7f

Please sign in to comment.