Skip to content

Commit

Permalink
[SPARK-48947][SQL] Use lowercased charset name to decrease cache miss…
Browse files Browse the repository at this point in the history
…ing in Charset.forName

### What changes were proposed in this pull request?

Since charset names can be non-literal values, they might be case-inconsistent. In this pull request, we convert the charset name to lowercase before calling Charset.forName. This allows values like 'ISO-8859-1' and 'Iso-8859-1' to hit the two-level Charset cache. By using lowercase instead of uppercase charset names, we align with the way `Charset.forName` performs further lookup after a cache miss.

### Why are the changes needed?

Performance improvement.

- L1 lookup

```java
    private static Charset lookup(String charsetName) {
        if (charsetName == null)
            throw new IllegalArgumentException("Null charset name");
        Object[] a;
        if ((a = cache1) != null && charsetName.equals(a[0]))
            return (Charset)a[1];
        // We expect most programs to use one Charset repeatedly.
        // We convey a hint to this effect to the VM by putting the
        // level 1 cache miss code in a separate method.
        return lookup2(charsetName);
    }
```

- L2 lookup

```java
   private static Charset lookup2(String charsetName) {
        Object[] a;
        if ((a = cache2) != null && charsetName.equals(a[0])) {
            cache2 = cache1;
            cache1 = a;
            return (Charset)a[1];
        }
        Charset cs;
        if ((cs = standardProvider.charsetForName(charsetName)) != null ||
            (cs = lookupExtendedCharset(charsetName))           != null ||
            (cs = lookupViaProviders(charsetName))              != null)
        {
            cache(charsetName, cs);
            return cs;
        }

        /* Only need to check the name if we didn't find a charset for it */
        checkName(charsetName);
        return null;
    }
```
- After a cache miss

```java
    private Map<String,Charset> cache() {
        Map<String,Charset> map = cache;
        if (map == null) {
            map = new Cache();
            map.put("utf-8", UTF_8.INSTANCE);
            map.put("iso-8859-1", ISO_8859_1.INSTANCE);
            map.put("us-ascii", US_ASCII.INSTANCE);
            map.put("utf-16", java.nio.charset.StandardCharsets.UTF_16);
            map.put("utf-16be", java.nio.charset.StandardCharsets.UTF_16BE);
            map.put("utf-16le", java.nio.charset.StandardCharsets.UTF_16LE);
            cache = map;
        }
        return map;
    }
```

### Does this PR introduce _any_ user-facing change?

no
### How was this patch tested?

existing tests

### Was this patch authored or co-authored using generative AI tooling?

no

Closes apache#47420 from yaooqinn/SPARK-48947.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Kent Yao <yao@apache.org>
  • Loading branch information
yaooqinn authored and attilapiros committed Oct 4, 2024
1 parent 886f18a commit 7cf7e7f
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@
private[sql] object CharsetProvider {

final lazy val VALID_CHARSETS =
Set("US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16", "UTF-32")
Set("us-ascii", "iso-8859-1", "utf-8", "utf-16be", "utf-16le", "utf-16", "utf-32")

def forName(
charset: String,
legacyCharsets: Boolean,
caller: String = ""): Charset = {
if (legacyCharsets || VALID_CHARSETS.contains(charset.toUpperCase(Locale.ROOT))) {
val lowercasedCharset = charset.toLowerCase(Locale.ROOT)
if (legacyCharsets || VALID_CHARSETS.contains(lowercasedCharset)) {
try {
Charset.forName(charset)
Charset.forName(lowercasedCharset)
} catch {
case _: IllegalCharsetNameException |
_: UnsupportedCharsetException |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -864,7 +864,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -882,7 +882,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -900,7 +900,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1144,7 +1144,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1162,7 +1162,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1212,7 +1212,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1230,7 +1230,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -796,7 +796,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -814,7 +814,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand All @@ -832,7 +832,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`encode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1076,7 +1076,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1094,7 +1094,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "Windows-xxx",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down Expand Up @@ -1144,7 +1144,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand All @@ -1162,7 +1162,7 @@ org.apache.spark.SparkIllegalArgumentException
"sqlState" : "22023",
"messageParameters" : {
"charset" : "WINDOWS-1252",
"charsets" : "UTF-16LE, UTF-8, UTF-32, UTF-16BE, UTF-16, US-ASCII, ISO-8859-1",
"charsets" : "utf-8, utf-16be, iso-8859-1, utf-16le, utf-16, utf-32, us-ascii",
"functionName" : "`decode`",
"parameter" : "`charset`"
}
Expand Down

0 comments on commit 7cf7e7f

Please sign in to comment.