From 3e9520aa0efe74b0866d1dcbeffd580e8163bd24 Mon Sep 17 00:00:00 2001 From: Sukera Date: Tue, 9 Apr 2024 09:08:34 +0200 Subject: [PATCH 1/3] Improve performance of `ncodeunits(::Char)` This improves performance of `ncodeunits(::Char)` by simply counting the number of non-zero bytes (except for `\0`, which is encoded as all zero bytes). For a performance comparison, see https://gist.github.com/Seelengrab/ebb02d4b8d754700c2869de8daf88cad. The version in this PR is called `nbytesencoded` in the benchmarks. Correctness has been verified with Supposition.jl, using the existing implementation as an oracle: ``` julia> using Supposition julia> const chars = Data.Characters() julia> @check max_examples=1_000_000 function bytesenc(c=chars) ncodeunits(c) == nbytesencoded(c) end; Test Summary: | Pass Total Time bytesenc | 1 1 1.2s julia> ncodeunits('\0') == nbytesencoded('\0') true ``` Notably, neither the existing nor the new implementation checks whether the given `Char` is valid or not, since the only thing that matters is how many bytes are written out. --- base/char.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index 08d661c41de56..693374822b7ee 100644 --- a/base/char.jl +++ b/base/char.jl @@ -62,7 +62,20 @@ to an output stream, or `ncodeunits(string(c))` but computed efficiently. This method requires at least Julia 1.1. In Julia 1.0 consider using `ncodeunits(string(c))`. 
""" -ncodeunits(c::Char) = write(devnull, c) # this is surprisingly efficient +function ncodeunits(c::Char) + # All Char are 4 byte wide, and since unicode encoding + # doesn't have null bytes (except for \0), we can just + # count non-zero bytes + char_data = reinterpret(UInt32, c) + mask = 0xff % UInt32 + nbytes = !iszero(char_data & mask) + Base.Cartesian.@nexprs 3 i -> begin + m <<= 0x8 + nbytes += !iszero(char_data & mask) + end + # We have to account for `\0`, which is encoded as all zeros + nbytes + iszero(uc) +end """ codepoint(c::AbstractChar) -> Integer From 19b29df65f9d42311bb0490226e85628d9abaa1f Mon Sep 17 00:00:00 2001 From: Sukera Date: Tue, 9 Apr 2024 13:17:46 +0200 Subject: [PATCH 2/3] Fix use with malformed `Char` The previous implementation assumed that all `Char` are well-formed, which is of course not guaranteed to be the case (and which is also correctly handled by the existing implementation). On top of that, this is even faster, since counting the number of trailing zeros has hardware support on a wide range of architectures. Implementation based on a suggestion by @jakobnissen. Thanks! --- base/char.jl | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/base/char.jl b/base/char.jl index 693374822b7ee..1733e21eff0fc 100644 --- a/base/char.jl +++ b/base/char.jl @@ -63,18 +63,13 @@ to an output stream, or `ncodeunits(string(c))` but computed efficiently. using `ncodeunits(string(c))`. 
""" function ncodeunits(c::Char) - # All Char are 4 byte wide, and since unicode encoding - # doesn't have null bytes (except for \0), we can just - # count non-zero bytes - char_data = reinterpret(UInt32, c) - mask = 0xff % UInt32 - nbytes = !iszero(char_data & mask) - Base.Cartesian.@nexprs 3 i -> begin - m <<= 0x8 - nbytes += !iszero(char_data & mask) - end - # We have to account for `\0`, which is encoded as all zeros - nbytes + iszero(uc) + u = reinterpret(UInt32, c) + + # We care about how many trailing bytes are all zero + n_nonzero_bytes = sizeof(UInt32) - div(trailing_zeros(u), 0x8) + + # Take care of '\0', which has an all-zero bitpattern + n_nonzero_bytes + iszero(u) end """ From 409e7e7b2f9d991cf072f8d69cfe4710bf5675d4 Mon Sep 17 00:00:00 2001 From: Sukera Date: Tue, 9 Apr 2024 16:17:23 +0200 Subject: [PATCH 3/3] Incorporate code review --- base/char.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/base/char.jl b/base/char.jl index 1733e21eff0fc..bc68a672ce0ca 100644 --- a/base/char.jl +++ b/base/char.jl @@ -64,10 +64,9 @@ to an output stream, or `ncodeunits(string(c))` but computed efficiently. """ function ncodeunits(c::Char) u = reinterpret(UInt32, c) - # We care about how many trailing bytes are all zero + # subtract that from the total number of bytes n_nonzero_bytes = sizeof(UInt32) - div(trailing_zeros(u), 0x8) - # Take care of '\0', which has an all-zero bitpattern n_nonzero_bytes + iszero(u) end