Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Encoding.UTF8.GetMaxByte/CharCount perf #69910

Merged
merged 4 commits into from
Jun 3, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/libraries/Common/src/System/Text/ValueUtf8Converter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public ValueUtf8Converter(Span<byte> initialBuffer)

public Span<byte> ConvertAndTerminateString(ReadOnlySpan<char> value)
{
int maxSize = Encoding.UTF8.GetMaxByteCount(value.Length) + 1;
int maxSize = checked(Encoding.UTF8.GetMaxByteCount(value.Length) + 1);
if (_bytes.Length < maxSize)
{
Dispose();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,7 @@ private static unsafe IntPtr StringToHGlobalUTF8(string? s)

int nb = Encoding.UTF8.GetMaxByteCount(s.Length);

IntPtr ptr = AllocHGlobal(nb + 1);
IntPtr ptr = AllocHGlobal(checked(nb + 1));

int nbWritten;
byte* pbMem = (byte*)ptr;
Expand Down Expand Up @@ -1040,7 +1040,7 @@ public static unsafe IntPtr StringToCoTaskMemUTF8(string? s)

int nb = Encoding.UTF8.GetMaxByteCount(s.Length);

IntPtr ptr = AllocCoTaskMem(nb + 1);
IntPtr ptr = AllocCoTaskMem(checked(nb + 1));

int nbWritten;
byte* pbMem = (byte*)ptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,50 @@ private unsafe byte[] GetBytesForSmallInput(string s)
return new Span<byte>(ref *pDestination, bytesWritten).ToArray(); // this overload of Span ctor doesn't validate length
}

public override int GetMaxByteCount(int charCount)
{
// This is a specialization of UTF8Encoding.GetMaxByteCount
// with the assumption that the default replacement fallback
// emits 3 fallback bytes ([ EF BF BD ] = '\uFFFD') per
// malformed input char in the worst case.

if ((uint)charCount > (int.MaxValue / MaxUtf8BytesPerChar) - 1)
{
// Move the throw out of the hot path to allow for inlining.
ThrowArgumentException(charCount);
static void ThrowArgumentException(int charCount)
{
throw new ArgumentOutOfRangeException(
paramName: nameof(charCount),
message: (charCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetByteCountOverflow);
}
}

return (charCount * MaxUtf8BytesPerChar) + MaxUtf8BytesPerChar;
}

public override int GetMaxCharCount(int byteCount)
{
// This is a specialization of UTF8Encoding.GetMaxCharCount
// with the assumption that the default replacement fallback
// emits one fallback char ('\uFFFD') per malformed input
// byte in the worst case.

if ((uint)byteCount > int.MaxValue - 1)
{
// Move the throw out of the hot path to allow for inlining.
ThrowArgumentException(byteCount);
static void ThrowArgumentException(int byteCount)
{
throw new ArgumentOutOfRangeException(
paramName: nameof(byteCount),
message: (byteCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetCharCountOverflow);
}
}

return byteCount + 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • 1

could you remind me why we are adding 1 here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some callers expect GetMaxCharCount(byteCount) to be the maximum number of characters that might be converted from any call to Encoding.GetDecoder().GetChars(buffer_of_byteCount_length, some_output_buffer). StreamReader makes this assumption, for instance.

DecoderNLS can hold partial state between calls to GetChars if a non-ASCII byte is seen. There are two possible outcomes here:

  1. The internal state is never completed and represents the maximum invalid subsequence of a UTF-8 buffer. The Encoding instance will replace the entire captured state with a single '\uFFFD' character before processing the rest of the input buffer.

  2. The internal state is 3 bytes of a 4-byte sequence, and the first byte of the incoming buffer would complete the sequence. This means the output would contain 2 characters: the high & low surrogates.

In both scenarios, the worst-case expansion is that the internally captured state results in +1 additional character needed in the output.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need more comment here explaining that?

AaronRobinsonMSFT marked this conversation as resolved.
Show resolved Hide resolved
}

public override string GetString(byte[] bytes)
{
// This method is short and can be inlined, meaning that the null check below
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,20 @@ public override int GetMaxByteCount(int charCount)
throw new ArgumentOutOfRangeException(nameof(charCount),
SR.ArgumentOutOfRange_NeedNonNegNum);

// Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
long byteCount = (long)charCount + 1;
// GetMaxByteCount assumes that the caller might have a stateful Encoder instance. If the
// Encoder instance already has a captured high surrogate, then one of two things will
// happen:
//
// - The next char is a low surrogate, at which point the two chars together result in 4
// UTF-8 bytes in the output; or
// - The next char is not a low surrogate (or the input reaches EOF), at which point the
// standalone captured surrogate will go through the fallback routine.
//
// The second case is the worst-case scenario for expansion, so it's what we use for any
// pessimistic "max byte count" calculation: assume there's a captured surrogate and that
// it must fall back.

long byteCount = (long)charCount + 1; // +1 to account for captured surrogate, per above

if (EncoderFallback.MaxCharCount > 1)
byteCount *= EncoderFallback.MaxCharCount;
Expand All @@ -781,9 +793,24 @@ public override int GetMaxCharCount(int byteCount)
if (byteCount < 0)
throw new ArgumentOutOfRangeException(nameof(byteCount),
SR.ArgumentOutOfRange_NeedNonNegNum);

// Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
long charCount = ((long)byteCount + 1);

// GetMaxCharCount assumes that the caller might have a stateful Decoder instance. If the
// Decoder instance already has a captured partial UTF-8 subsequence, then one of two
// thngs will happen:
//
// - The next byte(s) won't complete the subsequence but will instead be consumed into
// the Decoder's internal state, resulting in no character output; or
// - The next byte(s) will complete the subsequence, and the previously captured
// subsequence and the next byte(s) will result in 1 - 2 chars output; or
// - The captured subsequence will be treated as a singular ill-formed subsequence, at
// which point the captured subsequence will go through the fallback routine.
// (See The Unicode Standard, Sec. 3.9 for more information on this.)
//
// The third case is the worst-case scenario for expansion, since it means 0 bytes of
// new input could cause any existing captured state to expand via fallback. So it's
// what we'll use for any pessimistic "max char count" calculation.

long charCount = ((long)byteCount + 1); // +1 to account for captured subsequence, as above

// Non-shortest form would fall back, so get max count from fallback.
// So would 11... followed by 11..., so you could fall back every byte
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,24 @@ public class UTF8EncodingGetMaxByteCount
public void GetMaxByteCount(int charCount)
{
int expected = (charCount + 1) * 3;
Assert.Equal(expected, Encoding.UTF8.GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxByteCount(charCount));
}

[Theory]
[InlineData(-1)]
[InlineData(int.MinValue)]
[InlineData(-1_000_000_000)]
[InlineData(-1_300_000_000)] // yields positive result when *3
[InlineData(int.MaxValue / 3)]
[InlineData(int.MaxValue)]
public void GetMaxByteCount_NegativeTests(int charCount)
{
Assert.Throws<ArgumentOutOfRangeException>(nameof(charCount), () => Encoding.UTF8.GetMaxByteCount(charCount));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As for any Assert.Throws for an ArgumentException, you may choose to prefer the AssertExtensions version that validates the param name as well.

Assert.Throws<ArgumentOutOfRangeException>(nameof(charCount), () => new UTF8Encoding().GetMaxByteCount(charCount));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,22 @@ public class UTF8EncodingGetMaxCharCount
public void GetMaxCharCount(int byteCount)
{
int expected = byteCount + 1;
Assert.Equal(expected, Encoding.UTF8.GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxCharCount(byteCount));
}

[Theory]
[InlineData(-1)]
[InlineData(int.MinValue)]
[InlineData(-1_000_000_000)]
[InlineData(int.MaxValue)]
public void GetMaxCharCount_NegativeTests(int byteCount)
{
Assert.Throws<ArgumentOutOfRangeException>(nameof(byteCount), () => Encoding.UTF8.GetMaxCharCount(byteCount));
Assert.Throws<ArgumentOutOfRangeException>(nameof(byteCount), () => new UTF8Encoding().GetMaxCharCount(byteCount));
}
}
}