Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Encoding.UTF8.GetMaxByte/CharCount perf #69910

Merged
merged 4 commits into from
Jun 3, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/libraries/Common/src/System/Text/ValueUtf8Converter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public ValueUtf8Converter(Span<byte> initialBuffer)

public Span<byte> ConvertAndTerminateString(ReadOnlySpan<char> value)
{
int maxSize = Encoding.UTF8.GetMaxByteCount(value.Length) + 1;
int maxSize = checked(Encoding.UTF8.GetMaxByteCount(value.Length) + 1);
if (_bytes.Length < maxSize)
{
Dispose();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,7 @@ private static unsafe IntPtr StringToHGlobalUTF8(string? s)

int nb = Encoding.UTF8.GetMaxByteCount(s.Length);

IntPtr ptr = AllocHGlobal(nb + 1);
IntPtr ptr = AllocHGlobal(checked(nb + 1));

int nbWritten;
byte* pbMem = (byte*)ptr;
Expand Down Expand Up @@ -1040,7 +1040,7 @@ public static unsafe IntPtr StringToCoTaskMemUTF8(string? s)

int nb = Encoding.UTF8.GetMaxByteCount(s.Length);

IntPtr ptr = AllocCoTaskMem(nb + 1);
IntPtr ptr = AllocCoTaskMem(checked(nb + 1));

int nbWritten;
byte* pbMem = (byte*)ptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,50 @@ private unsafe byte[] GetBytesForSmallInput(string s)
return new Span<byte>(ref *pDestination, bytesWritten).ToArray(); // this overload of Span ctor doesn't validate length
}

public override int GetMaxByteCount(int charCount)
{
// This is a specialization of UTF8Encoding.GetMaxByteCount
// with the assumption that the default replacement fallback
// emits 3 fallback bytes ([ EF BF BD ] = '\uFFFD') per
// malformed input char in the worst case.

if ((uint)charCount > (int.MaxValue / MaxUtf8BytesPerChar) - 1)
{
// Move the throw out of the hot path to allow for inlining.
ThrowArgumentException(charCount);
static void ThrowArgumentException(int charCount)
{
throw new ArgumentOutOfRangeException(
paramName: nameof(charCount),
message: (charCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetByteCountOverflow);
}
}

return (charCount * MaxUtf8BytesPerChar) + MaxUtf8BytesPerChar;
}

public override int GetMaxCharCount(int byteCount)
{
// This is a specialization of UTF8Encoding.GetMaxCharCount
// with the assumption that the default replacement fallback
// emits one fallback char ('\uFFFD') per malformed input
// byte in the worst case.

if ((uint)byteCount > int.MaxValue - 1)
{
// Move the throw out of the hot path to allow for inlining.
ThrowArgumentException(byteCount);
static void ThrowArgumentException(int byteCount)
{
throw new ArgumentOutOfRangeException(
paramName: nameof(byteCount),
message: (byteCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetCharCountOverflow);
}
}

return byteCount + 1;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • 1

could you remind me why we are adding 1 here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some callers expect GetMaxCharCount(byteCount) to be the maximum number of characters that might be converted from any call to Encoding.GetDecoder().GetChars(buffer_of_byteCount_length, some_output_buffer). StreamReader makes this assumption, for instance.

DecoderNLS can hold partial state between calls to GetChars if a non-ASCII byte is seen. There are two possible outcomes here:

  1. The internal state is never completed and represents the maximum invalid subsequence of a UTF-8 buffer. The Encoding instance will replace the entire captured state with a single '\uFFFD' character before processing the rest of the input buffer.

  2. The internal state is 3 bytes of a 4-byte sequence, and the first byte of the incoming buffer would complete the sequence. This means the output would contain 2 characters: the high & low surrogates.

In both scenarios, the worst-case expansion is that the internally captured state results in +1 additional character needed in the output.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need more comment here explaining that?

AaronRobinsonMSFT marked this conversation as resolved.
Show resolved Hide resolved
}

public override string GetString(byte[] bytes)
{
// This method is short and can be inlined, meaning that the null check below
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,24 @@ public class UTF8EncodingGetMaxByteCount
public void GetMaxByteCount(int charCount)
{
int expected = (charCount + 1) * 3;
Assert.Equal(expected, Encoding.UTF8.GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxByteCount(charCount));
}

[Theory]
[InlineData(-1)]
[InlineData(int.MinValue)]
[InlineData(-1_000_000_000)]
[InlineData(-1_300_000_000)] // yields positive result when *3
[InlineData(int.MaxValue / 3)]
[InlineData(int.MaxValue)]
public void GetMaxByteCount_NegativeTests(int charCount)
{
Assert.Throws<ArgumentOutOfRangeException>(nameof(charCount), () => Encoding.UTF8.GetMaxByteCount(charCount));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As for any Assert.Throws for an ArgumentException, you may choose to prefer the AssertExtensions version that validates the param name as well.

Assert.Throws<ArgumentOutOfRangeException>(nameof(charCount), () => new UTF8Encoding().GetMaxByteCount(charCount));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,22 @@ public class UTF8EncodingGetMaxCharCount
public void GetMaxCharCount(int byteCount)
{
int expected = byteCount + 1;
Assert.Equal(expected, Encoding.UTF8.GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxCharCount(byteCount));
}

[Theory]
[InlineData(-1)]
[InlineData(int.MinValue)]
[InlineData(-1_000_000_000)]
[InlineData(int.MaxValue)]
public void GetMaxCharCount_NegativeTests(int byteCount)
{
Assert.Throws<ArgumentOutOfRangeException>(nameof(byteCount), () => Encoding.UTF8.GetMaxCharCount(byteCount));
Assert.Throws<ArgumentOutOfRangeException>(nameof(byteCount), () => new UTF8Encoding().GetMaxCharCount(byteCount));
}
}
}