Skip to content

Commit

Permalink
Improve Encoding.UTF8.GetMaxByte/CharCount perf (#69910)
Browse files Browse the repository at this point in the history
- Also fixes some potential integer overflows in callers
- Also updates some code comments
  • Loading branch information
GrabYourPitchforks authored Jun 3, 2022
1 parent 24f5de4 commit be81ac6
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 7 deletions.
2 changes: 1 addition & 1 deletion src/libraries/Common/src/System/Text/ValueUtf8Converter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public ValueUtf8Converter(Span<byte> initialBuffer)

public Span<byte> ConvertAndTerminateString(ReadOnlySpan<char> value)
{
int maxSize = Encoding.UTF8.GetMaxByteCount(value.Length) + 1;
int maxSize = checked(Encoding.UTF8.GetMaxByteCount(value.Length) + 1);
if (_bytes.Length < maxSize)
{
Dispose();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -993,7 +993,7 @@ private static unsafe IntPtr StringToHGlobalUTF8(string? s)

int nb = Encoding.UTF8.GetMaxByteCount(s.Length);

IntPtr ptr = AllocHGlobal(nb + 1);
IntPtr ptr = AllocHGlobal(checked(nb + 1));

int nbWritten;
byte* pbMem = (byte*)ptr;
Expand Down Expand Up @@ -1040,7 +1040,7 @@ public static unsafe IntPtr StringToCoTaskMemUTF8(string? s)

int nb = Encoding.UTF8.GetMaxByteCount(s.Length);

IntPtr ptr = AllocCoTaskMem(nb + 1);
IntPtr ptr = AllocCoTaskMem(checked(nb + 1));

int nbWritten;
byte* pbMem = (byte*)ptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,50 @@ private unsafe byte[] GetBytesForSmallInput(string s)
return new Span<byte>(ref *pDestination, bytesWritten).ToArray(); // this overload of Span ctor doesn't validate length
}

public override int GetMaxByteCount(int charCount)
{
// This is a specialization of UTF8Encoding.GetMaxByteCount
// with the assumption that the default replacement fallback
// emits 3 fallback bytes ([ EF BF BD ] = '\uFFFD') per
// malformed input char in the worst case.

if ((uint)charCount > (int.MaxValue / MaxUtf8BytesPerChar) - 1)
{
// Move the throw out of the hot path to allow for inlining.
ThrowArgumentException(charCount);
static void ThrowArgumentException(int charCount)
{
throw new ArgumentOutOfRangeException(
paramName: nameof(charCount),
message: (charCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetByteCountOverflow);
}
}

return (charCount * MaxUtf8BytesPerChar) + MaxUtf8BytesPerChar;
}

public override int GetMaxCharCount(int byteCount)
{
// This is a specialization of UTF8Encoding.GetMaxCharCount
// with the assumption that the default replacement fallback
// emits one fallback char ('\uFFFD') per malformed input
// byte in the worst case.

if ((uint)byteCount > int.MaxValue - 1)
{
// Move the throw out of the hot path to allow for inlining.
ThrowArgumentException(byteCount);
static void ThrowArgumentException(int byteCount)
{
throw new ArgumentOutOfRangeException(
paramName: nameof(byteCount),
message: (byteCount < 0) ? SR.ArgumentOutOfRange_NeedNonNegNum : SR.ArgumentOutOfRange_GetCharCountOverflow);
}
}

return byteCount + 1;
}

public override string GetString(byte[] bytes)
{
// This method is short and can be inlined, meaning that the null check below
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,20 @@ public override int GetMaxByteCount(int charCount)
throw new ArgumentOutOfRangeException(nameof(charCount),
SR.ArgumentOutOfRange_NeedNonNegNum);

// Characters would be # of characters + 1 in case left over high surrogate is ? * max fallback
long byteCount = (long)charCount + 1;
// GetMaxByteCount assumes that the caller might have a stateful Encoder instance. If the
// Encoder instance already has a captured high surrogate, then one of two things will
// happen:
//
// - The next char is a low surrogate, at which point the two chars together result in 4
// UTF-8 bytes in the output; or
// - The next char is not a low surrogate (or the input reaches EOF), at which point the
// standalone captured surrogate will go through the fallback routine.
//
// The second case is the worst-case scenario for expansion, so it's what we use for any
// pessimistic "max byte count" calculation: assume there's a captured surrogate and that
// it must fall back.

long byteCount = (long)charCount + 1; // +1 to account for captured surrogate, per above

if (EncoderFallback.MaxCharCount > 1)
byteCount *= EncoderFallback.MaxCharCount;
Expand All @@ -782,8 +794,23 @@ public override int GetMaxCharCount(int byteCount)
throw new ArgumentOutOfRangeException(nameof(byteCount),
SR.ArgumentOutOfRange_NeedNonNegNum);

// Figure out our length, 1 char per input byte + 1 char if 1st byte is last byte of 4 byte surrogate pair
long charCount = ((long)byteCount + 1);
// GetMaxCharCount assumes that the caller might have a stateful Decoder instance. If the
// Decoder instance already has a captured partial UTF-8 subsequence, then one of two
// thngs will happen:
//
// - The next byte(s) won't complete the subsequence but will instead be consumed into
// the Decoder's internal state, resulting in no character output; or
// - The next byte(s) will complete the subsequence, and the previously captured
// subsequence and the next byte(s) will result in 1 - 2 chars output; or
// - The captured subsequence will be treated as a singular ill-formed subsequence, at
// which point the captured subsequence will go through the fallback routine.
// (See The Unicode Standard, Sec. 3.9 for more information on this.)
//
// The third case is the worst-case scenario for expansion, since it means 0 bytes of
// new input could cause any existing captured state to expand via fallback. So it's
// what we'll use for any pessimistic "max char count" calculation.

long charCount = ((long)byteCount + 1); // +1 to account for captured subsequence, as above

// Non-shortest form would fall back, so get max count from fallback.
// So would 11... followed by 11..., so you could fall back every byte
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,24 @@ public class UTF8EncodingGetMaxByteCount
public void GetMaxByteCount(int charCount)
{
int expected = (charCount + 1) * 3;
Assert.Equal(expected, Encoding.UTF8.GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxByteCount(charCount));
Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxByteCount(charCount));
}

[Theory]
[InlineData(-1)]
[InlineData(int.MinValue)]
[InlineData(-1_000_000_000)]
[InlineData(-1_300_000_000)] // yields positive result when *3
[InlineData(int.MaxValue / 3)]
[InlineData(int.MaxValue)]
public void GetMaxByteCount_NegativeTests(int charCount)
{
Assert.Throws<ArgumentOutOfRangeException>(nameof(charCount), () => Encoding.UTF8.GetMaxByteCount(charCount));
Assert.Throws<ArgumentOutOfRangeException>(nameof(charCount), () => new UTF8Encoding().GetMaxByteCount(charCount));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,22 @@ public class UTF8EncodingGetMaxCharCount
public void GetMaxCharCount(int byteCount)
{
int expected = byteCount + 1;
Assert.Equal(expected, Encoding.UTF8.GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(true, true).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(true, false).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(false, true).GetMaxCharCount(byteCount));
Assert.Equal(expected, new UTF8Encoding(false, false).GetMaxCharCount(byteCount));
}

[Theory]
[InlineData(-1)]
[InlineData(int.MinValue)]
[InlineData(-1_000_000_000)]
[InlineData(int.MaxValue)]
public void GetMaxCharCount_NegativeTests(int byteCount)
{
Assert.Throws<ArgumentOutOfRangeException>(nameof(byteCount), () => Encoding.UTF8.GetMaxCharCount(byteCount));
Assert.Throws<ArgumentOutOfRangeException>(nameof(byteCount), () => new UTF8Encoding().GetMaxCharCount(byteCount));
}
}
}

0 comments on commit be81ac6

Please sign in to comment.