Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve XmlDictionaryWriter UTF8 encoding performance #73336

Merged
merged 27 commits into from
Apr 4, 2023
Merged
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5d09005
Speed up text encoding
Daniel-Svensson Jul 7, 2022
63c760c
Update implementation
Daniel-Svensson Jul 18, 2022
196ce48
Add tests for binary xml strings
Daniel-Svensson Jul 26, 2022
65e7029
Merge branch 'binary_xml_text' of https://github.com/Daniel-Svensson/…
Daniel-Svensson Jul 26, 2022
4d8078a
limit counting code to 256 bit vectors
Daniel-Svensson Jul 26, 2022
6e5aabb
reword comment
Daniel-Svensson Aug 3, 2022
70fa189
rename test
Daniel-Svensson Aug 3, 2022
b34d259
move bytesmax
Daniel-Svensson Aug 3, 2022
5df5ae0
Fix bytesMax after moving variable initialization
Daniel-Svensson Aug 4, 2022
a790fbb
use unicode escape value in test
Daniel-Svensson Aug 4, 2022
2b82ac8
fix test typo "*" -> "+"
Daniel-Svensson Aug 4, 2022
301e531
Update src/libraries/System.Private.DataContractSerialization/src/Sys…
Daniel-Svensson Aug 12, 2022
5a21306
Remvoe vectorized code from UnsafeGetUTF8Length
Daniel-Svensson Aug 12, 2022
8a3de26
Merge remote-tracking branch 'upstream/main' into binary_xml_text
Daniel-Svensson Aug 13, 2022
048cade
Fix overfload
Daniel-Svensson Sep 8, 2022
8297311
Merge commit '080f708e7018f6c0529b6c875a44d84fc4d74419' into binary_x…
Daniel-Svensson Oct 24, 2022
287e737
use for loop which seems faster
Daniel-Svensson Oct 24, 2022
0d2a9bb
merge up to net8 preview1
Daniel-Svensson Mar 3, 2023
ab29682
remove vector loop
Daniel-Svensson Mar 6, 2023
251391f
make sealed encoding to allow devirtualisation
Daniel-Svensson Mar 11, 2023
a590739
back some changes
Daniel-Svensson Mar 20, 2023
46b6314
use uint for UnsafeGetUTF8Chars comparison
Daniel-Svensson Mar 25, 2023
82f8880
revert more changes
Daniel-Svensson Mar 26, 2023
d78aade
Fix cutoff based on new measurements
Daniel-Svensson Mar 26, 2023
3b20be8
use BinaryPrimitives.ReverseEndianness as suggested
Daniel-Svensson Mar 26, 2023
9c86b05
Update cutoff from 24 to 32 chars before calling, due to regression f…
Daniel-Svensson Mar 27, 2023
ccfb008
Remove sealed encoding since it only improves XmlConvert
Daniel-Svensson Apr 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,27 @@ public sealed class DataContractSerializer : XmlObjectSerializer
private static SerializationOption s_option = IsReflectionBackupAllowed() ? SerializationOption.ReflectionAsBackup : SerializationOption.CodeGenOnly;
private static bool s_optionAlreadySet;

internal static UTF8Encoding UTF8NoBom { get; } = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: false);
internal static UTF8Encoding ValidatingUTF8 { get; } = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
internal sealed class SealedUTF8Encoding : UTF8Encoding
{
public SealedUTF8Encoding(bool encoderShouldEmitUTF8Identifier, bool throwOnInvalidBytes)
: base(encoderShouldEmitUTF8Identifier, throwOnInvalidBytes)
{ }
}

internal sealed class SealedUnicodeEncoding : UnicodeEncoding
{
public SealedUnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
: base(bigEndian, byteOrderMark, throwOnInvalidBytes)
{ }
}

internal static SealedUTF8Encoding UTF8NoBom { get; } = new SealedUTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: false);
internal static SealedUTF8Encoding ValidatingUTF8 { get; } = new SealedUTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
Copy link
Contributor Author

@Daniel-Svensson Daniel-Svensson Mar 26, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was originally a temporary part of moving vector code to Encoding class.

It does not seem to make any impact to datacontract serialisation at the moment so I can revert the changes if you want that. From the code it looks like improvements would mainly be from classes calling into XmlConverter which uses this encoding directly


internal static UnicodeEncoding UTF16NoBom { get; } = new UnicodeEncoding(bigEndian: false, byteOrderMark: false, throwOnInvalidBytes: false);
internal static UnicodeEncoding BEUTF16NoBom { get; } = new UnicodeEncoding(bigEndian: true, byteOrderMark: false, throwOnInvalidBytes: false);
internal static UnicodeEncoding ValidatingUTF16 { get; } = new UnicodeEncoding(bigEndian: false, byteOrderMark: false, throwOnInvalidBytes: true);
internal static UnicodeEncoding ValidatingBEUTF16 { get; } = new UnicodeEncoding(bigEndian: true, byteOrderMark: false, throwOnInvalidBytes: true);
internal static SealedUnicodeEncoding UTF16NoBom { get; } = new SealedUnicodeEncoding(bigEndian: false, byteOrderMark: false, throwOnInvalidBytes: false);
internal static SealedUnicodeEncoding BEUTF16NoBom { get; } = new SealedUnicodeEncoding(bigEndian: true, byteOrderMark: false, throwOnInvalidBytes: false);
internal static SealedUnicodeEncoding ValidatingUTF16 { get; } = new SealedUnicodeEncoding(bigEndian: false, byteOrderMark: false, throwOnInvalidBytes: true);
internal static SealedUnicodeEncoding ValidatingBEUTF16 { get; } = new SealedUnicodeEncoding(bigEndian: true, byteOrderMark: false, throwOnInvalidBytes: true);

internal static Base64Encoding Base64Encoding { get; } = new Base64Encoding();
internal static BinHexEncoding BinHexEncoding { get; } = new BinHexEncoding();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using System.IO;
using System.Text;
using System.Runtime.InteropServices;
using System.Runtime.Serialization;
using System.Threading.Tasks;

Expand Down Expand Up @@ -336,32 +337,28 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount)

protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
if (BitConverter.IsLittleEndian)
{
char value = *chars++;
buffer[offset++] = (byte)value;
value >>= 8;
buffer[offset++] = (byte)value;
new ReadOnlySpan<char>(chars, charCount)
.CopyTo(MemoryMarshal.Cast<byte, char>(buffer.AsSpan(offset)));
}
else
{
char* charsMax = chars + charCount;
while (chars < charsMax)
{
char value = *chars++;
buffer[offset++] = (byte)value;
buffer[offset++] = (byte)(value >> 8);
}
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
}

return charCount * 2;
}

protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount)
{
char* charsMax = chars + charCount;
while (chars < charsMax)
{
if (*chars >= 0x80)
break;

chars++;
}

if (chars == charsMax)
return charCount;

return (int)(chars - (charsMax - charCount)) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, (int)(charsMax - chars));
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, charCount);
Daniel-Svensson marked this conversation as resolved.
Show resolved Hide resolved
}

protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset)
Expand All @@ -370,39 +367,32 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe
{
fixed (byte* _bytes = &buffer[offset])
{
byte* bytes = _bytes;
byte* bytesMax = &bytes[buffer.Length - offset];
char* charsMax = &chars[charCount];

while (true)
// Fast path for small strings, use Encoding.GetBytes for larger strings since it is faster when vectorization is possible
if ((uint)charCount < 25)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling into encoding actually seems to be faster from 25 characters up, but that is when we don't need to handle branch predictions so I increased it to 32 handle misspredictions without to much affect on performance.

The "microbenchmarks" showed >5% regression where a long class name was mixed with many short strings & names when calling encoding from 25 chars and upp for the (text based) DataContractSerializer. (In the same case the binary serializer was 10% faster). Now they are maybe? 1% regression and 5% improvement, but other things might be different since it is no r2r or pgo for local build)

{
byte* bytes = _bytes;
char* charsMax = &chars[charCount];

while (chars < charsMax)
{
char t = *chars;
if (t >= 0x80)
break;
goto NonAscii;

*bytes = (byte)t;
bytes++;
chars++;
}
return charCount;

if (chars >= charsMax)
break;

char* charsStart = chars;
while (chars < charsMax && *chars >= 0x80)
{
chars++;
}

bytes += (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes));

if (chars >= charsMax)
break;
NonAscii:
byte* bytesMax = _bytes + buffer.Length - offset;
return (int)(bytes - _bytes) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes));
}
else
{
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, charCount, _bytes, buffer.Length - offset);
}

return (int)(bytes - _bytes);
}
}
return 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,71 @@ void AssertBytesWritten(Action<XmlDictionaryWriter> action, XmlBinaryNodeType no
}
}

[Fact]
public static void XmlBaseWriter_WriteString()
{
const byte Chars8Text = 152;
const byte Chars16Text = 154;
MemoryStream ms = new MemoryStream();
XmlDictionaryWriter writer = (XmlDictionaryWriter)XmlDictionaryWriter.CreateBinaryWriter(ms);
writer.WriteStartElement("root");

int[] lengths = new[] { 7, 8, 9, 15, 16, 17, 31, 32, 36, 258 };
byte[] buffer = new byte[lengths.Max() + 1];

foreach (var length in lengths)
{
string allAscii = string.Create(length, null, (Span<char> chars, object _) =>
{
for (int i = 0; i < chars.Length; ++i)
chars[i] = (char)(i % 128);
});
string multiByteLast = string.Create(length, null, (Span<char> chars, object _) =>
{
for (int i = 0; i < chars.Length; ++i)
chars[i] = (char)(i % 128);
chars[^1] = '\u00E4'; // '�' - Latin Small Letter a with Diaeresis. Latin-1 Supplement.
});

int numBytes = Encoding.UTF8.GetBytes(allAscii, buffer);
Assert.True(numBytes == length, "Test setup wrong - allAscii");
ValidateWriteText(ms, writer, allAscii, expected: buffer.AsSpan(0, numBytes));

numBytes = Encoding.UTF8.GetBytes(multiByteLast, buffer);
Assert.True(numBytes == length + 1, "Test setup wrong - multiByte");
ValidateWriteText(ms, writer, multiByteLast, expected: buffer.AsSpan(0, numBytes));
}

static void ValidateWriteText(MemoryStream ms, XmlDictionaryWriter writer, string text, ReadOnlySpan<byte> expected)
{
writer.Flush();
ms.Seek(0, SeekOrigin.Begin);
ms.SetLength(0);
writer.WriteString(text);
writer.Flush();

ms.TryGetBuffer(out ArraySegment<byte> arraySegment);
ReadOnlySpan<byte> buffer = arraySegment;

if (expected.Length <= byte.MaxValue)
{
Assert.Equal(Chars8Text, buffer[0]);
Assert.Equal(expected.Length, buffer[1]);
buffer = buffer.Slice(2);
}
else if (expected.Length <= ushort.MaxValue)
{
Assert.Equal(Chars16Text, buffer[0]);
Assert.Equal(expected.Length, (int)(buffer[1]) | ((int)buffer[2] << 8));
buffer = buffer.Slice(3);
}
else
Assert.Fail("test use to long length");

AssertExtensions.SequenceEqual(expected, buffer);
}
}

private static bool ReadTest(MemoryStream ms, Encoding encoding, ReaderWriterFactory.ReaderWriterType rwType, byte[] byteArray)
{
ms.Position = 0;
Expand Down