-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve XmlDictionaryWriter UTF8 encoding performance #73336
Changes from 25 commits
5d09005
63c760c
196ce48
65e7029
4d8078a
6e5aabb
70fa189
b34d259
5df5ae0
a790fbb
2b82ac8
301e531
5a21306
8a3de26
048cade
8297311
287e737
0d2a9bb
ab29682
251391f
a590739
46b6314
82f8880
d78aade
3b20be8
9c86b05
ccfb008
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,10 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
|
||
using System.Buffers.Binary; | ||
using System.IO; | ||
using System.Text; | ||
using System.Runtime.InteropServices; | ||
using System.Runtime.Serialization; | ||
using System.Threading.Tasks; | ||
|
||
|
@@ -334,34 +336,26 @@ protected unsafe void UnsafeWriteUnicodeChars(char* chars, int charCount) | |
} | ||
} | ||
|
||
protected unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset) | ||
protected static unsafe int UnsafeGetUnicodeChars(char* chars, int charCount, byte[] buffer, int offset) | ||
{ | ||
char* charsMax = chars + charCount; | ||
while (chars < charsMax) | ||
if (BitConverter.IsLittleEndian) | ||
{ | ||
char value = *chars++; | ||
buffer[offset++] = (byte)value; | ||
value >>= 8; | ||
buffer[offset++] = (byte)value; | ||
new ReadOnlySpan<char>(chars, charCount) | ||
.CopyTo(MemoryMarshal.Cast<byte, char>(buffer.AsSpan(offset))); | ||
} | ||
else | ||
{ | ||
BinaryPrimitives.ReverseEndianness(new ReadOnlySpan<short>(chars, charCount), | ||
MemoryMarshal.Cast<byte, short>(buffer.AsSpan(offset))); | ||
} | ||
|
||
return charCount * 2; | ||
} | ||
|
||
protected unsafe int UnsafeGetUTF8Length(char* chars, int charCount) | ||
{ | ||
char* charsMax = chars + charCount; | ||
while (chars < charsMax) | ||
{ | ||
if (*chars >= 0x80) | ||
break; | ||
|
||
chars++; | ||
} | ||
|
||
if (chars == charsMax) | ||
return charCount; | ||
|
||
return (int)(chars - (charsMax - charCount)) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, (int)(charsMax - chars)); | ||
// Length will always be at least ( 128 / maxBytesPerChar) = 42 | ||
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetByteCount(chars, charCount); | ||
Daniel-Svensson marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffer, int offset) | ||
|
@@ -370,39 +364,32 @@ protected unsafe int UnsafeGetUTF8Chars(char* chars, int charCount, byte[] buffe | |
{ | ||
fixed (byte* _bytes = &buffer[offset]) | ||
{ | ||
byte* bytes = _bytes; | ||
byte* bytesMax = &bytes[buffer.Length - offset]; | ||
char* charsMax = &chars[charCount]; | ||
|
||
while (true) | ||
// Fast path for small strings, use Encoding.GetBytes for larger strings since it is faster when vectorization is possible | ||
if ((uint)charCount < 25) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Calling into Encoding actually seems to be faster from 25 characters and up, but only when branch mispredictions are not a factor, so I increased the threshold to 32 to handle mispredictions without too much effect on performance. The "microbenchmarks" showed a >5% regression for the (text-based) DataContractSerializer when a long class name was mixed with many short strings and names and Encoding was called from 25 chars and up. (In the same case the binary serializer was 10% faster.) Now they show maybe a 1% regression and a 5% improvement, but other things might differ since there is no R2R or PGO for a local build. |
||
{ | ||
byte* bytes = _bytes; | ||
char* charsMax = &chars[charCount]; | ||
|
||
while (chars < charsMax) | ||
{ | ||
char t = *chars; | ||
if (t >= 0x80) | ||
break; | ||
goto NonAscii; | ||
|
||
*bytes = (byte)t; | ||
bytes++; | ||
chars++; | ||
} | ||
return charCount; | ||
|
||
if (chars >= charsMax) | ||
break; | ||
|
||
char* charsStart = chars; | ||
while (chars < charsMax && *chars >= 0x80) | ||
{ | ||
chars++; | ||
} | ||
|
||
bytes += (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(charsStart, (int)(chars - charsStart), bytes, (int)(bytesMax - bytes)); | ||
|
||
if (chars >= charsMax) | ||
break; | ||
NonAscii: | ||
byte* bytesMax = _bytes + buffer.Length - offset; | ||
return (int)(bytes - _bytes) + (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, (int)(charsMax - chars), bytes, (int)(bytesMax - bytes)); | ||
} | ||
else | ||
{ | ||
return (_encoding ?? DataContractSerializer.ValidatingUTF8).GetBytes(chars, charCount, _bytes, buffer.Length - offset); | ||
} | ||
|
||
return (int)(bytes - _bytes); | ||
} | ||
} | ||
return 0; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was originally a temporary part of moving the vectorized code to the Encoding class.
It does not seem to have any impact on DataContract serialization at the moment, so I can revert the changes if you want. From the code, it looks like improvements would mainly come from classes calling into XmlConverter, which uses this encoding directly.