Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor AdvSimd version of DecodeFromUTF8 #101620

Merged
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -864,86 +864,120 @@ private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes
// but make sure that we quit before seeing any == markers at the end of the
// string. 64 + 2 = 66 bytes.

Vector128<byte> decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
Vector128<byte> decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte();
Vector128<byte> decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte();
Vector128<byte> decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte();
Vector128<byte> decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte();
Vector128<byte> decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte();
Vector128<byte> decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte();
Vector128<byte> decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte();

Vector128<byte> decOne1;
Vector128<byte> decOne2;
Vector128<byte> decOne3;
Vector128<byte> decOne4;
Vector128<byte> decTwo1;
Vector128<byte> decTwo2;
Vector128<byte> decTwo3;
Vector128<byte> decTwo4;
Vector128<byte> str1;
Vector128<byte> str2;
Vector128<byte> str3;
Vector128<byte> str4;
Vector128<byte> res1;
Vector128<byte> res2;
Vector128<byte> res3;
// In the decoding process, we want to map each byte, representing a Base64 value, to its 6-bit (0-63) representation.
// It uses the following mapping. Values outside the following groups are invalid and, we abort decoding when encounter one.
//
// # From To Char
// 1 [43] [62] +
// 2 [47] [63] /
// 3 [48..57] [52..61] 0..9
// 4 [65..90] [0..25] A..Z
// 5 [97..122] [26..51] a..z
//
// To map an input value to its Base64 representation, we use look-up tables 'decLutOne' and 'decLutTwo'.
// 'decLutOne' helps to map groups 1, 2 and 3 while 'decLutTwo' maps groups 4 and 5 in the above list.
// After mapping, each value falls between 0-63. Consequently, the last six bits of each byte now hold a valid value.
// We then compress four such bytes (with valid 4 * 6 = 24 bits) to three UTF8 bytes (3 * 8 = 24 bits).
// For faster decoding, we use SIMD operations that allow the processing of multiple bytes together.
// However, the compress operation on adjacent values of a vector could be slower. Thus, we de-interleave while reading the input bytes that store adjacent
// bytes in separate vectors. This later simplifies the compress step with the help of logical operations.
// This requires interleaving while storing the decoded result.

// Values in 'decLutOne'
// 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
// 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
// 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 63
SwapnilGaikwad marked this conversation as resolved.
Show resolved Hide resolved
// 0, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
SwapnilGaikwad marked this conversation as resolved.
Show resolved Hide resolved
var decLutOne = (Vector128<byte>.AllBitsSet,
Vector128<byte>.AllBitsSet,
Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(),
SwapnilGaikwad marked this conversation as resolved.
Show resolved Hide resolved
Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte());

// Values in 'decLutTwo'
// 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255
SwapnilGaikwad marked this conversation as resolved.
Show resolved Hide resolved
// 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39
// 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255
// 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255
var decLutTwo = (Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(),
SwapnilGaikwad marked this conversation as resolved.
Show resolved Hide resolved
Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(),
Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(),
Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte());

byte* src = srcBytes;
byte* dest = destBytes;
Vector128<byte> offset = AdvSimd.DuplicateToVector128((byte)0x3F);
var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4);
var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4);
Vector128<byte> offset = Vector128.Create<byte>(63);

do
{
// Load 64 bytes and de-interleave.
// Step 1: Load 64 bytes and de-interleave.
AssertRead<Vector128<byte>>(src, srcStart, sourceLength);
(str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src);

// Get indices for second LUT:
decTwo1 = AdvSimd.SubtractSaturate(str1, offset);
decTwo2 = AdvSimd.SubtractSaturate(str2, offset);
decTwo3 = AdvSimd.SubtractSaturate(str3, offset);
decTwo4 = AdvSimd.SubtractSaturate(str4, offset);

// Get values from first LUT. Out-of-range indices are set to 0.
decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1);
decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2);
decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3);
decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4);

// Get values from second LUT. Out-of-range indices are unchanged.
var (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src);

// Step 2: Map each valid input to its Base64 value.
// We use two look-ups to compute partial results and combine them later.

// Step 2.1: Detect valid Base64 values from the first three groups. Maps input as,
// 0 to 63 (Invalid) => 255
// 0 to 63 (Valid) => Their Base64 equivalent
// 64 to 255 => 0

// Each input value acts as an index in the look-up table 'decLutOne'.
// e.g., for group 1: index 43 maps to 62 (Base64 '+').
// Group 4 and 5 values are out-of-range (>64), so they are mapped to zero.
// Other valid indices but invalid values are mapped to 255.
Vector128<byte> decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1);
Vector128<byte> decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2);
Vector128<byte> decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3);
Vector128<byte> decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4);

// Step 2.2: Detect valid Base64 values from groups 4 and 5. Maps input as,
// 0 to 63 => 0
// 64 to 122 (Valid) => Their Base64 equivalent
// 64 to 122 (Invalid) => 255
// 123 to 255 => Remains unchanged

// Subtract/offset each input value by 63 so that it can be used as a valid offset.
// Subtract saturate makes values from the first three groups set to zero that are
// then mapped to zero in the subsequent look-up.
Vector128<byte> decTwo1 = AdvSimd.SubtractSaturate(str1, offset);
Vector128<byte> decTwo2 = AdvSimd.SubtractSaturate(str2, offset);
Vector128<byte> decTwo3 = AdvSimd.SubtractSaturate(str3, offset);
Vector128<byte> decTwo4 = AdvSimd.SubtractSaturate(str4, offset);

// We use VTBX to map values where out-of-range indices are unchanged.
decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1);
decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2);
decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3);
decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4);

// Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'.
// Step 3: Combine the partial result.
// Each look-up above maps valid values to their Base64 equivalent or zero.
// Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values.
str1 = decOne1 | decTwo1;
str2 = decOne2 | decTwo2;
str3 = decOne3 | decTwo3;
str4 = decOne4 | decTwo4;
str1 = (decOne1 | decTwo1);
str2 = (decOne2 | decTwo2);
str3 = (decOne3 | decTwo3);
str4 = (decOne4 | decTwo4);

// Step 4: Detect an invalid input value.
// Invalid values < 122 are set to 255 while the ones above 122 are unchanged.
// Check for invalid input, any value larger than 63.
Vector128<byte> classified = AdvSimd.CompareGreaterThan(str1, offset)
| AdvSimd.CompareGreaterThan(str2, offset)
| AdvSimd.CompareGreaterThan(str3, offset)
| AdvSimd.CompareGreaterThan(str4, offset);
Vector128<byte> classified = (Vector128.GreaterThan(str1, offset)
| Vector128.GreaterThan(str2, offset)
| Vector128.GreaterThan(str3, offset)
| Vector128.GreaterThan(str4, offset));

// Check that all bits are zero.
if (classified != Vector128<byte>.Zero)
{
break;
}

// Compress four bytes into three.
res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4);
res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2);
res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4;
// Step 5: Compress four bytes into three.
Vector128<byte> res1 = ((str1 << 2) | (str2 >> 4));
Vector128<byte> res2 = ((str2 << 4) | (str3 >> 2));
Vector128<byte> res3 = ((str3 << 6) | str4);

// Interleave and store decoded result.
// Step 6: Interleave and store decoded results.
AssertWrite<Vector128<byte>>(dest, destStart, destLength);
AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3));

Expand Down
Loading