From 5a20826edc1e40805886643843df13672d6c09bb Mon Sep 17 00:00:00 2001 From: landerlyoung Date: Tue, 20 Oct 2020 15:07:57 +0800 Subject: [PATCH] Fix performance issue with UTF8ToString (#12517) introduce new functions: - UTF8ToStringNBytes - UTF16ToStringNBytes - UTF32ToStringNBytes add docs to preamble.js.rst Decode a string of exactly the given length; any '\0' in between will be kept as-is. These functions take a `lengthInBytes` argument, so there is no need to iterate over the heap to find a null terminator, which gives better performance. --- AUTHORS | 1 + .../source/docs/api_reference/preamble.js.rst | 31 +++ src/runtime_strings.js | 79 +++++++ src/runtime_strings_extra.js | 55 +++++ tests/utf32.cpp | 220 +++++++++++++----- tests/utf8.cpp | 33 +++ 6 files changed, 356 insertions(+), 63 deletions(-) diff --git a/AUTHORS b/AUTHORS index 32cf80ff0872..83f6a912b578 100644 --- a/AUTHORS +++ b/AUTHORS @@ -519,3 +519,4 @@ a license to everyone to use it as detailed in LICENSE.) * Pawel Czarnecki (copyright owned by 8th Wall, Inc.) * Dhairya Bahl < dhairyabahl5@gmail.com > * Sam Gao +* LanderlYoung diff --git a/site/source/docs/api_reference/preamble.js.rst b/site/source/docs/api_reference/preamble.js.rst index 8e58e053b7e1..2503a9be7448 100644 --- a/site/source/docs/api_reference/preamble.js.rst +++ b/site/source/docs/api_reference/preamble.js.rst @@ -180,6 +180,17 @@ Conversion functions — strings, pointers and arrays :param maxBytesToWrite: A limit on the number of bytes that this function can at most write out. If the string is longer than this, the output is truncated. The outputted string will always be null terminated, even if truncation occurred, as long as ``maxBytesToWrite > 0``. + +.. js:function:: UTF8ToStringNBytes(ptr, lengthInBytes) + + Given a pointer ``ptr`` to a UTF8-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object. + + :param ptr: A pointer to a UTF8-encoded string in the Emscripten HEAP. 
+ :param lengthInBytes: Specifies the number of bytes to read. The string at ``[ptr, ptr + lengthInBytes)`` will be decoded as UTF-8, and any ``\0`` in between will be decoded as-is. + :returns: A JavaScript ``String`` object + + + .. js:function:: UTF16ToString(ptr) Given a pointer ``ptr`` to a null-terminated UTF16LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object. @@ -202,6 +213,16 @@ Conversion functions — strings, pointers and arrays +.. js:function:: UTF16ToStringNBytes(ptr, lengthInBytes) + + Given a pointer ``ptr`` to a UTF16LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object. + + :param ptr: A pointer to a UTF16LE-encoded string in the Emscripten HEAP. + :param lengthInBytes: Specifies the number of bytes to read; must be a multiple of 2. The string at ``[ptr, ptr + lengthInBytes)`` will be decoded as UTF-16LE, and any ``\0`` in between will be decoded as-is. + :returns: A JavaScript ``String`` object + + + .. js:function:: UTF32ToString(ptr) Given a pointer ``ptr`` to a null-terminated UTF32LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object. @@ -223,6 +244,16 @@ Conversion functions — strings, pointers and arrays +.. js:function:: UTF32ToStringNBytes(ptr, lengthInBytes) + + Given a pointer ``ptr`` to a UTF32LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object. + + :param ptr: A pointer to a UTF32LE-encoded string in the Emscripten HEAP. + :param lengthInBytes: Specifies the number of bytes to read; must be a multiple of 4. The string at ``[ptr, ptr + lengthInBytes)`` will be decoded as UTF-32LE, and any ``\0`` in between will be decoded as-is. + :returns: A JavaScript ``String`` object + + + .. js:function:: AsciiToString(ptr) Converts an ASCII or Latin-1 encoded string to a JavaScript String object. 
diff --git a/src/runtime_strings.js b/src/runtime_strings.js index f378e89c92fd..c850491a7d92 100644 --- a/src/runtime_strings.js +++ b/src/runtime_strings.js @@ -118,6 +118,85 @@ function UTF8ToString(ptr, maxBytesToRead) { #endif } + +/** + * @param {number} idx Index into heap (the byte array being decoded) of the first byte to decode. + * @param {number} lengthInBytes Number of bytes to decode, i.e. the range [idx, idx + lengthInBytes); \0 bytes inside the range are decoded as-is. + * @return {string} The decoded JavaScript string. + */ +function UTF8ArrayToStringNBytes(heap, idx, lengthInBytes) { +#if CAN_ADDRESS_2GB + idx >>>= 0; +#endif + var endPtr = idx + lengthInBytes; + +#if TEXTDECODER == 2 + return UTF8Decoder.decode( + heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr)) + ); +#else // TEXTDECODER == 2 +#if TEXTDECODER + if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) { + return UTF8Decoder.decode(heap.subarray(idx, endPtr)); + } else { +#endif // TEXTDECODER + var str = ''; + + while (idx < endPtr) { + // For UTF8 byte structure, see: + // http://en.wikipedia.org/wiki/UTF-8#Description + // https://www.ietf.org/rfc/rfc2279.txt + // https://tools.ietf.org/html/rfc3629 + var u0 = heap[idx++]; + if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; } + var u1 = heap[idx++] & 63; + if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; } + var u2 = heap[idx++] & 63; + if ((u0 & 0xF0) == 0xE0) { + u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; + } else { +#if ASSERTIONS + if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string on the asm.js/wasm heap to a JS string!'); +#endif + u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63); + } + + if (u0 < 0x10000) { + str += String.fromCharCode(u0); + } else { + var ch = u0 - 0x10000; + str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF)); + } + } +#if TEXTDECODER + } +#endif // TEXTDECODER + return str; +#endif // TEXTDECODER == 2 +} + +// Given a pointer 'ptr' to a UTF8-encoded string in the Emscripten HEAP, returns a +// copy of that string as 
a JavaScript String object. +// lengthInBytes: specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes) +// will be decoded using utf8 encoding, and any \0 in between will be decoded as-is. +/** + * @param {number} ptr Pointer to the first byte of the string in HEAPU8; a falsy pointer yields ''. + * @param {number} lengthInBytes Number of bytes to decode. + * @return {string} The decoded JavaScript string. + */ +function UTF8ToStringNBytes(ptr, lengthInBytes) { +#if CAN_ADDRESS_2GB + ptr >>>= 0; +#endif +#if TEXTDECODER == 2 + if (!ptr) return ''; + var end = ptr + lengthInBytes; + return UTF8Decoder.decode(HEAPU8.subarray(ptr, end)); +#else + return ptr ? UTF8ArrayToStringNBytes(HEAPU8, ptr, lengthInBytes) : ''; +#endif +} + // Copies the given Javascript String object 'str' to the given byte array at address 'outIdx', // encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP. // Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write. diff --git a/src/runtime_strings_extra.js b/src/runtime_strings_extra.js index dd2e86d75c46..2261730fa344 100644 --- a/src/runtime_strings_extra.js +++ b/src/runtime_strings_extra.js @@ -77,6 +77,41 @@ function UTF16ToString(ptr, maxBytesToRead) { #endif // TEXTDECODER } +// Given a pointer 'ptr' to a UTF16LE-encoded string in the emscripten HEAP, returns a copy of that string as a JavaScript String object. +// lengthInBytes: specifies the number of bytes to read (expected to be a multiple of 2; 'ptr' is expected to be 2-byte aligned). +// The string at [ptr, ptr + lengthInBytes) is decoded without looking for a null terminator, and any \0 in between will be decoded as-is. +function UTF16ToStringNBytes(ptr, lengthInBytes) { +#if ASSERTIONS + assert(ptr % 2 == 0, 'Pointer passed to UTF16ToStringNBytes must be aligned to two bytes!'); + assert(lengthInBytes % 2 == 0, 'Length passed to UTF16ToStringNBytes must be an even number!'); +#endif +#if TEXTDECODER + var endPtr = ptr + lengthInBytes; + +#if TEXTDECODER != 2 + if (endPtr - ptr > 32 && UTF16Decoder) { +#endif // TEXTDECODER != 2 + return UTF16Decoder.decode(HEAPU8.subarray(ptr, endPtr)); +#if TEXTDECODER != 2 + } else { +#endif // TEXTDECODER != 2 +#endif // TEXTDECODER + var i = 0; + + var lengthInCodeUnit = lengthInBytes / 2; + var str = ''; + while (i < lengthInCodeUnit) { + var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}}; + ++i; + // fromCharCode constructs a 
character from a UTF-16 code unit, so we can pass the UTF16 string right through. + str += String.fromCharCode(codeUnit); + } + return str; +#if TEXTDECODER && TEXTDECODER != 2 + } +#endif // TEXTDECODER +} + // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr', // null-terminated and encoded in UTF16 form. The copy will require at most str.length*4+2 bytes of space in the HEAP. // Use the function lengthBytesUTF16() to compute the exact number of bytes (excluding null terminator) that this function will write. @@ -145,6 +180,33 @@ function UTF32ToString(ptr, maxBytesToRead) { return str; } +// Given a pointer 'ptr' to a UTF32LE-encoded string in the emscripten HEAP, returns a copy of that string as a JavaScript String object. +// lengthInBytes: specifies the number of bytes to read (expected to be a multiple of 4; 'ptr' is expected to be 4-byte aligned). +// The string at [ptr, ptr + lengthInBytes) is decoded without looking for a null terminator, and any \0 in between will be decoded as-is. +function UTF32ToStringNBytes(ptr, lengthInBytes) { +#if ASSERTIONS + assert(ptr % 4 == 0, 'Pointer passed to UTF32ToStringNBytes must be aligned to four bytes!'); + assert(lengthInBytes % 4 == 0, 'Length passed to UTF32ToStringNBytes must be a multiple of 4!'); +#endif + var i = 0; + + var lengthInCodePoint = lengthInBytes / 4; + var str = ''; + while (i < lengthInCodePoint) { + var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}}; + ++i; + // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing. + // See http://unicode.org/faq/utf_bom.html#utf16-3 + if (utf32 >= 0x10000) { + var ch = utf32 - 0x10000; + str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF)); + } else { + str += String.fromCharCode(utf32); + } + } + return str; +} + // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr', // null-terminated and encoded in UTF32 form. The copy will require at most str.length*4+4 bytes of space in the HEAP. // Use the function lengthBytesUTF32() to compute the exact number of bytes (excluding null terminator) that this function will write. 
diff --git a/tests/utf32.cpp b/tests/utf32.cpp index 6acd298a1e52..09699781e92e 100644 --- a/tests/utf32.cpp +++ b/tests/utf32.cpp @@ -3,10 +3,11 @@ // University of Illinois/NCSA Open Source License. Both these licenses can be // found in the LICENSE file. +#include +#include #include #include -#include -#include +#include #include typedef unsigned int utf32; @@ -14,65 +15,158 @@ typedef unsigned short utf16; // This code tests that Unicode std::wstrings can be marshalled between C++ and JS. int main() { - std::wstring wstr = L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC is the Euro sign, U+2007C is a Chinese Han character that looks like three raindrops. - - printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t)); - - if (sizeof(wchar_t) == 4) { - utf32 *memory = new utf32[wstr.length()+1]; - - EM_ASM({ - var str = UTF32ToString($0); - out(str); - var numBytesWritten = stringToUTF32(str, $1, $2); - if (numBytesWritten != 23*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten; - }, wstr.c_str(), memory, (wstr.length()+1)*sizeof(utf32)); - - // Compare memory to confirm that the string is intact after taking a route through JS side. - const utf32 *srcPtr = reinterpret_cast(wstr.c_str()); - for(int i = 0;; ++i) { - assert(memory[i] == srcPtr[i]); - if (srcPtr[i] == 0) - break; - } - - EM_ASM({ - var str = UTF32ToString($0); - out(str); - var numBytesWritten = stringToUTF32(str, $1, $2); - if (numBytesWritten != 5*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten; - }, wstr.c_str(), memory, 6*sizeof(utf32)); - assert(memory[5] == 0); - - delete[] memory; - } else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar. 
- utf16 *memory = new utf16[2*wstr.length()+1]; - - EM_ASM({ - var str = UTF16ToString($0); - out(str); - var numBytesWritten = stringToUTF16(str, $1, $2); - if (numBytesWritten != 25*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten; - }, wstr.c_str(), memory, (2*wstr.length()+1)*sizeof(utf16)); - - // Compare memory to confirm that the string is intact after taking a route through JS side. - const utf16 *srcPtr = reinterpret_cast(wstr.c_str()); - for(int i = 0;; ++i) { - assert(memory[i] == srcPtr[i]); - if (srcPtr[i] == 0) - break; - } - - EM_ASM({ - var str = UTF16ToString($0); - out(str); - var numBytesWritten = stringToUTF16(str, $1, $2); - if (numBytesWritten != 5*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten; - }, wstr.c_str(), memory, 6*sizeof(utf16)); - assert(memory[5] == 0); - - delete[] memory; - } - - printf("OK.\n"); + std::wstring wstr = + L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC + // is the Euro sign, U+2007C + // is a Chinese Han character + // that looks like three + // raindrops. + + printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t)); + + if (sizeof(wchar_t) == 4) { + utf32* memory = new utf32[wstr.length() + 1]; + + EM_ASM( + { + var str = UTF32ToString($0); + out(str); + var numBytesWritten = stringToUTF32(str, $1, $2); + if (numBytesWritten != 23 * 4) + throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten; + }, + wstr.c_str(), memory, (wstr.length() + 1) * sizeof(utf32)); + + // Compare memory to confirm that the string is intact after taking a route through JS side. 
+ const utf32* srcPtr = reinterpret_cast(wstr.c_str()); + for (int i = 0;; ++i) { + assert(memory[i] == srcPtr[i]); + if (srcPtr[i] == 0) + break; + } + + EM_ASM( + { + var str = UTF32ToString($0); + out(str); + var numBytesWritten = stringToUTF32(str, $1, $2); + if (numBytesWritten != 5 * 4) + throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten; + }, + wstr.c_str(), memory, 6 * sizeof(utf32)); + assert(memory[5] == 0); + + // UTF32ToStringNBytes on a substring that is not null-terminated + size_t dashIndex = wstr.find(L'-'); + std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1); + int outLength = EM_ASM_INT( + { + var str = UTF32ToStringNBytes($0, $1); + out(str); + var expectedBytesWritten = $1; + var numBytesWritten = stringToUTF32(str, $2, $3); + if (numBytesWritten != expectedBytesWritten) { + throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' + + expectedBytesWritten; + } + return numBytesWritten; + }, + subString.data(), subString.length() * sizeof(utf32), memory, + (wstr.length() + 1) * sizeof(utf32)); + assert(outLength == subString.length() * sizeof(utf32)); + + // UTF32ToStringNBytes on a string with an embedded '\0' (the first '-' is replaced by '\0') + std::wstring wstr2 = wstr; + wstr2[dashIndex] = L'\0'; + int outLength2 = EM_ASM_INT( + { + var str = UTF32ToStringNBytes($0, $1); + out(str); + var expectedBytesWritten = $1; + var numBytesWritten = stringToUTF32(str, $2, $3); + if (numBytesWritten != expectedBytesWritten) { + throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' + + expectedBytesWritten; + } + return numBytesWritten; + }, + wstr2.c_str(), wstr2.length() * sizeof(utf32), memory, (wstr.length() + 1) * sizeof(utf32)); + assert(outLength2 == wstr2.length() * sizeof(utf32)); + assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length())); + + delete[] memory; + } else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar. 
+ utf16* memory = new utf16[2 * wstr.length() + 1]; + + EM_ASM( + { + var str = UTF16ToString($0); + out(str); + var numBytesWritten = stringToUTF16(str, $1, $2); + if (numBytesWritten != 25 * 2) + throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten; + }, + wstr.c_str(), memory, (2 * wstr.length() + 1) * sizeof(utf16)); + + // Compare memory to confirm that the string is intact after taking a route through JS side. + const utf16* srcPtr = reinterpret_cast(wstr.c_str()); + for (int i = 0;; ++i) { + assert(memory[i] == srcPtr[i]); + if (srcPtr[i] == 0) + break; + } + + EM_ASM( + { + var str = UTF16ToString($0); + out(str); + var numBytesWritten = stringToUTF16(str, $1, $2); + if (numBytesWritten != 5 * 2) + throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten; + }, + wstr.c_str(), memory, 6 * sizeof(utf16)); + assert(memory[5] == 0); + + // UTF16ToStringNBytes on a substring that is not null-terminated + size_t dashIndex = wstr.find(L'-'); + std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1); + int outLength = EM_ASM_INT( + { + var str = UTF16ToStringNBytes($0, $1); + out(str); + var expectedBytesWritten = $1; + var numBytesWritten = stringToUTF16(str, $2, $3); + if (numBytesWritten != expectedBytesWritten) { + throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' + + expectedBytesWritten; + } + return numBytesWritten; + }, + subString.data(), subString.length() * sizeof(utf16), memory, + (wstr.length() + 1) * sizeof(utf16)); + assert(outLength == subString.length() * sizeof(utf16)); + + // UTF16ToStringNBytes on a string with an embedded '\0' (the first '-' is replaced by '\0') + std::wstring wstr2 = wstr; + wstr2[dashIndex] = L'\0'; + int outLength2 = EM_ASM_INT( + { + var str = UTF16ToStringNBytes($0, $1); + out(str); + var expectedBytesWritten = $1; + var numBytesWritten = stringToUTF16(str, $2, $3); + if (numBytesWritten != expectedBytesWritten) { + throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' + + expectedBytesWritten; + } + 
return numBytesWritten; + }, + wstr2.c_str(), wstr2.length() * sizeof(utf16), memory, (wstr.length() + 1) * sizeof(utf16)); + assert(outLength2 == wstr2.length() * sizeof(utf16)); + assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length())); + + delete[] memory; + } + + printf("OK.\n"); } diff --git a/tests/utf8.cpp b/tests/utf8.cpp index fabb1412c602..7de35a1fa93f 100644 --- a/tests/utf8.cpp +++ b/tests/utf8.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include // This code tests that Unicode std::wstrings can be marshalled between C++ and JS. @@ -50,6 +52,37 @@ int main() { printf("i=%d:%u,%u\n", i, (unsigned int)(unsigned char)utf8String[i], (unsigned int)(unsigned char)utf8String2[i]); assert(!strcmp(utf8String, utf8String2)); + // UTF8ToStringNBytes decode string with no null-terminate. + std::string utf8StringObject(utf8String); + std::string_view utf8SubString = std::string_view(utf8StringObject) + .substr(0, utf8StringObject.find('-') + 1); + char utf8String3[128] = {}; + EM_ASM({ + var str = UTF8ToStringNBytes($0, $1); + out(str); + var expectBytesWritten = $1; + var numBytesWritten = stringToUTF8(str, $2, $3); + if (numBytesWritten != expectBytesWritten) throw 'stringToUTF8 wrote an invalid length ' + numBytesWritten + " != " + expectBytesWritten; + }, utf8SubString.data(), utf8SubString.length(), utf8String3, 128); + assert(utf8SubString.length() == strlen(utf8String3)); + assert(utf8SubString == utf8String3); + + // UTF8ToStringNBytes decode string which contains '\0' inside. 
+ std::string utf8StringObject1(utf8String); + // change the '-' to '\0' + utf8StringObject1[utf8StringObject.find('-')] = '\0'; + char utf8String4[128] = {}; + int outLength = EM_ASM_INT({ + var str = UTF8ToStringNBytes($0, $1); + out(str); + var expectBytesWritten = $1; + var numBytesWritten = stringToUTF8(str, $2, $3); + if (numBytesWritten != expectBytesWritten) throw 'stringToUTF8 wrote an invalid length ' + numBytesWritten + " != " + expectBytesWritten; + return numBytesWritten; + }, utf8StringObject1.c_str(), utf8StringObject1.length(), utf8String4, 128); + assert(utf8StringObject1.length() == outLength); + assert(utf8StringObject1 == std::string_view(utf8String4, outLength)); + // Test that text gets properly cut off if output buffer is too small. EM_ASM({ var str = UTF8ToString($0);