From 7dfe67154d1e65e97cacdaac2192b81cf8be8164 Mon Sep 17 00:00:00 2001
From: landerlyoung
Date: Tue, 20 Oct 2020 15:07:57 +0800
Subject: [PATCH] Fix performance issue with UTF8ToString (#12517)

Introduce new functions:
- UTF8ToStringWithLength
- UTF16ToStringWithLength
- UTF32ToStringWithLength

They decode exactly the given number of bytes; any '\0' in between is
kept as-is.

These functions take a `lengthInBytes` argument, so there is no need to
iterate over the heap to find a null terminator, which gives better
performance.
---
 AUTHORS                      |   1 +
 src/runtime_strings.js       |  79 +++++++++++++
 src/runtime_strings_extra.js |  55 +++++++++
 tests/utf32.cpp              | 220 +++++++++++++++++++++++++----------
 tests/utf8.cpp               |  33 ++++++
 5 files changed, 325 insertions(+), 63 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 7764a002a7f4..cc654f6af195 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -515,3 +515,4 @@ a license to everyone to use it as detailed in LICENSE.)
 * kamenokonyokonyoko
 * Lectem
 * Henrik Algestam
+* LanderlYoung
diff --git a/src/runtime_strings.js b/src/runtime_strings.js
index f378e89c92fd..49ec7ef3e252 100644
--- a/src/runtime_strings.js
+++ b/src/runtime_strings.js
@@ -118,6 +118,85 @@ function UTF8ToString(ptr, maxBytesToRead) {
 #endif
 }
 
+
+/**
+ * @param {number} idx
+ * @param {number} lengthInBytes
+ * @return {string}
+ */
+function UTF8ArrayToStringWithLength(heap, idx, lengthInBytes) {
+#if CAN_ADDRESS_2GB
+  idx >>>= 0;
+#endif
+  var endPtr = idx + lengthInBytes;
+
+#if TEXTDECODER == 2
+  return UTF8Decoder.decode(
+    heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr))
+  );
+#else // TEXTDECODER == 2
+#if TEXTDECODER
+  if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) {
+    return UTF8Decoder.decode(heap.subarray(idx, endPtr));
+  } else {
+#endif // TEXTDECODER
+    var str = '';
+
+    while (idx < endPtr) {
+      // For UTF8 byte structure, see:
+      // http://en.wikipedia.org/wiki/UTF-8#Description
+      // https://www.ietf.org/rfc/rfc2279.txt
+      // https://tools.ietf.org/html/rfc3629
+      var u0 = heap[idx++];
+      if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; }
+      var u1 = heap[idx++] & 63;
+      if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; }
+      var u2 = heap[idx++] & 63;
+      if ((u0 & 0xF0) == 0xE0) {
+        u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
+      } else {
+#if ASSERTIONS
+        if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string on the asm.js/wasm heap to a JS string!');
+#endif
+        u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63);
+      }
+
+      if (u0 < 0x10000) {
+        str += String.fromCharCode(u0);
+      } else {
+        var ch = u0 - 0x10000;
+        str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+      }
+    }
+#if TEXTDECODER
+  }
+#endif // TEXTDECODER
+  return str;
+#endif // TEXTDECODER == 2
+}
+
+// Given a pointer 'ptr' to a UTF8-encoded string in the emscripten HEAP, returns a
+// copy of that string as a Javascript String object.
+// lengthInBytes: specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes)
+// will be decoded using utf8 encoding, and any \0 in between will be decoded as-is.
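+//
+// Illustrative example (not part of this patch; behavior as described above): if the heap
+// at 'ptr' holds the five bytes 'a', 'b', '\0', 'c', 'd', then UTF8ToStringWithLength(ptr, 5)
+// returns the JS string 'ab\u0000cd', i.e. exactly 5 bytes are decoded and the embedded NUL
+// is preserved, whereas UTF8ToString(ptr) would stop at the NUL and return only 'ab'.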
+/**
+ * @param {number} ptr
+ * @param {number} lengthInBytes
+ * @return {string}
+ */
+function UTF8ToStringWithLength(ptr, lengthInBytes) {
+#if CAN_ADDRESS_2GB
+  ptr >>>= 0;
+#endif
+#if TEXTDECODER == 2
+  if (!ptr) return '';
+  var end = ptr + lengthInBytes;
+  return UTF8Decoder.decode(HEAPU8.subarray(ptr, end));
+#else
+  return ptr ? UTF8ArrayToStringWithLength(HEAPU8, ptr, lengthInBytes) : '';
+#endif
+}
+
 // Copies the given Javascript String object 'str' to the given byte array at address 'outIdx',
 // encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP.
 // Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write.
diff --git a/src/runtime_strings_extra.js b/src/runtime_strings_extra.js
index dd2e86d75c46..c0d8d161b734 100644
--- a/src/runtime_strings_extra.js
+++ b/src/runtime_strings_extra.js
@@ -77,6 +77,37 @@ function UTF16ToString(ptr, maxBytesToRead) {
 #endif // TEXTDECODER
 }
 
+function UTF16ToStringWithLength(ptr, lengthInBytes) {
+#if ASSERTIONS
+  assert(ptr % 2 == 0, 'Pointer passed to UTF16ToStringWithLength must be aligned to two bytes!');
+  assert(lengthInBytes % 2 == 0, 'Length passed to UTF16ToStringWithLength must be an even number!');
+#endif
+#if TEXTDECODER
+  var endPtr = ptr + lengthInBytes;
+
+#if TEXTDECODER != 2
+  if (endPtr - ptr > 32 && UTF16Decoder) {
+#endif // TEXTDECODER != 2
+    return UTF16Decoder.decode(HEAPU8.subarray(ptr, endPtr));
+#if TEXTDECODER != 2
+  } else {
+#endif // TEXTDECODER != 2
+#endif // TEXTDECODER
+    var i = 0;
+
+    var lengthInCodeUnit = lengthInBytes / 2;
+    var str = '';
+    while (i < lengthInCodeUnit) {
+      var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
+      ++i;
+      // fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
+      str += String.fromCharCode(codeUnit);
+    }
+    return str;
+#if TEXTDECODER && TEXTDECODER != 2
+  }
+#endif // TEXTDECODER
+}
+
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
 // null-terminated and encoded in UTF16 form. The copy will require at most str.length*4+2 bytes of space in the HEAP.
 // Use the function lengthBytesUTF16() to compute the exact number of bytes (excluding null terminator) that this function will write.
@@ -145,6 +176,30 @@ function UTF32ToString(ptr, maxBytesToRead) {
   return str;
 }
 
+function UTF32ToStringWithLength(ptr, lengthInBytes) {
+#if ASSERTIONS
+  assert(ptr % 4 == 0, 'Pointer passed to UTF32ToStringWithLength must be aligned to four bytes!');
+  assert(lengthInBytes % 4 == 0, 'Length passed to UTF32ToStringWithLength must be a multiple of 4!');
+#endif
+  var i = 0;
+
+  var lengthInCodePoint = lengthInBytes / 4;
+  var str = '';
+  while (i < lengthInCodePoint) {
+    var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
+    ++i;
+    // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
+    // See http://unicode.org/faq/utf_bom.html#utf16-3
+    if (utf32 >= 0x10000) {
+      var ch = utf32 - 0x10000;
+      str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+    } else {
+      str += String.fromCharCode(utf32);
+    }
+  }
+  return str;
+}
+
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
 // null-terminated and encoded in UTF32 form. The copy will require at most str.length*4+4 bytes of space in the HEAP.
 // Use the function lengthBytesUTF32() to compute the exact number of bytes (excluding null terminator) that this function will write.
diff --git a/tests/utf32.cpp b/tests/utf32.cpp
index 6acd298a1e52..4539f7791ef8 100644
--- a/tests/utf32.cpp
+++ b/tests/utf32.cpp
@@ -3,10 +3,11 @@
 // University of Illinois/NCSA Open Source License. Both these licenses can be
 // found in the LICENSE file.
 
+#include 
+#include 
 #include 
 #include 
-#include 
-#include 
+#include 
 #include 
 
 typedef unsigned int utf32;
@@ -14,65 +15,158 @@ typedef unsigned short utf16;
 
 // This code tests that Unicode std::wstrings can be marshalled between C++ and JS.
 int main() {
-  std::wstring wstr = L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC is the Euro sign, U+2007C is a Chinese Han character that looks like three raindrops.
-
-  printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t));
-
-  if (sizeof(wchar_t) == 4) {
-    utf32 *memory = new utf32[wstr.length()+1];
-
-    EM_ASM({
-      var str = UTF32ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF32(str, $1, $2);
-      if (numBytesWritten != 23*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, (wstr.length()+1)*sizeof(utf32));
-
-    // Compare memory to confirm that the string is intact after taking a route through JS side.
-    const utf32 *srcPtr = reinterpret_cast<const utf32 *>(wstr.c_str());
-    for(int i = 0;; ++i) {
-      assert(memory[i] == srcPtr[i]);
-      if (srcPtr[i] == 0)
-        break;
-    }
-
-    EM_ASM({
-      var str = UTF32ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF32(str, $1, $2);
-      if (numBytesWritten != 5*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, 6*sizeof(utf32));
-    assert(memory[5] == 0);
-
-    delete[] memory;
-  } else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar.
-    utf16 *memory = new utf16[2*wstr.length()+1];
-
-    EM_ASM({
-      var str = UTF16ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF16(str, $1, $2);
-      if (numBytesWritten != 25*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, (2*wstr.length()+1)*sizeof(utf16));
-
-    // Compare memory to confirm that the string is intact after taking a route through JS side.
-    const utf16 *srcPtr = reinterpret_cast<const utf16 *>(wstr.c_str());
-    for(int i = 0;; ++i) {
-      assert(memory[i] == srcPtr[i]);
-      if (srcPtr[i] == 0)
-        break;
-    }
-
-    EM_ASM({
-      var str = UTF16ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF16(str, $1, $2);
-      if (numBytesWritten != 5*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, 6*sizeof(utf16));
-    assert(memory[5] == 0);
-
-    delete[] memory;
-  }
-
-  printf("OK.\n");
+  std::wstring wstr =
+      L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC
+                                                                        // is the Euro sign, U+2007C
+                                                                        // is a Chinese Han character
+                                                                        // that looks like three
+                                                                        // raindrops.
+
+  printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t));
+
+  if (sizeof(wchar_t) == 4) {
+    utf32* memory = new utf32[wstr.length() + 1];
+
+    EM_ASM(
+        {
+          var str = UTF32ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF32(str, $1, $2);
+          if (numBytesWritten != 23 * 4)
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, (wstr.length() + 1) * sizeof(utf32));
+
+    // Compare memory to confirm that the string is intact after taking a route through JS side.
+    const utf32* srcPtr = reinterpret_cast<const utf32*>(wstr.c_str());
+    for (int i = 0;; ++i) {
+      assert(memory[i] == srcPtr[i]);
+      if (srcPtr[i] == 0)
+        break;
+    }
+
+    EM_ASM(
+        {
+          var str = UTF32ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF32(str, $1, $2);
+          if (numBytesWritten != 5 * 4)
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, 6 * sizeof(utf32));
+    assert(memory[5] == 0);
+
+    // UTF32ToStringWithLength on a substring that is not null-terminated
+    size_t dashIndex = wstr.find(L'-');
+    std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1);
+    int outLength = EM_ASM_INT(
+        {
+          var str = UTF32ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF32(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        subString.data(), subString.length() * sizeof(utf32), memory,
+        (wstr.length() + 1) * sizeof(utf32));
+    assert(outLength == subString.length() * sizeof(utf32));
+
+    // UTF32ToStringWithLength on a string with an embedded '\0'
+    std::wstring wstr2 = wstr;
+    wstr2[dashIndex] = L'\0';
+    int outLength2 = EM_ASM_INT(
+        {
+          var str = UTF32ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF32(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        wstr2.c_str(), wstr2.length() * sizeof(utf32), memory, (wstr.length() + 1) * sizeof(utf32));
+    assert(outLength2 == wstr2.length() * sizeof(utf32));
+    assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length()));
+
+    delete[] memory;
+  } else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar.
+    utf16* memory = new utf16[2 * wstr.length() + 1];
+
+    EM_ASM(
+        {
+          var str = UTF16ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF16(str, $1, $2);
+          if (numBytesWritten != 25 * 2)
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, (2 * wstr.length() + 1) * sizeof(utf16));
+
+    // Compare memory to confirm that the string is intact after taking a route through JS side.
+    const utf16* srcPtr = reinterpret_cast<const utf16*>(wstr.c_str());
+    for (int i = 0;; ++i) {
+      assert(memory[i] == srcPtr[i]);
+      if (srcPtr[i] == 0)
+        break;
+    }
+
+    EM_ASM(
+        {
+          var str = UTF16ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF16(str, $1, $2);
+          if (numBytesWritten != 5 * 2)
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, 6 * sizeof(utf16));
+    assert(memory[5] == 0);
+
+    // UTF16ToStringWithLength on a substring that is not null-terminated
+    size_t dashIndex = wstr.find(L'-');
+    std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1);
+    int outLength = EM_ASM_INT(
+        {
+          var str = UTF16ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF16(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        subString.data(), subString.length() * sizeof(utf16), memory,
+        (wstr.length() + 1) * sizeof(utf16));
+    assert(outLength == subString.length() * sizeof(utf16));
+
+    // UTF16ToStringWithLength on a string with an embedded '\0'
+    std::wstring wstr2 = wstr;
+    wstr2[dashIndex] = L'\0';
+    int outLength2 = EM_ASM_INT(
+        {
+          var str = UTF16ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF16(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        wstr2.c_str(), wstr2.length() * sizeof(utf16), memory, (wstr.length() + 1) * sizeof(utf16));
+    assert(outLength2 == wstr2.length() * sizeof(utf16));
+    assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length()));
+
+    delete[] memory;
+  }
+
+  printf("OK.\n");
 }
diff --git a/tests/utf8.cpp b/tests/utf8.cpp
index fabb1412c602..0d91ad07873c 100644
--- a/tests/utf8.cpp
+++ b/tests/utf8.cpp
@@ -8,6 +8,8 @@
 #include 
 #include 
 #include 
+#include <string>
+#include <string_view>
 #include 
 
 // This code tests that Unicode std::wstrings can be marshalled between C++ and JS.
@@ -50,6 +52,37 @@ int main() {
     printf("i=%d:%u,%u\n", i, (unsigned int)(unsigned char)utf8String[i], (unsigned int)(unsigned char)utf8String2[i]);
   assert(!strcmp(utf8String, utf8String2));
 
+  // UTF8ToStringWithLength decodes a string that is not null-terminated.
+  std::string utf8StringObject(utf8String);
+  std::string_view utf8SubString = std::string_view(utf8StringObject)
+                                       .substr(0, utf8StringObject.find('-') + 1);
+  char utf8String3[128] = {};
+  EM_ASM({
+    var str = UTF8ToStringWithLength($0, $1);
+    out(str);
+    var expectBytesWritten = $1;
+    var numBytesWritten = stringToUTF8(str, $2, $3);
+    if (numBytesWritten != expectBytesWritten) throw 'stringToUTF8 wrote an invalid length ' + numBytesWritten + " != " + expectBytesWritten;
+  }, utf8SubString.data(), utf8SubString.length(), utf8String3, 128);
+  assert(utf8SubString.length() == strlen(utf8String3));
+  assert(utf8SubString == utf8String3);
+
+  // UTF8ToStringWithLength decodes a string which contains '\0' inside.
+  std::string utf8StringObject1(utf8String);
+  // Change the '-' to '\0'.
+  utf8StringObject1[utf8StringObject1.find('-')] = '\0';
+  char utf8String4[128] = {};
+  int outLength = EM_ASM_INT({
+    var str = UTF8ToStringWithLength($0, $1);
+    out(str);
+    var expectBytesWritten = $1;
+    var numBytesWritten = stringToUTF8(str, $2, $3);
+    if (numBytesWritten != expectBytesWritten) throw 'stringToUTF8 wrote an invalid length ' + numBytesWritten + " != " + expectBytesWritten;
+    return numBytesWritten;
+  }, utf8StringObject1.c_str(), utf8StringObject1.length(), utf8String4, 128);
+  assert(utf8StringObject1.length() == outLength);
+  assert(utf8StringObject1 == std::string_view(utf8String4, outLength));
+
   // Test that text gets properly cut off if output buffer is too small.
   EM_ASM({
     var str = UTF8ToString($0);