Fix performance issue with UTF8ToString (emscripten-core#12517)
Introduce new functions:
- UTF8ToStringWithLength
- UTF16ToStringWithLength
- UTF32ToStringWithLength

These decode exactly `lengthInBytes` bytes starting at the given pointer; any '\0' bytes
in between are kept as-is.

Because the length is given explicitly, there is no need to scan the heap for a null
terminator, which gives better performance.
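
A minimal usage sketch (assumes `_malloc`/`_free` are exported and `HEAPU8` plus the string
runtime functions are in scope; the byte values are chosen purely for illustration):

  var ptr = _malloc(5);
  HEAPU8.set([0x61, 0x62, 0x00, 0x63, 0x64], ptr); // the bytes of "ab\0cd", no trailing '\0'
  UTF8ToString(ptr, 5);            // stops at the embedded '\0' -> "ab"
  UTF8ToStringWithLength(ptr, 5);  // decodes exactly 5 bytes    -> "ab\u0000cd"
  _free(ptr);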
LanderlYoung authored and taylorcyang committed Oct 20, 2020
1 parent 05c0473 commit 7dfe671
Showing 5 changed files with 325 additions and 63 deletions.
1 change: 1 addition & 0 deletions AUTHORS
@@ -515,3 +515,4 @@ a license to everyone to use it as detailed in LICENSE.)
* kamenokonyokonyoko <kamenokonokotan@gmail.com>
* Lectem <lectem@gmail.com>
* Henrik Algestam <henrik@algestam.se>
* LanderlYoung <landerlyoung@gmail.com>
79 changes: 79 additions & 0 deletions src/runtime_strings.js
@@ -118,6 +118,85 @@ function UTF8ToString(ptr, maxBytesToRead) {
#endif
}


/**
* @param {number} idx
* @param {number=} lengthInBytes
* @return {string}
*/
function UTF8ArrayToStringWithLength(heap, idx, lengthInBytes) {
#if CAN_ADDRESS_2GB
idx >>>= 0;
#endif
var endPtr = idx + lengthInBytes;

#if TEXTDECODER == 2
return UTF8Decoder.decode(
heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr))
);
#else // TEXTDECODER == 2
#if TEXTDECODER
if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) {
return UTF8Decoder.decode(heap.subarray(idx, endPtr));
} else {
#endif // TEXTDECODER
var str = '';

while (idx < endPtr) {
// For UTF8 byte structure, see:
// http://en.wikipedia.org/wiki/UTF-8#Description
// https://www.ietf.org/rfc/rfc2279.txt
// https://tools.ietf.org/html/rfc3629
var u0 = heap[idx++];
if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; }
var u1 = heap[idx++] & 63;
if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; }
var u2 = heap[idx++] & 63;
if ((u0 & 0xF0) == 0xE0) {
u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
} else {
#if ASSERTIONS
if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string on the asm.js/wasm heap to a JS string!');
#endif
u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63);
}

if (u0 < 0x10000) {
str += String.fromCharCode(u0);
} else {
var ch = u0 - 0x10000;
str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
}
}
#if TEXTDECODER
}
#endif // TEXTDECODER
return str;
#endif // TEXTDECODER == 2
}
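
// Usage sketch for UTF8ArrayToStringWithLength (illustrative only; works on any Uint8Array,
// not just the heap view):
//   var bytes = new Uint8Array([0xE2, 0x98, 0x83]); // UTF-8 encoding of U+2603 (snowman)
//   UTF8ArrayToStringWithLength(bytes, 0, 3);       // -> "\u2603"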

// Given a pointer 'ptr' to a UTF8-encoded string in the emscripten HEAP, returns a
// copy of that string as a Javascript String object.
// lengthInBytes: specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes)
// will be decoded using utf8 encoding, and any '\0' in between will be decoded as-is.
/**
* @param {number} ptr
* @param {number=} lengthInBytes
* @return {string}
*/
function UTF8ToStringWithLength(ptr, lengthInBytes) {
#if CAN_ADDRESS_2GB
ptr >>>= 0;
#endif
#if TEXTDECODER == 2
if (!ptr) return '';
var end = ptr + lengthInBytes;
return UTF8Decoder.decode(HEAPU8.subarray(ptr, end));
#else
return ptr ? UTF8ArrayToStringWithLength(HEAPU8, ptr, lengthInBytes) : '';
#endif
}

// Copies the given Javascript String object 'str' to the given byte array at address 'outIdx',
// encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP.
// Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write.
55 changes: 55 additions & 0 deletions src/runtime_strings_extra.js
@@ -77,6 +77,37 @@ function UTF16ToString(ptr, maxBytesToRead) {
#endif // TEXTDECODER
}

function UTF16ToStringWithLength(ptr, lengthInBytes) {
#if ASSERTIONS
assert(ptr % 2 == 0, 'Pointer passed to UTF16ToStringWithLength must be aligned to two bytes!');
assert(lengthInBytes % 2 == 0, 'Length passed to UTF16ToStringWithLength must be an even number!');
#endif
#if TEXTDECODER
var endPtr = ptr + lengthInBytes;

#if TEXTDECODER != 2
if (endPtr - ptr > 32 && UTF16Decoder) {
#endif // TEXTDECODER != 2
return UTF16Decoder.decode(HEAPU8.subarray(ptr, endPtr));
#if TEXTDECODER != 2
} else {
#endif // TEXTDECODER != 2
#endif // TEXTDECODER
var i = 0;

var lengthInCodeUnit = lengthInBytes / 2;
var str = '';
while (i < lengthInCodeUnit) {
var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
++i;
// fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
str += String.fromCharCode(codeUnit);
}
return str;
#if TEXTDECODER && TEXTDECODER != 2
}
#endif // TEXTDECODER
}
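
// Usage sketch (illustrative only): lengthInBytes counts bytes, not UTF-16 code units, and must
// be even. For the text "hi" stored on the little-endian heap as the bytes 68 00 69 00,
// UTF16ToStringWithLength(ptr, 4) returns "hi"; passing 2 would decode only "h".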

// Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
// null-terminated and encoded in UTF16 form. The copy will require at most str.length*4+2 bytes of space in the HEAP.
// Use the function lengthBytesUTF16() to compute the exact number of bytes (excluding null terminator) that this function will write.
@@ -145,6 +176,30 @@ function UTF32ToString(ptr, maxBytesToRead) {
return str;
}

function UTF32ToStringWithLength(ptr, lengthInBytes) {
#if ASSERTIONS
assert(ptr % 4 == 0, 'Pointer passed to UTF32ToStringWithLength must be aligned to four bytes!');
assert(lengthInBytes % 4 == 0, 'Length passed to UTF32ToStringWithLength must be a multiple of 4!');
#endif
var i = 0;

var lengthInCodePoint = lengthInBytes / 4;
var str = '';
while (i < lengthInCodePoint) {
var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
++i;
// Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
// See http://unicode.org/faq/utf_bom.html#utf16-3
if (utf32 >= 0x10000) {
var ch = utf32 - 0x10000;
str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
} else {
str += String.fromCharCode(utf32);
}
}
return str;
}
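
// Worked example of the surrogate-pair branch above (illustration only): for U+2007C,
// ch = 0x2007C - 0x10000 = 0x1007C, so the high surrogate is 0xD800 | (0x1007C >> 10) = 0xD840
// and the low surrogate is 0xDC00 | (0x1007C & 0x3FF) = 0xDC7C, i.e. the JS string "\uD840\uDC7C".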

// Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
// null-terminated and encoded in UTF32 form. The copy will require at most str.length*4+4 bytes of space in the HEAP.
// Use the function lengthBytesUTF32() to compute the exact number of bytes (excluding null terminator) that this function will write.
220 changes: 157 additions & 63 deletions tests/utf32.cpp
@@ -3,76 +3,170 @@
// University of Illinois/NCSA Open Source License. Both these licenses can be
// found in the LICENSE file.

#include <cassert>
#include <emscripten.h>
#include <stdio.h>
#include <string>
#include <emscripten.h>
#include <cassert>
#include <string_view>
#include <wchar.h>

typedef unsigned int utf32;
typedef unsigned short utf16;

// This code tests that Unicode std::wstrings can be marshalled between C++ and JS.
int main() {
std::wstring wstr = L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC is the Euro sign, U+2007C is a Chinese Han character that looks like three raindrops.

printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t));

if (sizeof(wchar_t) == 4) {
utf32 *memory = new utf32[wstr.length()+1];

EM_ASM({
var str = UTF32ToString($0);
out(str);
var numBytesWritten = stringToUTF32(str, $1, $2);
if (numBytesWritten != 23*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
}, wstr.c_str(), memory, (wstr.length()+1)*sizeof(utf32));

// Compare memory to confirm that the string is intact after taking a route through JS side.
const utf32 *srcPtr = reinterpret_cast<const utf32 *>(wstr.c_str());
for(int i = 0;; ++i) {
assert(memory[i] == srcPtr[i]);
if (srcPtr[i] == 0)
break;
}

EM_ASM({
var str = UTF32ToString($0);
out(str);
var numBytesWritten = stringToUTF32(str, $1, $2);
if (numBytesWritten != 5*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
}, wstr.c_str(), memory, 6*sizeof(utf32));
assert(memory[5] == 0);

delete[] memory;
} else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar.
utf16 *memory = new utf16[2*wstr.length()+1];

EM_ASM({
var str = UTF16ToString($0);
out(str);
var numBytesWritten = stringToUTF16(str, $1, $2);
if (numBytesWritten != 25*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
}, wstr.c_str(), memory, (2*wstr.length()+1)*sizeof(utf16));

// Compare memory to confirm that the string is intact after taking a route through JS side.
const utf16 *srcPtr = reinterpret_cast<const utf16 *>(wstr.c_str());
for(int i = 0;; ++i) {
assert(memory[i] == srcPtr[i]);
if (srcPtr[i] == 0)
break;
}

EM_ASM({
var str = UTF16ToString($0);
out(str);
var numBytesWritten = stringToUTF16(str, $1, $2);
if (numBytesWritten != 5*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
}, wstr.c_str(), memory, 6*sizeof(utf16));
assert(memory[5] == 0);

delete[] memory;
}

printf("OK.\n");
std::wstring wstr =
L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC
// is the Euro sign, U+2007C
// is a Chinese Han character
// that looks like three
// raindrops.

printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t));

if (sizeof(wchar_t) == 4) {
utf32* memory = new utf32[wstr.length() + 1];

EM_ASM(
{
var str = UTF32ToString($0);
out(str);
var numBytesWritten = stringToUTF32(str, $1, $2);
if (numBytesWritten != 23 * 4)
throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
},
wstr.c_str(), memory, (wstr.length() + 1) * sizeof(utf32));

// Compare memory to confirm that the string is intact after taking a route through JS side.
const utf32* srcPtr = reinterpret_cast<const utf32*>(wstr.c_str());
for (int i = 0;; ++i) {
assert(memory[i] == srcPtr[i]);
if (srcPtr[i] == 0)
break;
}

EM_ASM(
{
var str = UTF32ToString($0);
out(str);
var numBytesWritten = stringToUTF32(str, $1, $2);
if (numBytesWritten != 5 * 4)
throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
},
wstr.c_str(), memory, 6 * sizeof(utf32));
assert(memory[5] == 0);

// UTF32ToStringWithLength without a null terminator
size_t dashIndex = wstr.find(L'-');
std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1);
int outLength = EM_ASM_INT(
{
var str = UTF32ToStringWithLength($0, $1);
out(str);
var expectedBytesWritten = $1;
var numBytesWritten = stringToUTF32(str, $2, $3);
if (numBytesWritten != expectedBytesWritten) {
throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' +
expectedBytesWritten;
}
return numBytesWritten;
},
subString.data(), subString.length() * sizeof(utf32), memory,
(wstr.length() + 1) * sizeof(utf32));
assert(outLength == subString.length() * sizeof(utf32));

// UTF32ToStringWithLength with an embedded '\0'
std::wstring wstr2 = wstr;
wstr2[dashIndex] = L'\0';
int outLength2 = EM_ASM_INT(
{
var str = UTF32ToStringWithLength($0, $1);
out(str);
var expectedBytesWritten = $1;
var numBytesWritten = stringToUTF32(str, $2, $3);
if (numBytesWritten != expectedBytesWritten) {
throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' +
expectedBytesWritten;
}
return numBytesWritten;
},
wstr2.c_str(), wstr2.length() * sizeof(utf32), memory, (wstr.length() + 1) * sizeof(utf32));
assert(outLength2 == wstr2.length() * sizeof(utf32));
assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length()));

delete[] memory;
} else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar.
utf16* memory = new utf16[2 * wstr.length() + 1];

EM_ASM(
{
var str = UTF16ToString($0);
out(str);
var numBytesWritten = stringToUTF16(str, $1, $2);
if (numBytesWritten != 25 * 2)
throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
},
wstr.c_str(), memory, (2 * wstr.length() + 1) * sizeof(utf16));

// Compare memory to confirm that the string is intact after taking a route through JS side.
const utf16* srcPtr = reinterpret_cast<const utf16*>(wstr.c_str());
for (int i = 0;; ++i) {
assert(memory[i] == srcPtr[i]);
if (srcPtr[i] == 0)
break;
}

EM_ASM(
{
var str = UTF16ToString($0);
out(str);
var numBytesWritten = stringToUTF16(str, $1, $2);
if (numBytesWritten != 5 * 2)
throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
},
wstr.c_str(), memory, 6 * sizeof(utf16));
assert(memory[5] == 0);

// UTF16ToStringWithLength without a null terminator
size_t dashIndex = wstr.find(L'-');
std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1);
int outLength = EM_ASM_INT(
{
var str = UTF16ToStringWithLength($0, $1);
out(str);
var expectedBytesWritten = $1;
var numBytesWritten = stringToUTF16(str, $2, $3);
if (numBytesWritten != expectedBytesWritten) {
throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' +
expectedBytesWritten;
}
return numBytesWritten;
},
subString.data(), subString.length() * sizeof(utf16), memory,
(wstr.length() + 1) * sizeof(utf16));
assert(outLength == subString.length() * sizeof(utf16));

// UTF16ToStringWithLength with an embedded '\0'
std::wstring wstr2 = wstr;
wstr2[dashIndex] = L'\0';
int outLength2 = EM_ASM_INT(
{
var str = UTF16ToStringWithLength($0, $1);
out(str);
var expectedBytesWritten = $1;
var numBytesWritten = stringToUTF16(str, $2, $3);
if (numBytesWritten != expectedBytesWritten) {
throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' +
expectedBytesWritten;
}
return numBytesWritten;
},
wstr2.c_str(), wstr2.length() * sizeof(utf16), memory, (wstr.length() + 1) * sizeof(utf16));
assert(outLength2 == wstr2.length() * sizeof(utf16));
assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length()));

delete[] memory;
}

printf("OK.\n");
}