Skip to content

Commit

Permalink
Fix performance issue with UTF8ToString (emscripten-core#12517)
Browse files Browse the repository at this point in the history
introduce new functions:
- UTF8ToStringNBytes
- UTF16ToStringNBytes
- UTF32ToStringNBytes

add docs to preamble.js.rst

Decode exactly the given number of bytes; any '\0' in between will
be kept as-is.

These functions take a `lengthInBytes` argument, so there is no need to iterate
over the heap to find a null terminator, and thus they perform better.
  • Loading branch information
LanderlYoung committed Oct 26, 2020
1 parent f9d491b commit 5a20826
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 63 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -519,3 +519,4 @@ a license to everyone to use it as detailed in LICENSE.)
* Pawel Czarnecki <pawel@8thwall.com> (copyright owned by 8th Wall, Inc.)
* Dhairya Bahl < dhairyabahl5@gmail.com >
* Sam Gao <gaoshan274@gmail.com>
* LanderlYoung <landerlyoung@gmail.com>
31 changes: 31 additions & 0 deletions site/source/docs/api_reference/preamble.js.rst
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,17 @@ Conversion functions — strings, pointers and arrays
:param maxBytesToWrite: A limit on the number of bytes that this function can at most write out. If the string is longer than this, the output is truncated. The outputted string will always be null terminated, even if truncation occurred, as long as ``maxBytesToWrite > 0``.



.. js:function:: UTF8ToStringNBytes(ptr, lengthInBytes)

Given a pointer ``ptr`` to a UTF8-encoded string in the emscripten HEAP, returns a copy of that string as a Javascript String object.

:param ptr: A pointer to a UTF8-encoded string in the Emscripten HEAP.
:param lengthInBytes: Specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes) will be decoded using utf8 encoding, and any ``\0`` in between will be decoded as-is.
:returns: A JavaScript ``String`` object



.. js:function:: UTF16ToString(ptr)

Given a pointer ``ptr`` to a null-terminated UTF16LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object.
Expand All @@ -202,6 +213,16 @@ Conversion functions — strings, pointers and arrays



.. js:function:: UTF16ToStringNBytes(ptr, lengthInBytes)

Given a pointer ``ptr`` to a UTF16LE-encoded string in the emscripten HEAP, returns a copy of that string as a Javascript String object.

:param ptr: A pointer to a UTF16LE-encoded string in the Emscripten HEAP.
:param lengthInBytes: Specifies the number of bytes to read; must be a multiple of 2. The string at [ptr, ptr + lengthInBytes) will be decoded using UTF16LE encoding, and any ``\0`` in between will be decoded as-is.
:returns: A JavaScript ``String`` object



.. js:function:: UTF32ToString(ptr)

Given a pointer ``ptr`` to a null-terminated UTF32LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object.
Expand All @@ -223,6 +244,16 @@ Conversion functions — strings, pointers and arrays



.. js:function:: UTF32ToStringNBytes(ptr, lengthInBytes)

Given a pointer ``ptr`` to a UTF32LE-encoded string in the emscripten HEAP, returns a copy of that string as a Javascript String object.

:param ptr: A pointer to a UTF32LE-encoded string in the Emscripten HEAP.
:param lengthInBytes: Specifies the number of bytes to read; must be a multiple of 4. The string at [ptr, ptr + lengthInBytes) will be decoded using UTF32LE encoding, and any ``\0`` in between will be decoded as-is.
:returns: A JavaScript ``String`` object



.. js:function:: AsciiToString(ptr)

Converts an ASCII or Latin-1 encoded string to a JavaScript String object.
Expand Down
79 changes: 79 additions & 0 deletions src/runtime_strings.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,85 @@ function UTF8ToString(ptr, maxBytesToRead) {
#endif
}


/**
 * Decodes the UTF-8 bytes in `heap[idx, idx + lengthInBytes)` into a JavaScript
 * string. The byte count is taken from the caller; the function never searches
 * for a null terminator, and a zero byte inside the range is decoded like any
 * other byte.
 *
 * `heap` is the byte container to read from. If it has a `subarray` method the
 * range is handed to the decoder as a view; otherwise (TEXTDECODER == 2 build
 * only) `heap.slice(idx, endPtr)` is copied into a new Uint8Array first.
 *
 * NOTE(review): `lengthInBytes` is annotated as optional, but the body uses it
 * unconditionally (`idx + lengthInBytes`); omitting it makes the end bound NaN.
 * Consider marking it as required.
 *
 * @param {number} idx Index in `heap` of the first byte to decode.
 * @param {number=} lengthInBytes Number of bytes to decode.
 * @return {string} The decoded string.
 */
function UTF8ArrayToStringNBytes(heap, idx, lengthInBytes) {
#if CAN_ADDRESS_2GB
// Reinterpret idx as an unsigned 32-bit integer.
idx >>>= 0;
#endif
// Exclusive end index of the range to decode.
var endPtr = idx + lengthInBytes;

#if TEXTDECODER == 2
// TEXTDECODER == 2: always decode through UTF8Decoder, no manual fallback.
return UTF8Decoder.decode(
heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr))
);
#else // TEXTDECODER == 2
#if TEXTDECODER
// Use UTF8Decoder only for ranges longer than 16 bytes, and only when the heap
// supports subarray() and a decoder object is available; otherwise fall
// through to the manual decoding loop below.
if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) {
return UTF8Decoder.decode(heap.subarray(idx, endPtr));
} else {
#endif // TEXTDECODER
var str = '';

// NOTE(review): only the lead byte is bounds-checked against endPtr. The
// continuation bytes of a multi-byte sequence are read unconditionally, so a
// sequence truncated at the end of the range reads past endPtr.
while (idx < endPtr) {
// For UTF8 byte structure, see:
// http://en.wikipedia.org/wiki/UTF-8#Description
// https://www.ietf.org/rfc/rfc2279.txt
// https://tools.ietf.org/html/rfc3629
var u0 = heap[idx++];
// High bit clear: single-byte (ASCII) character.
if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; }
// Each continuation byte contributes its low 6 bits.
var u1 = heap[idx++] & 63;
// Lead byte 110xxxxx: two-byte sequence.
if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; }
var u2 = heap[idx++] & 63;
// Lead byte 1110xxxx: three-byte sequence.
if ((u0 & 0xF0) == 0xE0) {
u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
} else {
// Anything else is treated as a four-byte sequence; with ASSERTIONS a
// warning is issued once if the lead byte is not of the form 11110xxx.
#if ASSERTIONS
if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string on the asm.js/wasm heap to a JS string!');
#endif
u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63);
}

if (u0 < 0x10000) {
str += String.fromCharCode(u0);
} else {
// Code points at or above 0x10000 are emitted as a UTF-16 surrogate pair.
var ch = u0 - 0x10000;
str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
}
}
#if TEXTDECODER
}
#endif // TEXTDECODER
// `str` is declared with `var` inside the else-branch above; it is still
// visible here because `var` is function-scoped.
return str;
#endif // TEXTDECODER == 2
}

// Given a pointer 'ptr' to a UTF8-encoded string in the emscripten HEAP, returns a
// copy of that string as a Javascript String object.
// lengthInBytes: specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes)
// will be decoded using utf8 encoding, and any \0 in between will be decoded as-is.
/**
* @param {number} ptr
* @param {number=} lengthInBytes
* @return {string}
*/
function UTF8ToStringNBytes(ptr, lengthInBytes) {
#if CAN_ADDRESS_2GB
ptr >>>= 0;
#endif
#if TEXTDECODER == 2
if (!ptr) return '';
var end = ptr + lengthInBytes;
return UTF8Decoder.decode(HEAPU8.subarray(ptr, end));
#else
return ptr ? UTF8ArrayToStringNBytes(HEAPU8, ptr, lengthInBytes) : '';
#endif
}

// Copies the given Javascript String object 'str' to the given byte array at address 'outIdx',
// encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP.
// Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write.
Expand Down
55 changes: 55 additions & 0 deletions src/runtime_strings_extra.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,37 @@ function UTF16ToString(ptr, maxBytesToRead) {
#endif // TEXTDECODER
}

function UTF16ToStringNBytes(ptr, lengthInBytes) {
#if ASSERTIONS
assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!');
assert(lengthInBytes % 2 == 0, 'Length passed to UTF16ToString must be a even number!');
#endif
#if TEXTDECODER
var endPtr = ptr + lengthInBytes;

#if TEXTDECODER != 2
if (endPtr - ptr > 32 && UTF16Decoder) {
#endif // TEXTDECODER != 2
return UTF16Decoder.decode(HEAPU8.subarray(ptr, endPtr));
#if TEXTDECODER != 2
} else {
#endif // TEXTDECODER != 2
#endif // TEXTDECODER
var i = 0;

var lengthInCodeUnit = lengthInBytes / 2;
var str = '';
while (i < lengthInCodeUnit) {
var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
++i;
// fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
str += String.fromCharCode(codeUnit);
}
#if TEXTDECODER && TEXTDECODER != 2
}
#endif // TEXTDECODER
}

// Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
// null-terminated and encoded in UTF16 form. The copy will require at most str.length*4+2 bytes of space in the HEAP.
// Use the function lengthBytesUTF16() to compute the exact number of bytes (excluding null terminator) that this function will write.
Expand Down Expand Up @@ -145,6 +176,30 @@ function UTF32ToString(ptr, maxBytesToRead) {
return str;
}

function UTF32ToStringNBytes(ptr, lengthInBytes) {
#if ASSERTIONS
assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!');
assert(lengthInBytes % 4 == 0, 'Length passed to UTF32ToString must be multiple of 4!');
#endif
var i = 0;

var lengthInCodePoint = lengthInBytes / 4;
var str = '';
while (i < lengthInCodePoint) {
var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
++i;
// Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
// See http://unicode.org/faq/utf_bom.html#utf16-3
if (utf32 >= 0x10000) {
var ch = utf32 - 0x10000;
str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
} else {
str += String.fromCharCode(utf32);
}
}
return str;
}

// Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
// null-terminated and encoded in UTF32 form. The copy will require at most str.length*4+4 bytes of space in the HEAP.
// Use the function lengthBytesUTF32() to compute the exact number of bytes (excluding null terminator) that this function will write.
Expand Down
Loading

0 comments on commit 5a20826

Please sign in to comment.