Skip to content

Commit

Permalink
fix performance issue with UTF8ToString (emscripten-core#12517)
Browse files Browse the repository at this point in the history
- Add an extra parameter `exactLengthInBytes` to UTF8ToString.
This parameter is optional; if given, the `maxBytesToRead` parameter is ignored.

To keep the API consistent, `UTF16ToString` and `UTF32ToString` are also changed.
  • Loading branch information
LanderlYoung authored and taylorcyang committed Oct 19, 2020
1 parent e05a75e commit 3847cd4
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 32 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -515,3 +515,4 @@ a license to everyone to use it as detailed in LICENSE.)
* kamenokonyokonyoko <kamenokonokotan@gmail.com>
* Lectem <lectem@gmail.com>
* Henrik Algestam <henrik@algestam.se>
* LanderlYoung <landerlyoung@gmail.com>
49 changes: 29 additions & 20 deletions src/runtime_strings.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,26 @@ var UTF8Decoder = typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') :
/**
* @param {number} idx
* @param {number=} maxBytesToRead
* @param {number=} exactLengthInBytes
* @return {string}
*/
function UTF8ArrayToString(heap, idx, maxBytesToRead) {
function UTF8ArrayToString(heap, idx, maxBytesToRead, exactLengthInBytes) {
#if CAN_ADDRESS_2GB
idx >>>= 0;
#endif
var endIdx = idx + maxBytesToRead;
let hasExactLength = typeof exactLengthInBytes === 'number';
var endIdx = idx + (hasExactLength ? exactLengthInBytes : maxBytesToRead);
#if TEXTDECODER
var endPtr = idx;
// TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself.
// Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage.
// (As a tiny code save trick, compare endPtr against endIdx using a negation, so that undefined means Infinity)
while (heap[endPtr] && !(endPtr >= endIdx)) ++endPtr;
if (hasExactLength) {
endPtr = idx + exactLengthInBytes;
} else {
// TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself.
// Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage.
// (As a tiny code save trick, compare endPtr against endIdx using a negation, so that undefined means Infinity)
while (heap[endPtr] && !(endPtr >= endIdx)) ++endPtr;
hasExactLength = true;
}
#endif // TEXTDECODER

#if TEXTDECODER == 2
Expand All @@ -46,22 +53,18 @@ function UTF8ArrayToString(heap, idx, maxBytesToRead) {
} else {
#endif // TEXTDECODER
var str = '';
#if TEXTDECODER
// If building with TextDecoder, we have already computed the string length above, so test loop end condition against that
while (idx < endPtr) {
#else
while (!(idx >= endIdx)) {
#endif
// If we have already computed the string length above, test the loop end condition against that.
while (hasExactLength ? idx < endPtr : !(idx >= endIdx)) {
// For UTF8 byte structure, see:
// http://en.wikipedia.org/wiki/UTF-8#Description
// https://www.ietf.org/rfc/rfc2279.txt
// https://tools.ietf.org/html/rfc3629
var u0 = heap[idx++];
#if !TEXTDECODER
// If not building with TextDecoder enabled, we don't know the string length, so scan for \0 byte.
// If building with TextDecoder, we know exactly at what byte index the string ends, so checking for nulls here would be redundant.
if (!u0) return str;
#endif
if (!hasExactLength) {
// If we don't know the string length, scan for the \0 byte.
// Otherwise we know exactly at what byte index the string ends, so checking for nulls here would be redundant.
if (!u0) return str;
}
if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; }
var u1 = heap[idx++] & 63;
if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; }
Expand Down Expand Up @@ -99,22 +102,28 @@ function UTF8ArrayToString(heap, idx, maxBytesToRead) {
// N.B. mixing frequent uses of UTF8ToString() with and without maxBytesToRead may
// throw JS JIT optimizations off, so it is worth to consider consistently using one
// style or the other.
// exactLengthInBytes: an optional length that specifies the exact length of the string in bytes (without the
// trailing null terminator). If this parameter is given, maxBytesToRead is ignored.
/**
* @param {number} ptr
* @param {number=} maxBytesToRead
* @return {string}
*/
function UTF8ToString(ptr, maxBytesToRead) {
function UTF8ToString(ptr, maxBytesToRead, exactLengthInBytes) {
#if CAN_ADDRESS_2GB
ptr >>>= 0;
#endif
#if TEXTDECODER == 2
if (!ptr) return '';
var maxPtr = ptr + maxBytesToRead;
for(var end = ptr; !(end >= maxPtr) && HEAPU8[end];) ++end;
if (typeof exactLengthInBytes === 'number') {
maxPtr = ptr + exactLengthInBytes;
} else {
for(var end = ptr; !(end >= maxPtr) && HEAPU8[end];) ++end;
}
return UTF8Decoder.decode(HEAPU8.subarray(ptr, end));
#else
return ptr ? UTF8ArrayToString(HEAPU8, ptr, maxBytesToRead) : '';
return ptr ? UTF8ArrayToString(HEAPU8, ptr, maxBytesToRead, exactLengthInBytes) : '';
#endif
}

Expand Down
31 changes: 19 additions & 12 deletions src/runtime_strings_extra.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,20 +39,26 @@ var UTF16Decoder = typeof TextDecoder !== 'undefined' ? new TextDecoder('utf-16l
#endif // TEXTDECODER
#endif // TEXTDECODER == 2

function UTF16ToString(ptr, maxBytesToRead) {
function UTF16ToString(ptr, maxBytesToRead, exactLengthInBytes) {
#if ASSERTIONS
assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!');
#endif
let hasExactLength = typeof exactLengthInBytes === 'number';
#if TEXTDECODER
var endPtr = ptr;
// TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself.
// Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage.
var idx = endPtr >> 1;
var maxIdx = idx + maxBytesToRead / 2;
// If maxBytesToRead is not passed explicitly, it will be undefined, and this
// will always evaluate to true. This saves on code size.
while (!(idx >= maxIdx) && HEAPU16[idx]) ++idx;
endPtr = idx << 1;
if (hasExactLength) {
endPtr = ptr + exactLengthInBytes;
} else {
// TextDecoder needs to know the byte length in advance, it doesn't stop on null terminator by itself.
// Also, use the length info to avoid running tiny strings through TextDecoder, since .subarray() allocates garbage.
var idx = endPtr >> 1;
var maxIdx = idx + maxBytesToRead / 2;
// If maxBytesToRead is not passed explicitly, it will be undefined, and this
// will always evaluate to true. This saves on code size.
while (!(idx >= maxIdx) && HEAPU16[idx]) ++idx;
endPtr = idx << 1;
hasExactLength = true;
}

#if TEXTDECODER != 2
if (endPtr - ptr > 32 && UTF16Decoder) {
Expand All @@ -67,7 +73,7 @@ function UTF16ToString(ptr, maxBytesToRead) {
var str = '';
while (1) {
var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
if (codeUnit == 0 || i == maxBytesToRead / 2) return str;
if (hasExactLength ? i == exactLengthInBytes / 2 : codeUnit == 0 || i == maxBytesToRead / 2) return str;
++i;
// fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
str += String.fromCharCode(codeUnit);
Expand Down Expand Up @@ -120,16 +126,17 @@ function lengthBytesUTF16(str) {
return str.length*2;
}

function UTF32ToString(ptr, maxBytesToRead) {
function UTF32ToString(ptr, maxBytesToRead, exactLengthInBytes) {
#if ASSERTIONS
assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!');
#endif
const hasExactLength = typeof exactLengthInBytes === 'number';
var i = 0;

var str = '';
// If maxBytesToRead is not passed explicitly, it will be undefined, and this
// will always evaluate to true. This saves on code size.
while (!(i >= maxBytesToRead / 4)) {
while (hasExactLength ? i <= exactLengthInBytes / 4 : !(i >= maxBytesToRead / 4)) {
var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
if (utf32 == 0) break;
++i;
Expand Down

0 comments on commit 3847cd4

Please sign in to comment.