Fix performance issue with UTF8ToString (emscripten-core#12517)

Introduce new functions: - UTF8ToStringNBytes - UTF16ToStringNBytes - UTF32ToStringNBytes, and add docs to preamble.js.rst. These functions decode a string of exactly the given length; any '\0' in between is kept as-is. They require a `lengthInBytes` argument, so there is no need to iterate over the heap to find a null terminator, which gives better performance.
LanderlYoung · Oct 26, 2020 · 5a20826 · 5a20826
1 parent f9d491b
commit 5a20826
Show file tree

Hide file tree

Showing 6 changed files with 356 additions and 63 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -519,3 +519,4 @@ a license to everyone to use it as detailed in LICENSE.)
 * Pawel Czarnecki <pawel@8thwall.com> (copyright owned by 8th Wall, Inc.)
 * Dhairya Bahl < dhairyabahl5@gmail.com >
 * Sam Gao <gaoshan274@gmail.com>
+* LanderlYoung <landerlyoung@gmail.com>
diff --git a/site/source/docs/api_reference/preamble.js.rst b/site/source/docs/api_reference/preamble.js.rst
@@ -180,6 +180,17 @@ Conversion functions — strings, pointers and arrays
   :param maxBytesToWrite: A limit on the number of bytes that this function can at most write out. If the string is longer than this, the output is truncated. The outputted string will always be null terminated, even if truncation occurred, as long as ``maxBytesToWrite > 0``.
 
 
+
+.. js:function:: UTF8ToStringNBytes(ptr, lengthInBytes)
+
+  Given a pointer ``ptr`` to a UTF8-encoded string in the emscripten HEAP, returns a copy of that string as a Javascript String object.
+
+  :param ptr: A pointer to a UTF8-encoded string in the Emscripten HEAP.
+  :param lengthInBytes: Specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes) will be decoded using utf8 encoding, and any ``\0`` in between will be decoded as-is.
+  :returns: A JavaScript ``String`` object
+
+
+
 .. js:function:: UTF16ToString(ptr)
 
   Given a pointer ``ptr`` to a null-terminated UTF16LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object.
@@ -202,6 +213,16 @@ Conversion functions — strings, pointers and arrays
 
 
 
+.. js:function:: UTF16ToStringNBytes(ptr, lengthInBytes)
+
+  Given a pointer ``ptr`` to a UTF16LE-encoded string in the emscripten HEAP, returns a copy of that string as a Javascript String object.
+
+  :param ptr: A pointer to a UTF16LE-encoded string in the Emscripten HEAP.
+  :param lengthInBytes: Specifies the number of bytes to read (must be a multiple of 2). The string at [ptr, ptr + lengthInBytes) will be decoded using UTF16LE encoding, and any ``\0`` code units in between will be decoded as-is.
+  :returns: A JavaScript ``String`` object
+
+
+
 .. js:function:: UTF32ToString(ptr)
 
   Given a pointer ``ptr`` to a null-terminated UTF32LE-encoded string in the Emscripten HEAP, returns a copy of that string as a JavaScript ``String`` object.
@@ -223,6 +244,16 @@ Conversion functions — strings, pointers and arrays
 
 
 
+.. js:function:: UTF32ToStringNBytes(ptr, lengthInBytes)
+
+  Given a pointer ``ptr`` to a UTF32LE-encoded string in the emscripten HEAP, returns a copy of that string as a Javascript String object.
+
+  :param ptr: A pointer to a UTF32LE-encoded string in the Emscripten HEAP.
+  :param lengthInBytes: Specifies the number of bytes to read (must be a multiple of 4). The string at [ptr, ptr + lengthInBytes) will be decoded using UTF32LE encoding, and any ``\0`` code points in between will be decoded as-is.
+  :returns: A JavaScript ``String`` object
+
+
+
 .. js:function:: AsciiToString(ptr)
 
   Converts an ASCII or Latin-1 encoded string to a JavaScript String object.

diff --git a/src/runtime_strings.js b/src/runtime_strings.js
@@ -118,6 +118,85 @@ function UTF8ToString(ptr, maxBytesToRead) {
 #endif
 }
 
+
+/** Decodes exactly lengthInBytes bytes of UTF-8 data from heap, starting at index idx, into a JS string. The end of the range is fixed at idx + lengthInBytes, so zero bytes are decoded like any other byte instead of ending the string.
+ * @param {number} idx Index in heap of the first byte to decode.
+ * @param {number=} lengthInBytes Number of bytes to decode. NOTE(review): annotated as optional, but the body uses it unconditionally to compute the end index - confirm whether callers may really omit it.
+ * @return {string} The decoded string.
+ */
+function UTF8ArrayToStringNBytes(heap, idx, lengthInBytes) {
+#if CAN_ADDRESS_2GB
+  idx >>>= 0; // reinterpret idx as an unsigned 32-bit value
+#endif
+  var endPtr = idx + lengthInBytes; // exclusive end index of the range to decode
+
+#if TEXTDECODER == 2
+  return UTF8Decoder.decode(
+    heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr)) // a heap without subarray() is sliced and copied into a Uint8Array first
+  );
+#else // TEXTDECODER == 2
+#if TEXTDECODER
+  if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) { // the decoder is used only for ranges longer than 16 bytes, when heap has subarray() and a decoder exists
+    return UTF8Decoder.decode(heap.subarray(idx, endPtr));
+  } else {
+#endif // TEXTDECODER
+    var str = '';
+
+    while (idx < endPtr) {
+      // Manual decoding: u0 is the leading byte; u1/u2 are the low 6 bits of the following bytes.
+      // For UTF8 byte structure, see:
+      // http://en.wikipedia.org/wiki/UTF-8#Description
+      // https://www.ietf.org/rfc/rfc2279.txt / https://tools.ietf.org/html/rfc3629
+      var u0 = heap[idx++];
+      if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; } // high bit clear: one-byte character (including 0)
+      var u1 = heap[idx++] & 63; // NOTE: following bytes are read without re-checking idx against endPtr
+      if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; } // 110xxxxx: two-byte sequence
+      var u2 = heap[idx++] & 63;
+      if ((u0 & 0xF0) == 0xE0) {
+        u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; // 1110xxxx: three-byte sequence
+      } else {
+#if ASSERTIONS
+        if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string on the asm.js/wasm heap to a JS string!');
+#endif
+        u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63); // everything else is decoded as a four-byte sequence
+      }
+
+      if (u0 < 0x10000) {
+        str += String.fromCharCode(u0);
+      } else {
+        var ch = u0 - 0x10000; // values of 0x10000 and above are emitted as a UTF-16 surrogate pair
+        str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+      }
+    }
+#if TEXTDECODER
+  }
+#endif // TEXTDECODER
+  return str;
+#endif // TEXTDECODER == 2
+}
+
+// Given a pointer 'ptr' to a UTF8-encoded string in the emscripten HEAP, returns a
+// copy of that string as a Javascript String object.
+// lengthInBytes:  specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes)
+//                 will be decoded using utf8 encoding, and any \0 in between will be decoded as-is.
+/**
+ * @param {number} ptr
+ * @param {number=} lengthInBytes
+ * @return {string}
+ */
+function UTF8ToStringNBytes(ptr, lengthInBytes) {
+#if CAN_ADDRESS_2GB
+  ptr >>>= 0;
+#endif
+#if TEXTDECODER == 2
+  if (!ptr) return '';
+  var end = ptr + lengthInBytes;
+  return UTF8Decoder.decode(HEAPU8.subarray(ptr, end));
+#else
+  return ptr ? UTF8ArrayToStringNBytes(HEAPU8, ptr, lengthInBytes) : '';
+#endif
+}
+
 // Copies the given Javascript String object 'str' to the given byte array at address 'outIdx',
 // encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP.
 // Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write.

diff --git a/src/runtime_strings_extra.js b/src/runtime_strings_extra.js
@@ -77,6 +77,37 @@ function UTF16ToString(ptr, maxBytesToRead) {
 #endif // TEXTDECODER
 }
 
+function UTF16ToStringNBytes(ptr, lengthInBytes) {
+#if ASSERTIONS
+  assert(ptr % 2 == 0, 'Pointer passed to UTF16ToString must be aligned to two bytes!');
+  assert(lengthInBytes % 2 == 0, 'Length passed to UTF16ToString must be a even number!');
+#endif
+#if TEXTDECODER
+  var endPtr = ptr + lengthInBytes;
+
+#if TEXTDECODER != 2
+  if (endPtr - ptr > 32 && UTF16Decoder) {
+#endif // TEXTDECODER != 2
+    return UTF16Decoder.decode(HEAPU8.subarray(ptr, endPtr));
+#if TEXTDECODER != 2
+  } else {
+#endif // TEXTDECODER != 2
+#endif // TEXTDECODER
+    var i = 0;
+
+    var lengthInCodeUnit = lengthInBytes / 2;
+    var str = '';
+    while (i < lengthInCodeUnit) {
+      var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
+      ++i;
+      // fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
+      str += String.fromCharCode(codeUnit);
+    }
+#if TEXTDECODER && TEXTDECODER != 2
+  }
+#endif // TEXTDECODER
+}
+
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
 // null-terminated and encoded in UTF16 form. The copy will require at most str.length*4+2 bytes of space in the HEAP.
 // Use the function lengthBytesUTF16() to compute the exact number of bytes (excluding null terminator) that this function will write.
@@ -145,6 +176,30 @@ function UTF32ToString(ptr, maxBytesToRead) {
   return str;
 }
 
+function UTF32ToStringNBytes(ptr, lengthInBytes) {
+#if ASSERTIONS
+  assert(ptr % 4 == 0, 'Pointer passed to UTF32ToString must be aligned to four bytes!');
+  assert(lengthInBytes % 4 == 0, 'Length passed to UTF32ToString must be multiple of 4!');
+#endif
+  var i = 0;
+
+  var lengthInCodePoint = lengthInBytes / 4;
+  var str = '';
+  while (i < lengthInCodePoint) {
+    var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
+    ++i;
+    // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
+    // See http://unicode.org/faq/utf_bom.html#utf16-3
+    if (utf32 >= 0x10000) {
+      var ch = utf32 - 0x10000;
+      str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+    } else {
+      str += String.fromCharCode(utf32);
+    }
+  }
+  return str;
+}
+
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
 // null-terminated and encoded in UTF32 form. The copy will require at most str.length*4+4 bytes of space in the HEAP.
 // Use the function lengthBytesUTF32() to compute the exact number of bytes (excluding null terminator) that this function will write.