From 7dfe67154d1e65e97cacdaac2192b81cf8be8164 Mon Sep 17 00:00:00 2001
From: landerlyoung
Date: Tue, 20 Oct 2020 15:07:57 +0800
Subject: [PATCH] Fix performance issue with UTF8ToString (#12517)

Introduce new functions:
- UTF8ToStringWithLength
- UTF16ToStringWithLength
- UTF32ToStringWithLength

They decode exactly the given number of bytes; any '\0' in between is
kept as-is.

These functions take a `lengthInBytes` argument, so there is no need to
iterate over the heap to find a null terminator, which gives better
performance.
---
 AUTHORS                      |   1 +
 src/runtime_strings.js       |  79 +++++++++++++
 src/runtime_strings_extra.js |  55 +++++++++
 tests/utf32.cpp              | 220 +++++++++++++++++++++++++----------
 tests/utf8.cpp               |  33 ++++++
 5 files changed, 325 insertions(+), 63 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 7764a002a7f4..cc654f6af195 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -515,3 +515,4 @@ a license to everyone to use it as detailed in LICENSE.)
 * kamenokonyokonyoko
 * Lectem
 * Henrik Algestam
+* LanderlYoung
diff --git a/src/runtime_strings.js b/src/runtime_strings.js
index f378e89c92fd..49ec7ef3e252 100644
--- a/src/runtime_strings.js
+++ b/src/runtime_strings.js
@@ -118,6 +118,85 @@ function UTF8ToString(ptr, maxBytesToRead) {
 #endif
 }
 
+
+/**
+ * @param {number} idx
+ * @param {number} lengthInBytes
+ * @return {string}
+ */
+function UTF8ArrayToStringWithLength(heap, idx, lengthInBytes) {
+#if CAN_ADDRESS_2GB
+  idx >>>= 0;
+#endif
+  var endPtr = idx + lengthInBytes;
+
+#if TEXTDECODER == 2
+  return UTF8Decoder.decode(
+    heap.subarray ? heap.subarray(idx, endPtr) : new Uint8Array(heap.slice(idx, endPtr))
+  );
+#else // TEXTDECODER == 2
+#if TEXTDECODER
+  if (endPtr - idx > 16 && heap.subarray && UTF8Decoder) {
+    return UTF8Decoder.decode(heap.subarray(idx, endPtr));
+  } else {
+#endif // TEXTDECODER
+    var str = '';
+
+    while (idx < endPtr) {
+      // For UTF8 byte structure, see:
+      // http://en.wikipedia.org/wiki/UTF-8#Description
+      // https://www.ietf.org/rfc/rfc2279.txt
+      // https://tools.ietf.org/html/rfc3629
+      var u0 = heap[idx++];
+      if (!(u0 & 0x80)) { str += String.fromCharCode(u0); continue; }
+      var u1 = heap[idx++] & 63;
+      if ((u0 & 0xE0) == 0xC0) { str += String.fromCharCode(((u0 & 31) << 6) | u1); continue; }
+      var u2 = heap[idx++] & 63;
+      if ((u0 & 0xF0) == 0xE0) {
+        u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
+      } else {
+#if ASSERTIONS
+        if ((u0 & 0xF8) != 0xF0) warnOnce('Invalid UTF-8 leading byte 0x' + u0.toString(16) + ' encountered when deserializing a UTF-8 string on the asm.js/wasm heap to a JS string!');
+#endif
+        u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (heap[idx++] & 63);
+      }
+
+      if (u0 < 0x10000) {
+        str += String.fromCharCode(u0);
+      } else {
+        var ch = u0 - 0x10000;
+        str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+      }
+    }
+#if TEXTDECODER
+  }
+#endif // TEXTDECODER
+  return str;
+#endif // TEXTDECODER == 2
+}
+
+// Given a pointer 'ptr' to a UTF8-encoded string in the emscripten HEAP, returns a
+// copy of that string as a Javascript String object.
+// lengthInBytes: specifies the number of bytes to read. The string at [ptr, ptr + lengthInBytes)
+// will be decoded using utf8 encoding, and any \0 in between will be decoded as-is.
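+//
+// Illustrative example (not part of this patch; behavior as described above): if the heap
+// at 'ptr' holds the five bytes 'a', 'b', '\0', 'c', 'd', then UTF8ToStringWithLength(ptr, 5)
+// returns the JS string 'ab\u0000cd', i.e. exactly 5 bytes are decoded and the embedded NUL
+// is preserved, whereas UTF8ToString(ptr) would stop at the NUL and return only 'ab'.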
+/**
+ * @param {number} ptr
+ * @param {number} lengthInBytes
+ * @return {string}
+ */
+function UTF8ToStringWithLength(ptr, lengthInBytes) {
+#if CAN_ADDRESS_2GB
+  ptr >>>= 0;
+#endif
+#if TEXTDECODER == 2
+  if (!ptr) return '';
+  var end = ptr + lengthInBytes;
+  return UTF8Decoder.decode(HEAPU8.subarray(ptr, end));
+#else
+  return ptr ? UTF8ArrayToStringWithLength(HEAPU8, ptr, lengthInBytes) : '';
+#endif
+}
+
 // Copies the given Javascript String object 'str' to the given byte array at address 'outIdx',
 // encoded in UTF8 form and null-terminated. The copy will require at most str.length*4+1 bytes of space in the HEAP.
 // Use the function lengthBytesUTF8 to compute the exact number of bytes (excluding null terminator) that this function will write.
diff --git a/src/runtime_strings_extra.js b/src/runtime_strings_extra.js
index dd2e86d75c46..c0d8d161b734 100644
--- a/src/runtime_strings_extra.js
+++ b/src/runtime_strings_extra.js
@@ -77,6 +77,37 @@ function UTF16ToString(ptr, maxBytesToRead) {
 #endif // TEXTDECODER
 }
 
+function UTF16ToStringWithLength(ptr, lengthInBytes) {
+#if ASSERTIONS
+  assert(ptr % 2 == 0, 'Pointer passed to UTF16ToStringWithLength must be aligned to two bytes!');
+  assert(lengthInBytes % 2 == 0, 'Length passed to UTF16ToStringWithLength must be an even number!');
+#endif
+#if TEXTDECODER
+  var endPtr = ptr + lengthInBytes;
+
+#if TEXTDECODER != 2
+  if (endPtr - ptr > 32 && UTF16Decoder) {
+#endif // TEXTDECODER != 2
+    return UTF16Decoder.decode(HEAPU8.subarray(ptr, endPtr));
+#if TEXTDECODER != 2
+  } else {
+#endif // TEXTDECODER != 2
+#endif // TEXTDECODER
+    var i = 0;
+
+    var lengthInCodeUnit = lengthInBytes / 2;
+    var str = '';
+    while (i < lengthInCodeUnit) {
+      var codeUnit = {{{ makeGetValue('ptr', 'i*2', 'i16') }}};
+      ++i;
+      // fromCharCode constructs a character from a UTF-16 code unit, so we can pass the UTF16 string right through.
+      str += String.fromCharCode(codeUnit);
+    }
+    return str;
+#if TEXTDECODER && TEXTDECODER != 2
+  }
+#endif // TEXTDECODER
+}
+
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
 // null-terminated and encoded in UTF16 form. The copy will require at most str.length*4+2 bytes of space in the HEAP.
 // Use the function lengthBytesUTF16() to compute the exact number of bytes (excluding null terminator) that this function will write.
@@ -145,6 +176,30 @@ function UTF32ToString(ptr, maxBytesToRead) {
   return str;
 }
 
+function UTF32ToStringWithLength(ptr, lengthInBytes) {
+#if ASSERTIONS
+  assert(ptr % 4 == 0, 'Pointer passed to UTF32ToStringWithLength must be aligned to four bytes!');
+  assert(lengthInBytes % 4 == 0, 'Length passed to UTF32ToStringWithLength must be a multiple of 4!');
+#endif
+  var i = 0;
+
+  var lengthInCodePoint = lengthInBytes / 4;
+  var str = '';
+  while (i < lengthInCodePoint) {
+    var utf32 = {{{ makeGetValue('ptr', 'i*4', 'i32') }}};
+    ++i;
+    // Gotcha: fromCharCode constructs a character from a UTF-16 encoded code (pair), not from a Unicode code point! So encode the code point to UTF-16 for constructing.
+    // See http://unicode.org/faq/utf_bom.html#utf16-3
+    if (utf32 >= 0x10000) {
+      var ch = utf32 - 0x10000;
+      str += String.fromCharCode(0xD800 | (ch >> 10), 0xDC00 | (ch & 0x3FF));
+    } else {
+      str += String.fromCharCode(utf32);
+    }
+  }
+  return str;
+}
+
 // Copies the given Javascript String object 'str' to the emscripten HEAP at address 'outPtr',
 // null-terminated and encoded in UTF32 form. The copy will require at most str.length*4+4 bytes of space in the HEAP.
 // Use the function lengthBytesUTF32() to compute the exact number of bytes (excluding null terminator) that this function will write.
diff --git a/tests/utf32.cpp b/tests/utf32.cpp
index 6acd298a1e52..4539f7791ef8 100644
--- a/tests/utf32.cpp
+++ b/tests/utf32.cpp
@@ -3,10 +3,11 @@
 // University of Illinois/NCSA Open Source License. Both these licenses can be
 // found in the LICENSE file.
 
+#include 
+#include 
 #include 
 #include 
-#include 
-#include 
+#include 
 #include 
 
 typedef unsigned int utf32;
@@ -14,65 +15,158 @@ typedef unsigned short utf16;
 
 // This code tests that Unicode std::wstrings can be marshalled between C++ and JS.
 int main() {
-  std::wstring wstr = L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC is the Euro sign, U+2007C is a Chinese Han character that looks like three raindrops.
-
-  printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t));
-
-  if (sizeof(wchar_t) == 4) {
-    utf32 *memory = new utf32[wstr.length()+1];
-
-    EM_ASM({
-      var str = UTF32ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF32(str, $1, $2);
-      if (numBytesWritten != 23*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, (wstr.length()+1)*sizeof(utf32));
-
-    // Compare memory to confirm that the string is intact after taking a route through JS side.
-    const utf32 *srcPtr = reinterpret_cast<const utf32 *>(wstr.c_str());
-    for(int i = 0;; ++i) {
-      assert(memory[i] == srcPtr[i]);
-      if (srcPtr[i] == 0)
-        break;
-    }
-
-    EM_ASM({
-      var str = UTF32ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF32(str, $1, $2);
-      if (numBytesWritten != 5*4) throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, 6*sizeof(utf32));
-    assert(memory[5] == 0);
-
-    delete[] memory;
-  } else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar.
-    utf16 *memory = new utf16[2*wstr.length()+1];
-
-    EM_ASM({
-      var str = UTF16ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF16(str, $1, $2);
-      if (numBytesWritten != 25*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, (2*wstr.length()+1)*sizeof(utf16));
-
-    // Compare memory to confirm that the string is intact after taking a route through JS side.
-    const utf16 *srcPtr = reinterpret_cast<const utf16 *>(wstr.c_str());
-    for(int i = 0;; ++i) {
-      assert(memory[i] == srcPtr[i]);
-      if (srcPtr[i] == 0)
-        break;
-    }
-
-    EM_ASM({
-      var str = UTF16ToString($0);
-      out(str);
-      var numBytesWritten = stringToUTF16(str, $1, $2);
-      if (numBytesWritten != 5*2) throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
-    }, wstr.c_str(), memory, 6*sizeof(utf16));
-    assert(memory[5] == 0);
-
-    delete[] memory;
-  }
-
-  printf("OK.\n");
+  std::wstring wstr =
+      L"abc\u2603\u20AC\U0002007C123 --- abc\u2603\u20AC\U0002007C123"; // U+2603 is snowman, U+20AC
+                                                                        // is the Euro sign, U+2007C
+                                                                        // is a Chinese Han character
+                                                                        // that looks like three
+                                                                        // raindrops.
+
+  printf("sizeof(wchar_t): %d.\n", (int)sizeof(wchar_t));
+
+  if (sizeof(wchar_t) == 4) {
+    utf32* memory = new utf32[wstr.length() + 1];
+
+    EM_ASM(
+        {
+          var str = UTF32ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF32(str, $1, $2);
+          if (numBytesWritten != 23 * 4)
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, (wstr.length() + 1) * sizeof(utf32));
+
+    // Compare memory to confirm that the string is intact after taking a route through JS side.
+    const utf32* srcPtr = reinterpret_cast<const utf32*>(wstr.c_str());
+    for (int i = 0;; ++i) {
+      assert(memory[i] == srcPtr[i]);
+      if (srcPtr[i] == 0)
+        break;
+    }
+
+    EM_ASM(
+        {
+          var str = UTF32ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF32(str, $1, $2);
+          if (numBytesWritten != 5 * 4)
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, 6 * sizeof(utf32));
+    assert(memory[5] == 0);
+
+    // UTF32ToStringWithLength on a substring that is not null-terminated
+    size_t dashIndex = wstr.find(L'-');
+    std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1);
+    int outLength = EM_ASM_INT(
+        {
+          var str = UTF32ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF32(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        subString.data(), subString.length() * sizeof(utf32), memory,
+        (wstr.length() + 1) * sizeof(utf32));
+    assert(outLength == subString.length() * sizeof(utf32));
+
+    // UTF32ToStringWithLength on a string with an embedded '\0'
+    std::wstring wstr2 = wstr;
+    wstr2[dashIndex] = L'\0';
+    int outLength2 = EM_ASM_INT(
+        {
+          var str = UTF32ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF32(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF32 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        wstr2.c_str(), wstr2.length() * sizeof(utf32), memory, (wstr.length() + 1) * sizeof(utf32));
+    assert(outLength2 == wstr2.length() * sizeof(utf32));
+    assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length()));
+
+    delete[] memory;
+  } else { // sizeof(wchar_t) == 2, and we're building with -fshort-wchar.
+    utf16* memory = new utf16[2 * wstr.length() + 1];
+
+    EM_ASM(
+        {
+          var str = UTF16ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF16(str, $1, $2);
+          if (numBytesWritten != 25 * 2)
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, (2 * wstr.length() + 1) * sizeof(utf16));
+
+    // Compare memory to confirm that the string is intact after taking a route through JS side.
+    const utf16* srcPtr = reinterpret_cast<const utf16*>(wstr.c_str());
+    for (int i = 0;; ++i) {
+      assert(memory[i] == srcPtr[i]);
+      if (srcPtr[i] == 0)
+        break;
+    }
+
+    EM_ASM(
+        {
+          var str = UTF16ToString($0);
+          out(str);
+          var numBytesWritten = stringToUTF16(str, $1, $2);
+          if (numBytesWritten != 5 * 2)
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten;
+        },
+        wstr.c_str(), memory, 6 * sizeof(utf16));
+    assert(memory[5] == 0);
+
+    // UTF16ToStringWithLength on a substring that is not null-terminated
+    size_t dashIndex = wstr.find(L'-');
+    std::wstring_view subString = std::wstring_view(wstr).substr(0, dashIndex + 1);
+    int outLength = EM_ASM_INT(
+        {
+          var str = UTF16ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF16(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        subString.data(), subString.length() * sizeof(utf16), memory,
+        (wstr.length() + 1) * sizeof(utf16));
+    assert(outLength == subString.length() * sizeof(utf16));
+
+    // UTF16ToStringWithLength on a string with an embedded '\0'
+    std::wstring wstr2 = wstr;
+    wstr2[dashIndex] = L'\0';
+    int outLength2 = EM_ASM_INT(
+        {
+          var str = UTF16ToStringWithLength($0, $1);
+          out(str);
+          var expectedBytesWritten = $1;
+          var numBytesWritten = stringToUTF16(str, $2, $3);
+          if (numBytesWritten != expectedBytesWritten) {
+            throw 'stringToUTF16 wrote an invalid length ' + numBytesWritten + ' != ' +
+                expectedBytesWritten;
+          }
+          return numBytesWritten;
+        },
+        wstr2.c_str(), wstr2.length() * sizeof(utf16), memory, (wstr.length() + 1) * sizeof(utf16));
+    assert(outLength2 == wstr2.length() * sizeof(utf16));
+    assert(wstr2 == std::wstring_view((wchar_t*)memory, wstr2.length()));
+
+    delete[] memory;
+  }
+
+  printf("OK.\n");
 }
diff --git a/tests/utf8.cpp b/tests/utf8.cpp
index fabb1412c602..0d91ad07873c 100644
--- a/tests/utf8.cpp
+++ b/tests/utf8.cpp
@@ -8,6 +8,8 @@
 #include 
 #include 
 #include 
+#include <string>
+#include <string_view>
 #include 
 
 // This code tests that Unicode std::wstrings can be marshalled between C++ and JS.
@@ -50,6 +52,37 @@ int main() {
     printf("i=%d:%u,%u\n", i, (unsigned int)(unsigned char)utf8String[i], (unsigned int)(unsigned char)utf8String2[i]);
   assert(!strcmp(utf8String, utf8String2));
 
+  // UTF8ToStringWithLength decodes a string that is not null-terminated.
+  std::string utf8StringObject(utf8String);
+  std::string_view utf8SubString = std::string_view(utf8StringObject)
+                                       .substr(0, utf8StringObject.find('-') + 1);
+  char utf8String3[128] = {};
+  EM_ASM({
+    var str = UTF8ToStringWithLength($0, $1);
+    out(str);
+    var expectBytesWritten = $1;
+    var numBytesWritten = stringToUTF8(str, $2, $3);
+    if (numBytesWritten != expectBytesWritten) throw 'stringToUTF8 wrote an invalid length ' + numBytesWritten + " != " + expectBytesWritten;
+  }, utf8SubString.data(), utf8SubString.length(), utf8String3, 128);
+  assert(utf8SubString.length() == strlen(utf8String3));
+  assert(utf8SubString == utf8String3);
+
+  // UTF8ToStringWithLength decodes a string which contains '\0' inside.
+  std::string utf8StringObject1(utf8String);
+  // Change the '-' to '\0'.
+  utf8StringObject1[utf8StringObject1.find('-')] = '\0';
+  char utf8String4[128] = {};
+  int outLength = EM_ASM_INT({
+    var str = UTF8ToStringWithLength($0, $1);
+    out(str);
+    var expectBytesWritten = $1;
+    var numBytesWritten = stringToUTF8(str, $2, $3);
+    if (numBytesWritten != expectBytesWritten) throw 'stringToUTF8 wrote an invalid length ' + numBytesWritten + " != " + expectBytesWritten;
+    return numBytesWritten;
+  }, utf8StringObject1.c_str(), utf8StringObject1.length(), utf8String4, 128);
+  assert(utf8StringObject1.length() == outLength);
+  assert(utf8StringObject1 == std::string_view(utf8String4, outLength));
+
   // Test that text gets properly cut off if output buffer is too small.
   EM_ASM({
     var str = UTF8ToString($0);