From aab9d644464675633752efca7b3b2c2bd6faf46b Mon Sep 17 00:00:00 2001 From: Yagiz Nizipli Date: Wed, 31 Jul 2024 22:10:56 -0400 Subject: [PATCH] src: improve `buffer.transcode` performance --- benchmark/buffers/buffer-transcode.js | 35 +++++++++ src/node_i18n.cc | 107 +++++++++++--------------- 2 files changed, 79 insertions(+), 63 deletions(-) create mode 100644 benchmark/buffers/buffer-transcode.js diff --git a/benchmark/buffers/buffer-transcode.js b/benchmark/buffers/buffer-transcode.js new file mode 100644 index 00000000000000..cbb3b2e9b16374 --- /dev/null +++ b/benchmark/buffers/buffer-transcode.js @@ -0,0 +1,35 @@ +'use strict'; +const common = require('../common.js'); +const assert = require('node:assert'); +const buffer = require('node:buffer'); + +const hasIntl = !!process.config.variables.v8_enable_i18n_support; +const encodings = ['latin1', 'ascii', 'ucs2', 'utf8']; + +if (!hasIntl) { + console.log('Skipping: `transcode` is only available on platforms that support i18n`'); + process.exit(0); +} + +const bench = common.createBenchmark(main, { + fromEncoding: encodings, + toEncoding: encodings, + length: [1, 10, 1000], + n: [1e5], +}, { + combinationFilter(p) { + return !(p.fromEncoding === 'ucs2' && p.toEncoding === 'utf8'); + }, +}); + +function main({ n, fromEncoding, toEncoding, length }) { + const input = Buffer.from('a'.repeat(length)); + let out = 0; + bench.start(); + for (let i = 0; i < n; i++) { + const dest = buffer.transcode(input, fromEncoding, toEncoding); + out += dest.buffer.byteLength; + } + bench.end(n); + assert.ok(out >= 0); +} diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 7a13f35d2f2bcb..43bb68351bf0a6 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -42,6 +42,7 @@ #include "node_i18n.h" #include "node_external_reference.h" +#include "simdutf.h" #if defined(NODE_HAVE_I18N_SUPPORT) @@ -147,7 +148,6 @@ MaybeLocal Transcode(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; MaybeLocal ret; MaybeStackBuffer result; Converter to(toEncoding); @@ -170,22 +170,21 @@ MaybeLocal Transcode(Environment* env, return ret; } -MaybeLocal TranscodeToUcs2(Environment* env, - const char* fromEncoding, - const char* toEncoding, - const char* source, - const size_t source_length, - UErrorCode* status) { - *status = U_ZERO_ERROR; - MaybeLocal ret; +MaybeLocal TranscodeLatin1ToUcs2(Environment* env, + const char* fromEncoding, + const char* toEncoding, + const char* source, + const size_t source_length, + UErrorCode* status) { MaybeStackBuffer destbuf(source_length); - Converter from(fromEncoding); - const size_t length_in_chars = source_length * sizeof(UChar); - ucnv_toUChars(from.conv(), *destbuf, length_in_chars, - source, source_length, status); - if (U_SUCCESS(*status)) - ret = ToBufferEndian(env, &destbuf); - return ret; + auto actual_length = + simdutf::convert_latin1_to_utf16le(source, source_length, destbuf.out()); + if (actual_length == 0) { + *status = U_INVALID_CHAR_FOUND; + return {}; + } + + return Buffer::New(env, &destbuf); } MaybeLocal TranscodeFromUcs2(Environment* env, @@ -194,13 +193,11 @@ MaybeLocal TranscodeFromUcs2(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; MaybeStackBuffer sourcebuf; MaybeLocal ret; Converter to(toEncoding); - size_t sublen = ucnv_getMinCharSize(to.conv()); - std::string sub(sublen, '?'); + std::string sub(to.min_char_size(), '?'); to.set_subst_chars(sub.c_str()); const size_t length_in_chars = source_length / sizeof(UChar); @@ -221,26 +218,18 @@ MaybeLocal TranscodeUcs2FromUtf8(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; - MaybeStackBuffer destbuf; - int32_t result_length; - u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length, - source, source_length, status); - MaybeLocal ret; - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } else if (*status == U_BUFFER_OVERFLOW_ERROR) { - *status = U_ZERO_ERROR; - destbuf.AllocateSufficientStorage(result_length); - u_strFromUTF8(*destbuf, result_length, &result_length, - source, source_length, status); - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } + size_t expected_utf16_length = + simdutf::utf16_length_from_utf8(source, source_length); + MaybeStackBuffer destbuf(expected_utf16_length); + auto actual_length = + simdutf::convert_utf8_to_utf16le(source, source_length, destbuf.out()); + + if (actual_length == 0) { + *status = U_INVALID_CHAR_FOUND; + return {}; } - return ret; + + return Buffer::New(env, &destbuf); } MaybeLocal TranscodeUtf8FromUcs2(Environment* env, @@ -249,32 +238,25 @@ MaybeLocal TranscodeUtf8FromUcs2(Environment* env, const char* source, const size_t source_length, UErrorCode* status) { - *status = U_ZERO_ERROR; - MaybeLocal ret; const size_t length_in_chars = source_length / sizeof(UChar); - int32_t result_length; - MaybeStackBuffer sourcebuf; - MaybeStackBuffer destbuf; - CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars); - u_strToUTF8(*destbuf, destbuf.capacity(), &result_length, - *sourcebuf, length_in_chars, status); - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } else if (*status == U_BUFFER_OVERFLOW_ERROR) { - *status = U_ZERO_ERROR; - destbuf.AllocateSufficientStorage(result_length); - u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf, - length_in_chars, status); - if (U_SUCCESS(*status)) { - destbuf.SetLength(result_length); - ret = ToBufferEndian(env, &destbuf); - } + size_t expected_utf8_length = simdutf::utf8_length_from_utf16le( + reinterpret_cast(source), length_in_chars); + + MaybeStackBuffer destbuf(expected_utf8_length); + auto actual_length = simdutf::convert_utf16le_to_utf8( + reinterpret_cast(source), + length_in_chars, + destbuf.out()); + + if (actual_length == 0) { + *status = U_INVALID_CHAR_FOUND; + return {}; } - return ret; + + return Buffer::New(env, &destbuf); } -const char* EncodingName(const enum encoding encoding) { +constexpr const char* EncodingName(const enum encoding encoding) { switch (encoding) { case ASCII: return "us-ascii"; case LATIN1: return "iso8859-1"; @@ -284,7 +266,7 @@ const char* EncodingName(const enum encoding encoding) { } } -bool SupportedEncoding(const enum encoding encoding) { +constexpr bool SupportedEncoding(const enum encoding encoding) { switch (encoding) { case ASCII: case LATIN1: @@ -309,8 +291,7 @@ void Transcode(const FunctionCallbackInfo&args) { switch (fromEncoding) { case ASCII: case LATIN1: - if (toEncoding == UCS2) - tfn = &TranscodeToUcs2; + if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2; break; case UTF8: if (toEncoding == UCS2)