Skip to content

Commit

Permalink
src: improve buffer.transcode performance
Browse files Browse the repository at this point in the history
PR-URL: #54153
Reviewed-By: Daniel Lemire <daniel@lemire.me>
Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Minwoo Jung <nodecorelab@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
  • Loading branch information
anonrig authored and targos committed Sep 26, 2024
1 parent fffddfa commit c502550
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 63 deletions.
35 changes: 35 additions & 0 deletions benchmark/buffers/buffer-transcode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
'use strict';
const common = require('../common.js');
const assert = require('node:assert');
const buffer = require('node:buffer');

// True when the running Node.js binary reports i18n support in its build
// configuration.
const hasIntl = !!process.config.variables.v8_enable_i18n_support;
// Encodings used for both the source and the target side of the transcode.
const encodings = ['latin1', 'ascii', 'ucs2', 'utf8'];

// Without i18n support there is nothing to measure: report why and exit with
// a success code so the benchmark run is treated as skipped, not failed.
if (!hasIntl) {
  console.log('Skipping: `transcode` is only available on platforms that support i18n');
  process.exit(0);
}

// Benchmark matrix: source encoding x target encoding x input length, with
// `n` transcode calls per combination.
const configs = {
  fromEncoding: encodings,
  toEncoding: encodings,
  length: [1, 10, 1000],
  n: [1e5],
};

const options = {
  // Keep every combination except the single ucs2 -> utf8 pairing.
  combinationFilter(p) {
    return p.fromEncoding !== 'ucs2' || p.toEncoding !== 'utf8';
  },
};

const bench = common.createBenchmark(main, configs, options);

// Times `n` consecutive `buffer.transcode()` calls that convert the same
// input from `fromEncoding` to `toEncoding`.
function main({ n, fromEncoding, toEncoding, length }) {
  // The payload is always the character 'a' repeated `length` times; it is
  // built once, outside the timed region.
  const source = Buffer.from('a'.repeat(length));
  let totalBytes = 0;

  bench.start();
  for (let remaining = n; remaining > 0; remaining--) {
    const transcoded = buffer.transcode(source, fromEncoding, toEncoding);
    // Fold something from each result into an accumulator so the call's
    // output is actually consumed.
    totalBytes += transcoded.buffer.byteLength;
  }
  bench.end(n);

  // Reads the accumulator after timing has stopped.
  assert.ok(totalBytes >= 0);
}
107 changes: 44 additions & 63 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@

#include "node_i18n.h"
#include "node_external_reference.h"
#include "simdutf.h"

#if defined(NODE_HAVE_I18N_SUPPORT)

Expand Down Expand Up @@ -146,7 +147,6 @@ MaybeLocal<Object> Transcode(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeLocal<Object> ret;
MaybeStackBuffer<char> result;
Converter to(toEncoding);
Expand All @@ -169,22 +169,21 @@ MaybeLocal<Object> Transcode(Environment* env,
return ret;
}

MaybeLocal<Object> TranscodeToUcs2(Environment* env,
const char* fromEncoding,
const char* toEncoding,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeLocal<Object> ret;
MaybeLocal<Object> TranscodeLatin1ToUcs2(Environment* env,
const char* fromEncoding,
const char* toEncoding,
const char* source,
const size_t source_length,
UErrorCode* status) {
MaybeStackBuffer<UChar> destbuf(source_length);
Converter from(fromEncoding);
const size_t length_in_chars = source_length * sizeof(UChar);
ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
source, source_length, status);
if (U_SUCCESS(*status))
ret = ToBufferEndian(env, &destbuf);
return ret;
auto actual_length =
simdutf::convert_latin1_to_utf16le(source, source_length, destbuf.out());
if (actual_length == 0) {
*status = U_INVALID_CHAR_FOUND;
return {};
}

return Buffer::New(env, &destbuf);
}

MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
Expand All @@ -193,13 +192,11 @@ MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeStackBuffer<UChar> sourcebuf;
MaybeLocal<Object> ret;
Converter to(toEncoding);

size_t sublen = ucnv_getMinCharSize(to.conv());
std::string sub(sublen, '?');
std::string sub(to.min_char_size(), '?');
to.set_subst_chars(sub.c_str());

const size_t length_in_chars = source_length / sizeof(UChar);
Expand All @@ -220,26 +217,18 @@ MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeStackBuffer<UChar> destbuf;
int32_t result_length;
u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
source, source_length, status);
MaybeLocal<Object> ret;
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
*status = U_ZERO_ERROR;
destbuf.AllocateSufficientStorage(result_length);
u_strFromUTF8(*destbuf, result_length, &result_length,
source, source_length, status);
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
}
size_t expected_utf16_length =
simdutf::utf16_length_from_utf8(source, source_length);
MaybeStackBuffer<UChar> destbuf(expected_utf16_length);
auto actual_length =
simdutf::convert_utf8_to_utf16le(source, source_length, destbuf.out());

if (actual_length == 0) {
*status = U_INVALID_CHAR_FOUND;
return {};
}
return ret;

return Buffer::New(env, &destbuf);
}

MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
Expand All @@ -248,32 +237,25 @@ MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
const char* source,
const size_t source_length,
UErrorCode* status) {
*status = U_ZERO_ERROR;
MaybeLocal<Object> ret;
const size_t length_in_chars = source_length / sizeof(UChar);
int32_t result_length;
MaybeStackBuffer<UChar> sourcebuf;
MaybeStackBuffer<char> destbuf;
CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
*sourcebuf, length_in_chars, status);
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
*status = U_ZERO_ERROR;
destbuf.AllocateSufficientStorage(result_length);
u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
length_in_chars, status);
if (U_SUCCESS(*status)) {
destbuf.SetLength(result_length);
ret = ToBufferEndian(env, &destbuf);
}
size_t expected_utf8_length = simdutf::utf8_length_from_utf16le(
reinterpret_cast<const char16_t*>(source), length_in_chars);

MaybeStackBuffer<char> destbuf(expected_utf8_length);
auto actual_length = simdutf::convert_utf16le_to_utf8(
reinterpret_cast<const char16_t*>(source),
length_in_chars,
destbuf.out());

if (actual_length == 0) {
*status = U_INVALID_CHAR_FOUND;
return {};
}
return ret;

return Buffer::New(env, &destbuf);
}

const char* EncodingName(const enum encoding encoding) {
constexpr const char* EncodingName(const enum encoding encoding) {
switch (encoding) {
case ASCII: return "us-ascii";
case LATIN1: return "iso8859-1";
Expand All @@ -283,7 +265,7 @@ const char* EncodingName(const enum encoding encoding) {
}
}

bool SupportedEncoding(const enum encoding encoding) {
constexpr bool SupportedEncoding(const enum encoding encoding) {
switch (encoding) {
case ASCII:
case LATIN1:
Expand All @@ -308,8 +290,7 @@ void Transcode(const FunctionCallbackInfo<Value>&args) {
switch (fromEncoding) {
case ASCII:
case LATIN1:
if (toEncoding == UCS2)
tfn = &TranscodeToUcs2;
if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2;
break;
case UTF8:
if (toEncoding == UCS2)
Expand Down

0 comments on commit c502550

Please sign in to comment.