Skip to content

Commit

Permalink
src: improve utf8 string generation performance
Browse files: browse the repository at this point in the history
PR-URL: nodejs#54873
Reviewed-By: Daniel Lemire <daniel@lemire.me>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Stephen Belanger <admin@stephenbelanger.com>
  • Loading branch information
anonrig authored and louwers committed Nov 2, 2024
1 parent bfba8f3 commit a04d08f
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 21 deletions.
40 changes: 22 additions & 18 deletions src/string_bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -388,47 +388,47 @@ Maybe<size_t> StringBytes::StorageSize(Isolate* isolate,
Local<Value> val,
enum encoding encoding) {
HandleScope scope(isolate);
size_t data_size = 0;
bool is_buffer = Buffer::HasInstance(val);

if (is_buffer && (encoding == BUFFER || encoding == LATIN1)) {
if (Buffer::HasInstance(val) && (encoding == BUFFER || encoding == LATIN1)) {
return Just(Buffer::Length(val));
}

Local<String> str;
if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str))
return Nothing<size_t>();
String::ValueView view(isolate, str);
size_t data_size = 0;

switch (encoding) {
case ASCII:
case LATIN1:
data_size = str->Length();
data_size = view.length();
break;

case BUFFER:
case UTF8:
// A single UCS2 codepoint never takes up more than 3 utf8 bytes.
// It is an exercise for the caller to decide when a string is
// long enough to justify calling Size() instead of StorageSize().
data_size = 3 * str->Length();
data_size = 3 * view.length();
break;

case UCS2:
data_size = str->Length() * sizeof(uint16_t);
data_size = view.length() * sizeof(uint16_t);
break;

case BASE64URL:
data_size = simdutf::base64_length_from_binary(str->Length(),
data_size = simdutf::base64_length_from_binary(view.length(),
simdutf::base64_url);
break;

case BASE64:
data_size = simdutf::base64_length_from_binary(str->Length());
data_size = simdutf::base64_length_from_binary(view.length());
break;

case HEX:
CHECK(str->Length() % 2 == 0 && "invalid hex string length");
data_size = str->Length() / 2;
CHECK(view.length() % 2 == 0 && "invalid hex string length");
data_size = view.length() / 2;
break;

default:
Expand All @@ -449,32 +449,36 @@ Maybe<size_t> StringBytes::Size(Isolate* isolate,
Local<String> str;
if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str))
return Nothing<size_t>();
String::ValueView view(isolate, str);

switch (encoding) {
case ASCII:
case LATIN1:
return Just<size_t>(str->Length());
return Just<size_t>(view.length());

case BUFFER:
case UTF8:
return Just<size_t>(str->Utf8Length(isolate));
if (view.is_one_byte()) {
return Just<size_t>(simdutf::utf8_length_from_latin1(
reinterpret_cast<const char*>(view.data8()), view.length()));
}
return Just<size_t>(simdutf::utf8_length_from_utf16(
reinterpret_cast<const char16_t*>(view.data16()), view.length()));

case UCS2:
return Just(str->Length() * sizeof(uint16_t));
return Just(view.length() * sizeof(uint16_t));

case BASE64URL: {
String::Value value(isolate, str);
return Just(simdutf::base64_length_from_binary(value.length(),
return Just(simdutf::base64_length_from_binary(view.length(),
simdutf::base64_url));
}

case BASE64: {
String::Value value(isolate, str);
return Just(simdutf::base64_length_from_binary(value.length()));
return Just(simdutf::base64_length_from_binary(view.length()));
}

case HEX:
return Just<size_t>(str->Length() / 2);
return Just<size_t>(view.length() / 2);
}

UNREACHABLE();
Expand Down
28 changes: 25 additions & 3 deletions src/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include <sys/types.h>
#endif

#include <simdutf.h>

#include <atomic>
#include <cstdio>
#include <cstring>
Expand Down Expand Up @@ -100,11 +102,31 @@ static void MakeUtf8String(Isolate* isolate,
MaybeStackBuffer<T>* target) {
Local<String> string;
if (!value->ToString(isolate->GetCurrentContext()).ToLocal(&string)) return;
String::ValueView value_view(isolate, string);

auto value_length = value_view.length();

if (value_view.is_one_byte()) {
auto const_char = reinterpret_cast<const char*>(value_view.data8());
auto expected_length =
target->capacity() < (static_cast<size_t>(value_length) * 2 + 1)
? simdutf::utf8_length_from_latin1(const_char, value_length)
: value_length * 2;

// Add +1 for null termination.
target->AllocateSufficientStorage(expected_length + 1);
const auto actual_length = simdutf::convert_latin1_to_utf8(
const_char, value_length, target->out());
target->SetLengthAndZeroTerminate(actual_length);
return;
}

size_t storage;
if (!StringBytes::StorageSize(isolate, string, UTF8).To(&storage)) return;
storage += 1;
// Add +1 for null termination.
size_t storage = (3 * value_length) + 1;
target->AllocateSufficientStorage(storage);

// TODO(@anonrig): Use simdutf to speed up non-one-byte strings once it's
// implemented
const int flags =
String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8;
const int length =
Expand Down

0 comments on commit a04d08f

Please sign in to comment.