From 1054fc2531fe6d83cd5d39258030e89cd8a015d2 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Tue, 10 Jan 2023 12:25:19 +0100 Subject: [PATCH] src: use simdutf for converting externalized builtins to UTF-16 Remove the dependency on ICU for this part, as well as the hacky way of converting embedder main sources to UTF-8 via V8 APIs. Allow `UnionBytes` to own the memory its pointing to in order to simplify the code on the `BuiltinLoader` side. PR-URL: https://github.com/nodejs/node/pull/46119 Reviewed-By: Yagiz Nizipli Reviewed-By: James M Snell Reviewed-By: Rafael Gonzaga Reviewed-By: Darshan Sen --- configure.py | 6 +--- node.gyp | 1 + src/api/environment.cc | 15 ++-------- src/env-inl.h | 5 ---- src/env.h | 6 ---- src/node_builtins.cc | 26 ++++++++-------- src/node_builtins.h | 9 +----- src/node_union_bytes.h | 64 ++++++---------------------------------- src/util.cc | 62 ++++++++++++++++++++++++++++++++++++++ test/cctest/test_util.cc | 17 ++++++++++- 10 files changed, 106 insertions(+), 105 deletions(-) diff --git a/configure.py b/configure.py index c1d8815fc024ee..4ebb5606be4f6c 100755 --- a/configure.py +++ b/configure.py @@ -2024,11 +2024,7 @@ def make_bin_override(): for builtin in shareable_builtins: builtin_id = 'node_shared_builtin_' + builtin.replace('/', '_') + '_path' if getattr(options, builtin_id): - if options.with_intl == 'none': - option_name = '--shared-builtin-' + builtin + '-path' - error(option_name + ' is incompatible with --with-intl=none' ) - else: - output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)] + output['defines'] += [builtin_id.upper() + '=' + getattr(options, builtin_id)] else: output['variables']['node_builtin_shareable_builtins'] += [shareable_builtins[builtin]] diff --git a/node.gyp b/node.gyp index 043b46d6d56ee5..f1844720762529 100644 --- a/node.gyp +++ b/node.gyp @@ -973,6 +973,7 @@ 'deps/googletest/googletest.gyp:gtest_main', 'deps/histogram/histogram.gyp:histogram', 'deps/uvwasi/uvwasi.gyp:uvwasi', + 'deps/simdutf/simdutf.gyp:simdutf', ], 'includes': [ diff --git a/src/api/environment.cc b/src/api/environment.cc index 8f8b7d5b6d0aec..68af6d7483db75 100644 --- a/src/api/environment.cc +++ b/src/api/environment.cc @@ -467,21 +467,10 @@ MaybeLocal LoadEnvironment( Environment* env, const char* main_script_source_utf8) { CHECK_NOT_NULL(main_script_source_utf8); - Isolate* isolate = env->isolate(); return LoadEnvironment( - env, - [&](const StartExecutionCallbackInfo& info) -> MaybeLocal { - // This is a slightly hacky way to convert UTF-8 to UTF-16. - Local str = - String::NewFromUtf8(isolate, - main_script_source_utf8).ToLocalChecked(); - auto main_utf16 = std::make_unique(isolate, str); - - // TODO(addaleax): Avoid having a global table for all scripts. + env, [&](const StartExecutionCallbackInfo& info) -> MaybeLocal { std::string name = "embedder_main_" + std::to_string(env->thread_id()); - builtins::BuiltinLoader::Add( - name.c_str(), UnionBytes(**main_utf16, main_utf16->length())); - env->set_main_utf16(std::move(main_utf16)); + builtins::BuiltinLoader::Add(name.c_str(), main_script_source_utf8); Realm* realm = env->principal_realm(); return realm->ExecuteBootstrapper(name.c_str()); diff --git a/src/env-inl.h b/src/env-inl.h index 536a0014d7130f..8ce3a4e8473047 100644 --- a/src/env-inl.h +++ b/src/env-inl.h @@ -799,11 +799,6 @@ void Environment::RemoveCleanupHook(CleanupQueue::Callback fn, void* arg) { cleanup_queue_.Remove(fn, arg); } -void Environment::set_main_utf16(std::unique_ptr str) { - CHECK(!main_utf16_); - main_utf16_ = std::move(str); -} - void Environment::set_process_exit_handler( std::function&& handler) { process_exit_handler_ = std::move(handler); diff --git a/src/env.h b/src/env.h index d247b7fff20f6e..779c29c5181335 100644 --- a/src/env.h +++ b/src/env.h @@ -959,7 +959,6 @@ class Environment : public MemoryRetainer { #endif // HAVE_INSPECTOR - inline void set_main_utf16(std::unique_ptr); inline void set_process_exit_handler( std::function&& handler); @@ -1134,11 +1133,6 @@ class Environment : public MemoryRetainer { std::unique_ptr principal_realm_ = nullptr; - // Keeps the main script source alive is one was passed to LoadEnvironment(). - // We should probably find a way to just use plain `v8::String`s created from - // the source passed to LoadEnvironment() directly instead. - std::unique_ptr main_utf16_; - // Used by allocate_managed_buffer() and release_managed_buffer() to keep // track of the BackingStore for a given pointer. std::unordered_map> diff --git a/src/node_builtins.cc b/src/node_builtins.cc index e68cc71bfcb827..534520e60243ea 100644 --- a/src/node_builtins.cc +++ b/src/node_builtins.cc @@ -3,6 +3,7 @@ #include "env-inl.h" #include "node_external_reference.h" #include "node_internals.h" +#include "simdutf.h" #include "util-inl.h" namespace node { @@ -35,7 +36,6 @@ BuiltinLoader BuiltinLoader::instance_; BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) { LoadJavaScriptSource(); -#if defined(NODE_HAVE_I18N_SUPPORT) #ifdef NODE_SHARED_BUILTIN_CJS_MODULE_LEXER_LEXER_PATH AddExternalizedBuiltin( "internal/deps/cjs-module-lexer/lexer", @@ -52,7 +52,6 @@ BuiltinLoader::BuiltinLoader() : config_(GetConfig()), has_code_cache_(false) { AddExternalizedBuiltin("internal/deps/undici/undici", STRINGIFY(NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH)); #endif // NODE_SHARED_BUILTIN_UNDICI_UNDICI_PATH -#endif // NODE_HAVE_I18N_SUPPORT } BuiltinLoader* BuiltinLoader::GetInstance() { @@ -240,7 +239,6 @@ MaybeLocal BuiltinLoader::LoadBuiltinSource(Isolate* isolate, #endif // NODE_BUILTIN_MODULES_PATH } -#if defined(NODE_HAVE_I18N_SUPPORT) void BuiltinLoader::AddExternalizedBuiltin(const char* id, const char* filename) { std::string source; @@ -252,16 +250,20 @@ void BuiltinLoader::AddExternalizedBuiltin(const char* id, return; } - icu::UnicodeString utf16 = icu::UnicodeString::fromUTF8( - icu::StringPiece(source.data(), source.length())); - auto source_utf16 = std::make_unique(utf16); - Add(id, - UnionBytes(reinterpret_cast((*source_utf16).getBuffer()), - utf16.length())); - // keep source bytes for builtin alive while BuiltinLoader exists - GetInstance()->externalized_source_bytes_.push_back(std::move(source_utf16)); + Add(id, source); +} + +bool BuiltinLoader::Add(const char* id, std::string_view utf8source) { + size_t expected_u16_length = + simdutf::utf16_length_from_utf8(utf8source.data(), utf8source.length()); + auto out = std::make_shared>(expected_u16_length); + size_t u16_length = simdutf::convert_utf8_to_utf16le( + utf8source.data(), + utf8source.length(), + reinterpret_cast(out->data())); + out->resize(u16_length); + return Add(id, UnionBytes(out)); } -#endif // NODE_HAVE_I18N_SUPPORT // Returns Local of the compiled module if return_code_cache // is false (we are only compiling the function). diff --git a/src/node_builtins.h b/src/node_builtins.h index 2f95e04109fdbd..f90a4151850d54 100644 --- a/src/node_builtins.h +++ b/src/node_builtins.h @@ -8,9 +8,6 @@ #include #include #include -#if defined(NODE_HAVE_I18N_SUPPORT) -#include -#endif // NODE_HAVE_I18N_SUPPORT #include #include "node_mutex.h" #include "node_union_bytes.h" @@ -71,6 +68,7 @@ class NODE_EXTERN_PRIVATE BuiltinLoader { static v8::Local GetConfigString(v8::Isolate* isolate); static bool Exists(const char* id); static bool Add(const char* id, const UnionBytes& source); + static bool Add(const char* id, std::string_view utf8source); static bool CompileAllBuiltins(v8::Local context); static void RefreshCodeCache(const std::vector& in); @@ -136,18 +134,13 @@ class NODE_EXTERN_PRIVATE BuiltinLoader { static void HasCachedBuiltins( const v8::FunctionCallbackInfo& args); -#if defined(NODE_HAVE_I18N_SUPPORT) static void AddExternalizedBuiltin(const char* id, const char* filename); -#endif // NODE_HAVE_I18N_SUPPORT static BuiltinLoader instance_; BuiltinCategories builtin_categories_; BuiltinSourceMap source_; BuiltinCodeCacheMap code_cache_; UnionBytes config_; -#if defined(NODE_HAVE_I18N_SUPPORT) - std::list> externalized_source_bytes_; -#endif // NODE_HAVE_I18N_SUPPORT // Used to synchronize access to the code cache map Mutex code_cache_mutex_; diff --git a/src/node_union_bytes.h b/src/node_union_bytes.h index d296e67f005f04..6e1a3afb614323 100644 --- a/src/node_union_bytes.h +++ b/src/node_union_bytes.h @@ -11,57 +11,20 @@ namespace node { -class NonOwningExternalOneByteResource - : public v8::String::ExternalOneByteStringResource { - public: - explicit NonOwningExternalOneByteResource(const uint8_t* data, size_t length) - : data_(data), length_(length) {} - ~NonOwningExternalOneByteResource() override = default; - - const char* data() const override { - return reinterpret_cast(data_); - } - size_t length() const override { return length_; } - - NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) = - delete; - NonOwningExternalOneByteResource& operator=( - const NonOwningExternalOneByteResource&) = delete; - - private: - const uint8_t* data_; - size_t length_; -}; - -class NonOwningExternalTwoByteResource - : public v8::String::ExternalStringResource { - public: - explicit NonOwningExternalTwoByteResource(const uint16_t* data, size_t length) - : data_(data), length_(length) {} - ~NonOwningExternalTwoByteResource() override = default; - - const uint16_t* data() const override { return data_; } - size_t length() const override { return length_; } - - NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) = - delete; - NonOwningExternalTwoByteResource& operator=( - const NonOwningExternalTwoByteResource&) = delete; - - private: - const uint16_t* data_; - size_t length_; -}; - // Similar to a v8::String, but it's independent from Isolates // and can be materialized in Isolates as external Strings -// via ToStringChecked. The data pointers are owned by the caller. +// via ToStringChecked. class UnionBytes { public: UnionBytes(const uint16_t* data, size_t length) : one_bytes_(nullptr), two_bytes_(data), length_(length) {} UnionBytes(const uint8_t* data, size_t length) : one_bytes_(data), two_bytes_(nullptr), length_(length) {} + template // T = uint8_t or uint16_t + explicit UnionBytes(std::shared_ptr> data) + : UnionBytes(data->data(), data->size()) { + owning_ptr_ = data; + } UnionBytes(const UnionBytes&) = default; UnionBytes& operator=(const UnionBytes&) = default; @@ -77,23 +40,14 @@ class UnionBytes { CHECK_NOT_NULL(one_bytes_); return one_bytes_; } - v8::Local ToStringChecked(v8::Isolate* isolate) const { - if (is_one_byte()) { - NonOwningExternalOneByteResource* source = - new NonOwningExternalOneByteResource(one_bytes_data(), length_); - return v8::String::NewExternalOneByte(isolate, source).ToLocalChecked(); - } else { - NonOwningExternalTwoByteResource* source = - new NonOwningExternalTwoByteResource(two_bytes_data(), length_); - return v8::String::NewExternalTwoByte(isolate, source).ToLocalChecked(); - } - } - size_t length() { return length_; } + v8::Local ToStringChecked(v8::Isolate* isolate) const; + size_t length() const { return length_; } private: const uint8_t* one_bytes_; const uint16_t* two_bytes_; size_t length_; + std::shared_ptr owning_ptr_; }; } // namespace node diff --git a/src/util.cc b/src/util.cc index 78c6123c37a12a..8675d9fab14a65 100644 --- a/src/util.cc +++ b/src/util.cc @@ -515,4 +515,66 @@ void SetConstructorFunction(Isolate* isolate, that->Set(name, tmpl); } +namespace { + +class NonOwningExternalOneByteResource + : public v8::String::ExternalOneByteStringResource { + public: + explicit NonOwningExternalOneByteResource(const UnionBytes& source) + : source_(source) {} + ~NonOwningExternalOneByteResource() override = default; + + const char* data() const override { + return reinterpret_cast(source_.one_bytes_data()); + } + size_t length() const override { return source_.length(); } + + NonOwningExternalOneByteResource(const NonOwningExternalOneByteResource&) = + delete; + NonOwningExternalOneByteResource& operator=( + const NonOwningExternalOneByteResource&) = delete; + + private: + const UnionBytes source_; +}; + +class NonOwningExternalTwoByteResource + : public v8::String::ExternalStringResource { + public: + explicit NonOwningExternalTwoByteResource(const UnionBytes& source) + : source_(source) {} + ~NonOwningExternalTwoByteResource() override = default; + + const uint16_t* data() const override { return source_.two_bytes_data(); } + size_t length() const override { return source_.length(); } + + NonOwningExternalTwoByteResource(const NonOwningExternalTwoByteResource&) = + delete; + NonOwningExternalTwoByteResource& operator=( + const NonOwningExternalTwoByteResource&) = delete; + + private: + const UnionBytes source_; +}; + +} // anonymous namespace + +Local UnionBytes::ToStringChecked(Isolate* isolate) const { + if (UNLIKELY(length() == 0)) { + // V8 requires non-null data pointers for empty external strings, + // but we don't guarantee that. Solve this by not creating an + // external string at all in that case. + return String::Empty(isolate); + } + if (is_one_byte()) { + NonOwningExternalOneByteResource* source = + new NonOwningExternalOneByteResource(*this); + return String::NewExternalOneByte(isolate, source).ToLocalChecked(); + } else { + NonOwningExternalTwoByteResource* source = + new NonOwningExternalTwoByteResource(*this); + return String::NewExternalTwoByte(isolate, source).ToLocalChecked(); + } +} + } // namespace node diff --git a/test/cctest/test_util.cc b/test/cctest/test_util.cc index 75861ba00d52ba..443a03117c09fc 100644 --- a/test/cctest/test_util.cc +++ b/test/cctest/test_util.cc @@ -1,7 +1,8 @@ -#include "util-inl.h" #include "debug_utils-inl.h" #include "env-inl.h" #include "gtest/gtest.h" +#include "simdutf.h" +#include "util-inl.h" using node::Calloc; using node::Malloc; @@ -298,3 +299,17 @@ TEST(UtilTest, SPrintF) { const std::string with_zero = std::string("a") + '\0' + 'b'; EXPECT_EQ(SPrintF("%s", with_zero), with_zero); } + +TEST(UtilTest, SimdutfEndiannessDoesNotMeanEndianness) { + // In simdutf, "LE" does *not* refer to Little Endian, it refers + // to 16-byte code units that are stored using *host* endianness. + // This is weird and confusing naming, and so we add this assertion + // here to verify that this is actually the case (so that CI tells + // us if it changed, because for most people Little Endian is + // host endianness, so locally everything would work fine). + const char utf8source[] = "\xe7\x8c\xab"; + char16_t u16output; + size_t u16len = simdutf::convert_utf8_to_utf16le(utf8source, 3, &u16output); + EXPECT_EQ(u16len, 1u); + EXPECT_EQ(u16output, 0x732B); +}