diff --git a/justfile b/justfile index fc2e297069b..02d558aad9b 100644 --- a/justfile +++ b/justfile @@ -1,3 +1,9 @@ +default: + @just --list + +# Override this value by calling `just --set clang_version 18` +clang_version := "15" + prepare: cargo install gen-compile-commands @@ -6,3 +12,15 @@ compile-commands: clean: rm -f compile_commands.json + +build *args="//...": + bazel build {{args}} --action_env=CC=clang-{{clang_version}} --action_env=CXX=clang++-{{clang_version}} + +build-asan *args="//...": + just build {{args}} --config=asan --sandbox_debug + +test *args="//...": + bazel test {{args}} --action_env=CC=clang-{{clang_version}} --action_env=CXX=clang++-{{clang_version}} --test_env=LLVM_SYMBOLIZER=llvm-symbolizer-{{clang_version}} + +test-asan *args="//...": + just test {{args}} --config=asan diff --git a/src/node/buffer.ts b/src/node/buffer.ts index d1e50122443..d5e5b9a1e82 100644 --- a/src/node/buffer.ts +++ b/src/node/buffer.ts @@ -10,6 +10,7 @@ import { SlowBuffer, isAscii, isUtf8, + transcode, } from 'node-internal:internal_buffer'; // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment @@ -30,6 +31,7 @@ export { SlowBuffer, isAscii, isUtf8, + transcode, }; export default { @@ -46,4 +48,5 @@ export default { SlowBuffer, isAscii, isUtf8, + transcode, }; diff --git a/src/node/internal/buffer.d.ts b/src/node/internal/buffer.d.ts index 647fefa0268..fca34b23b8c 100644 --- a/src/node/internal/buffer.d.ts +++ b/src/node/internal/buffer.d.ts @@ -37,3 +37,4 @@ export function decode(buffer: Uint8Array, state: Uint8Array): string; export function flush(state: Uint8Array): string; export function isAscii(value: ArrayBufferView): boolean; export function isUtf8(value: ArrayBufferView): boolean; +export function transcode(source: ArrayBufferView, fromEncoding: string, toEncoding: string): ArrayBuffer; diff --git a/src/node/internal/crypto_dh.ts b/src/node/internal/crypto_dh.ts index cb8b2d9e595..5d622a01233 100644 --- a/src/node/internal/crypto_dh.ts 
+++ b/src/node/internal/crypto_dh.ts @@ -84,7 +84,7 @@ let DiffieHellman = function (this: DiffieHellman, sizeOrKey: number|ArrayLike, if (typeof sizeOrKey === 'number') validateInt32(sizeOrKey, 'sizeOrKey'); - if (keyEncoding && !Buffer.isEncoding(keyEncoding) && keyEncoding !== 'buffer') { + if (keyEncoding && keyEncoding !== 'buffer' && !Buffer.isEncoding(keyEncoding)) { genEncoding = generator as any; generator = keyEncoding; keyEncoding = "utf-8"; // default encoding diff --git a/src/node/internal/internal_buffer.ts b/src/node/internal/internal_buffer.ts index 2717907f9a7..7e083e4fede 100644 --- a/src/node/internal/internal_buffer.ts +++ b/src/node/internal/internal_buffer.ts @@ -440,7 +440,7 @@ export function compare(a: Buffer|Uint8Array, b: Buffer|Uint8Array) { Buffer.compare = compare; -export function isEncoding(encoding: unknown) { +export function isEncoding(encoding: unknown): encoding is string { return typeof encoding === "string" && encoding.length !== 0 && normalizeEncoding(encoding) !== undefined; @@ -2294,6 +2294,22 @@ export function isUtf8(value: ArrayBufferView) { return bufferUtil.isUtf8(value); } +export function transcode(source: ArrayBufferView, fromEncoding: string, toEncoding: string) { + if (!isArrayBufferView(source)) { + throw new ERR_INVALID_ARG_TYPE('source', 'ArrayBufferView', typeof source); + } + const normalizedFromEncoding = normalizeEncoding(fromEncoding); + if (!Buffer.isEncoding(normalizedFromEncoding)) { + throw new ERR_UNKNOWN_ENCODING(fromEncoding); + } + const normalizedToEncoding = normalizeEncoding(toEncoding); + if (!Buffer.isEncoding(normalizedToEncoding)) { + throw new ERR_UNKNOWN_ENCODING(toEncoding); + } + // TODO(soon): Optimization opportunity: Pass int encoding values instead of strings. 
+ return Buffer.from(bufferUtil.transcode(source, normalizedFromEncoding, normalizedToEncoding)); +} + export default { Buffer, constants, diff --git a/src/workerd/api/node/buffer.c++ b/src/workerd/api/node/buffer.c++ index 8ae557fee9f..8eec8601165 100644 --- a/src/workerd/api/node/buffer.c++ +++ b/src/workerd/api/node/buffer.c++ @@ -8,8 +8,11 @@ #include "buffer-string-search.h" #include #include -#include +#include #include "simdutf.h" +#include "i18n.h" + +#include // These are defined by or on some systems. // To avoid warnings, undefine them before redefining them. @@ -85,34 +88,24 @@ void SwapBytes(kj::ArrayPtr bytes) { } } -enum class Encoding { - ASCII, - LATIN1, - UTF8, - UTF16LE, - BASE64, - BASE64URL, - HEX, -}; - -Encoding getEncoding(kj::StringPtr encoding) { - if (encoding == "utf8"_kj) { +inline Encoding getEncoding(kj::StringPtr input) { + if (input == "utf8"_kj) { return Encoding::UTF8; - } else if (encoding == "ascii") { + } else if (input == "ascii"_kj) { return Encoding::ASCII; - } else if (encoding == "latin1") { + } else if (input == "latin1"_kj) { return Encoding::LATIN1; - } else if (encoding == "utf16le") { + } else if (input == "utf16le"_kj) { return Encoding::UTF16LE; - } else if (encoding == "base64") { + } else if (input == "base64"_kj) { return Encoding::BASE64; - } else if (encoding == "base64url") { + } else if (input == "base64url"_kj) { return Encoding::BASE64URL; - } else if (encoding == "hex") { + } else if (input == "hex"_kj) { return Encoding::HEX; } - KJ_UNREACHABLE; + JSG_FAIL_REQUIRE(Error, kj::str("Invalid encoding: ", input)); } kj::Maybe tryFromHexDigit(char c) { @@ -137,7 +130,7 @@ kj::Array decodeHexTruncated(kj::ArrayPtr text, bool strict = fa } text = text.slice(0, text.size() - 1); } - kj::Vector vec = kj::Vector(text.size() / 2); + auto vec = kj::Vector(text.size() / 2); for (size_t i = 0; i < text.size(); i += 2) { byte b = 0; @@ -216,8 +209,9 @@ uint32_t writeInto( 
dest.first(amountToCopy).copyFrom(bytes.first(amountToCopy)); return amountToCopy; } + default: + KJ_UNREACHABLE; } - KJ_UNREACHABLE; } kj::Array decodeStringImpl( @@ -272,8 +266,9 @@ kj::Array decodeStringImpl( string.writeInto(js, buf, options); return decodeHexTruncated(buf, strict); } + default: + KJ_UNREACHABLE; } - KJ_UNREACHABLE; } } // namespace @@ -561,8 +556,9 @@ jsg::JsString toStringImpl( case Encoding::HEX: { return js.str(kj::encodeHex(slice)); } + default: + KJ_UNREACHABLE; } - KJ_UNREACHABLE; } } // namespace @@ -876,5 +872,16 @@ bool BufferUtil::isUtf8(kj::Array buffer) { return simdutf::validate_utf8(buffer.asChars().begin(), buffer.size()); } +kj::Array BufferUtil::transcode(kj::Array source, kj::String rawFromEncoding, kj::String rawToEncoding) { + auto fromEncoding = getEncoding(rawFromEncoding); + auto toEncoding = getEncoding(rawToEncoding); + + JSG_REQUIRE(i18n::canBeTranscoded(fromEncoding) && + i18n::canBeTranscoded(toEncoding), Error, + "Unable to transcode buffer due to unsupported encoding"); + + return i18n::transcode(source, fromEncoding, toEncoding); +} + } // namespace workerd::api::node { diff --git a/src/workerd/api/node/buffer.h b/src/workerd/api/node/buffer.h index 3b1c6bc4365..0a5926e1f08 100644 --- a/src/workerd/api/node/buffer.h +++ b/src/workerd/api/node/buffer.h @@ -81,6 +81,9 @@ class BufferUtil final: public jsg::Object { jsg::JsString flush(jsg::Lock& js, kj::Array state); bool isAscii(kj::Array bytes); bool isUtf8(kj::Array bytes); + kj::Array transcode(kj::Array source, + kj::String rawFromEncoding, + kj::String rawToEncoding); JSG_RESOURCE_TYPE(BufferUtil) { JSG_METHOD(byteLength); @@ -94,6 +97,7 @@ class BufferUtil final: public jsg::Object { JSG_METHOD(write); JSG_METHOD(isAscii); JSG_METHOD(isUtf8); + JSG_METHOD(transcode); // For StringDecoder JSG_METHOD(decode); diff --git a/src/workerd/api/node/i18n.c++ b/src/workerd/api/node/i18n.c++ new file mode 100644 index 00000000000..5eb6f878477 --- /dev/null +++ 
b/src/workerd/api/node/i18n.c++ @@ -0,0 +1,261 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 +// Copyright Joyent and Node contributors. All rights reserved. MIT license. + +#include "i18n.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "simdutf.h" + +namespace workerd::api::node { + +namespace i18n { + +namespace { + +// An isolate has a 128mb memory limit. +const int ISOLATE_LIMIT = 134217728; + +struct ConverterDisposer : public kj::Disposer { + static const ConverterDisposer INSTANCE; + void disposeImpl(void* pointer) const override { + ucnv_close(reinterpret_cast(pointer)); + } +}; + +const ConverterDisposer ConverterDisposer::INSTANCE; + +const char* getEncodingName(Encoding input) { + switch (input) { + case Encoding::ASCII: + return "us-ascii"; + case Encoding::LATIN1: + return "iso8859-1"; + case Encoding::UTF16LE: + return "utf16le"; + case Encoding::UTF8: + return "utf-8"; + default: + KJ_UNREACHABLE; + } +} + +typedef kj::Maybe> (*TranscodeImpl)(kj::ArrayPtr source, + Encoding fromEncoding, Encoding toEncoding); + +kj::Maybe> TranscodeDefault(kj::ArrayPtr source, + Encoding fromEncoding, Encoding toEncoding) { + Converter to(toEncoding); + auto substitute = kj::str(kj::repeat('?', to.minCharSize())); + to.setSubstituteChars(substitute); + Converter from(fromEncoding); + + size_t limit = source.size() * to.maxCharSize(); + // Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check. 
+ JSG_REQUIRE(limit <= ISOLATE_LIMIT, Error, "Source buffer is too large to transcode"); + auto out = kj::heapArray(limit); + char* target = out.asChars().begin(); + const char* source_ = source.asChars().begin(); + UErrorCode status{}; + ucnv_convertEx(to.conv(), from.conv(), &target, target + limit, &source_, source_ + source.size(), + nullptr, nullptr, nullptr, nullptr, true, true, &status); + if (U_SUCCESS(status)) { + return out.slice(0, target - out.asChars().begin()).attach(kj::mv(out)); + } + + return kj::none; +} + +kj::Maybe> TranscodeLatin1ToUTF16(kj::ArrayPtr source, + Encoding fromEncoding, Encoding toEncoding) { + auto length_in_chars = source.size() * sizeof(UChar); + // Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check. + JSG_REQUIRE(length_in_chars <= ISOLATE_LIMIT, Error, "Source buffer is too large to transcode"); + + Converter from(fromEncoding); + auto destbuf = kj::heapArray(length_in_chars); + auto actual_length = + simdutf::convert_latin1_to_utf16(source.asChars().begin(), source.size(), destbuf.begin()); + + // simdutf returns 0 for invalid value. + if (actual_length == 0) { + return kj::none; + } + + return destbuf.slice(0, actual_length).asBytes().attach(kj::mv(destbuf)); +} + +kj::Maybe> TranscodeFromUTF16(kj::ArrayPtr source, + Encoding fromEncoding, Encoding toEncoding) { + Converter to(toEncoding); + auto substitute = kj::str(kj::repeat('?', to.minCharSize())); + to.setSubstituteChars(substitute); + + auto utf16_input = kj::arrayPtr(reinterpret_cast(source.begin()), + source.size() / sizeof(UChar)); + + const auto limit = utf16_input.size() * to.maxCharSize(); + + // Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check. 
+ JSG_REQUIRE(limit <= ISOLATE_LIMIT, Error, "Buffer is too large to transcode"); + + auto destbuf = kj::heapArray(limit); + UErrorCode status{}; + auto len = ucnv_fromUChars(to.conv(), destbuf.asChars().begin(), destbuf.size(), + utf16_input.begin(), utf16_input.size(), &status); + + if (U_SUCCESS(status)) { + return destbuf.slice(0, len).asBytes().attach(kj::mv(destbuf)); + } + + return kj::none; +} + +kj::Maybe> TranscodeUTF16FromUTF8(kj::ArrayPtr source, + Encoding fromEncoding, Encoding toEncoding) { + size_t expected_utf16_length = + simdutf::utf16_length_from_utf8(source.asChars().begin(), source.size()); + // Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check. + JSG_REQUIRE(expected_utf16_length <= ISOLATE_LIMIT, Error, + "Expected UTF-16le length is too large to transcode"); + auto destbuf = kj::heapArray(expected_utf16_length); + + size_t actual_length = + simdutf::convert_utf8_to_utf16le(source.asChars().begin(), source.size(), destbuf.begin()); + JSG_REQUIRE(actual_length == expected_utf16_length, Error, "Expected UTF16 length mismatch"); + + // simdutf returns 0 for invalid UTF-8 value. + if (actual_length == 0) { + return kj::none; + } + + return destbuf.asBytes().attach(kj::mv(destbuf)); +} + +kj::Maybe> TranscodeUTF8FromUTF16(kj::ArrayPtr source, + Encoding fromEncoding, Encoding toEncoding) { + JSG_REQUIRE(source.size() % 2 == 0, Error, "UTF-16le input size should be multiple of 2"); + auto utf16_input = + kj::arrayPtr(reinterpret_cast(source.begin()), source.size() / 2); + size_t expected_utf8_length = + simdutf::utf8_length_from_utf16le(utf16_input.begin(), utf16_input.size()); + + // Workers are limited to 128MB so this isn't actually a realistic concern, but sanity check. 
+ JSG_REQUIRE(expected_utf8_length <= ISOLATE_LIMIT, Error, + "Expected UTF-8 length is too large to transcode"); + + auto destbuf = kj::heapArray(expected_utf8_length); + + size_t actual_length = simdutf::convert_utf16le_to_utf8(utf16_input.begin(), utf16_input.size(), + destbuf.asChars().begin()); + JSG_REQUIRE(actual_length == expected_utf8_length, Error, "Expected UTF8 length mismatch"); + + // simdutf returns 0 for invalid UTF-16 value. + if (actual_length == 0) { + return kj::none; + } + + return destbuf.asBytes().attach(kj::mv(destbuf)); +} + +} // namespace + +Converter::Converter(Encoding encoding, kj::StringPtr substitute) { + UErrorCode status = U_ZERO_ERROR; + auto name = getEncodingName(encoding); + auto conv = ucnv_open(name, &status); + JSG_REQUIRE(U_SUCCESS(status), Error, "Failed to initialize converter"); + conv_ = kj::Own(conv, ConverterDisposer::INSTANCE); + setSubstituteChars(substitute); +} + +UConverter* Converter::conv() const { + return const_cast(conv_.get()); +} + +size_t Converter::maxCharSize() const { + KJ_ASSERT_NONNULL(conv_.get()); + return ucnv_getMaxCharSize(conv_.get()); +} + +size_t Converter::minCharSize() const { + KJ_ASSERT_NONNULL(conv_.get()); + return ucnv_getMinCharSize(conv_.get()); +} + +void Converter::reset() { + KJ_ASSERT_NONNULL(conv_.get()); + ucnv_reset(conv_.get()); +} + +void Converter::setSubstituteChars(kj::StringPtr sub) { + KJ_ASSERT_NONNULL(conv_.get()); + UErrorCode status = U_ZERO_ERROR; + if (sub.size() > 0) { + ucnv_setSubstChars(conv_.get(), sub.begin(), sub.size(), &status); + JSG_REQUIRE(U_SUCCESS(status), Error, "Setting ICU substitute characters failed"); + } +} + +kj::Array transcode(kj::ArrayPtr source, Encoding fromEncoding, + Encoding toEncoding) { + // Optimization: + // If both encodings are the same, we just return a copy of the buffer. 
+ if (fromEncoding == toEncoding) { + auto destbuf = kj::heapArray(source.size()); + destbuf.asPtr().copyFrom(source); + return destbuf.asBytes().attach(kj::mv(destbuf)); + } + + TranscodeImpl transcode_function = &TranscodeDefault; + switch (fromEncoding) { + case Encoding::ASCII: + case Encoding::LATIN1: + if (toEncoding == Encoding::UTF16LE) { + transcode_function = &TranscodeLatin1ToUTF16; + } + break; + case Encoding::UTF8: + if (toEncoding == Encoding::UTF16LE) { + transcode_function = &TranscodeUTF16FromUTF8; + } + break; + case Encoding::UTF16LE: + switch (toEncoding) { + case Encoding::UTF16LE: + transcode_function = &TranscodeDefault; + break; + case Encoding::UTF8: + transcode_function = &TranscodeUTF8FromUTF16; + break; + default: + transcode_function = &TranscodeFromUTF16; + } + break; + default: + JSG_FAIL_REQUIRE(Error, "Invalid encoding passed to transcode"); + } + + return JSG_REQUIRE_NONNULL(transcode_function(source, fromEncoding, toEncoding), Error, + "Unable to transcode buffer"); +} + +} // namespace i18n + +} // namespace workerd::api::node diff --git a/src/workerd/api/node/i18n.h b/src/workerd/api/node/i18n.h new file mode 100644 index 00000000000..349c7f984a6 --- /dev/null +++ b/src/workerd/api/node/i18n.h @@ -0,0 +1,61 @@ +// Copyright (c) 2017-2022 Cloudflare, Inc. +// Licensed under the Apache 2.0 license found in the LICENSE file or at: +// https://opensource.org/licenses/Apache-2.0 + +#include +#include +#include +#include + +struct UConverter; + +namespace workerd::api::node { + +enum class Encoding { + ASCII, + LATIN1, + UTF8, + UTF16LE, + BASE64, + BASE64URL, + HEX, +}; + +namespace i18n { + +// Used by BufferUtil::transcode. 
+constexpr bool canBeTranscoded(Encoding encoding) noexcept { + switch (encoding) { + case Encoding::ASCII: + case Encoding::LATIN1: + case Encoding::UTF16LE: + case Encoding::UTF8: + return true; + default: + return false; + } +} + +class Converter final { +public: + explicit Converter(Encoding encoding, kj::StringPtr substitude = ""_kj); + KJ_DISALLOW_COPY_AND_MOVE(Converter); + + UConverter* conv() const; + size_t maxCharSize() const; + size_t minCharSize() const; + void reset(); + void setSubstituteChars(kj::StringPtr sub); + +private: + kj::Own conv_; +}; + +kj::Array transcode(kj::ArrayPtr source, Encoding fromEncoding, + Encoding toEncoding); + +} // namespace i18n + +} // namespace workerd::api::node + +KJ_DECLARE_NON_POLYMORPHIC(UConverter) diff --git a/src/workerd/api/node/tests/buffer-nodejs-test.js b/src/workerd/api/node/tests/buffer-nodejs-test.js index 16bb2da0a89..e0547414bdb 100644 --- a/src/workerd/api/node/tests/buffer-nodejs-test.js +++ b/src/workerd/api/node/tests/buffer-nodejs-test.js @@ -41,6 +41,7 @@ import { constants, isAscii, isUtf8, + transcode, } from 'node:buffer'; import * as buffer from 'node:buffer'; @@ -5705,3 +5706,89 @@ export const isUtf8Test = { }); } }; + +// Adapted from test/parallel/test-icu-transcode.js +export const transcodeTest = { + test(ctrl, env, ctx) { + const orig = Buffer.from('těst ☕', 'utf8'); + const tests = { + 'latin1': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f], + 'ascii': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f], + 'ucs2': [0x74, 0x00, 0x1b, 0x01, 0x73, + 0x00, 0x74, 0x00, 0x20, 0x00, + 0x15, 0x26] + }; + + for (const test in tests) { + const dest = transcode(orig, 'utf8', test); + strictEqual(dest.length, tests[test].length, `utf8->${test} length`); + for (let n = 0; n < tests[test].length; n++) { + strictEqual(dest[n], tests[test][n], `utf8->${test} char ${n}`); + } + } + + { + const dest = transcode(Buffer.from(tests.ucs2), 'ucs2', 'utf8'); + strictEqual(dest.toString(), orig.toString()); + } + + { + const utf8 = 
Buffer.from('€'.repeat(4000), 'utf8'); + const ucs2 = Buffer.from('€'.repeat(4000), 'ucs2'); + const utf8_to_ucs2 = transcode(utf8, 'utf8', 'ucs2'); + const ucs2_to_utf8 = transcode(ucs2, 'ucs2', 'utf8'); + deepStrictEqual(utf8, ucs2_to_utf8); + deepStrictEqual(ucs2, utf8_to_ucs2); + strictEqual(ucs2_to_utf8.toString('utf8'), utf8_to_ucs2.toString('ucs2')); + } + + { + deepStrictEqual( + transcode(Buffer.from('hi', 'ascii'), 'ascii', 'utf16le'), + Buffer.from('hi', 'utf16le')); + deepStrictEqual( + transcode(Buffer.from('hi', 'latin1'), 'latin1', 'utf16le'), + Buffer.from('hi', 'utf16le')); + deepStrictEqual( + transcode(Buffer.from('hä', 'latin1'), 'latin1', 'utf16le'), + Buffer.from('hä', 'utf16le')); + } + + { + const dest = transcode(new Uint8Array(), 'utf8', 'latin1'); + strictEqual(dest.length, 0); + } + + // Test that Uint8Array arguments are okay. + { + const uint8array = new Uint8Array([...Buffer.from('hä', 'latin1')]); + deepStrictEqual( + transcode(uint8array, 'latin1', 'utf16le'), + Buffer.from('hä', 'utf16le')); + } + + // Invalid arguments should fail + throws(() => transcode(null, 'utf8', 'ascii')); + throws(() => transcode(Buffer.from('a'), 'b', 'utf8')); + throws(() => transcode(Buffer.from('a'), 'uf8', 'b')); + + // Throws error for buffer bigger than 128mb. + { + const ISOLATE_MAX_SIZE = 134217728; + const val = Buffer.from('a'.repeat(ISOLATE_MAX_SIZE)); + + throws(() => transcode(val, 'utf16le', 'utf8')); + throws(() => transcode(val, 'latin1', 'utf16le')); + } + + // Make sure same fromEncoding and toEncoding results in copy. + { + const original = Buffer.from('a'); + const copied_value = transcode(original, 'utf8', 'utf8'); + // Let's detach the copied_value + const _ = copied_value.buffer.transfer(); + ok(copied_value.buffer.detached); + ok(!original.buffer.detached); + } + } +};