From 92c30c97870551bb2954c77ba69c419da0c7a058 Mon Sep 17 00:00:00 2001 From: James M Snell Date: Tue, 18 Jun 2024 10:27:20 -0700 Subject: [PATCH] Implement node.js buffer.isAscii|isUtf8 (#2281) --- WORKSPACE | 10 +++ build/BUILD.simdutf | 7 ++ compile_flags.txt | 2 + src/node/buffer.ts | 6 ++ src/node/internal/buffer.d.ts | 2 + src/node/internal/internal_buffer.ts | 16 ++++ src/workerd/api/node/buffer-nodejs-test.js | 95 ++++++++++++++++++++++ src/workerd/api/node/buffer.c++ | 11 +++ src/workerd/api/node/buffer.h | 4 + src/workerd/io/BUILD.bazel | 1 + 10 files changed, 154 insertions(+) create mode 100644 build/BUILD.simdutf diff --git a/WORKSPACE b/WORKSPACE index 37d584fb0fa..4b13db522c4 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -109,6 +109,16 @@ http_archive( url = "https://github.com/ada-url/ada/releases/download/v2.8.0/singleheader.zip", ) +http_archive( + name = "simdutf", + build_file = "//:build/BUILD.simdutf", + patch_args = ["-p1"], + patches = [], + sha256 = "7867c118a11bb7ccaea0f999a28684b06040027506b424b706146cc912b80ff6", + type = "zip", + url = "https://github.com/simdutf/simdutf/releases/download/v5.2.8/singleheader.zip", +) + http_archive( name = "pyodide", build_file = "//:build/BUILD.pyodide", diff --git a/build/BUILD.simdutf b/build/BUILD.simdutf new file mode 100644 index 00000000000..70543c8bc47 --- /dev/null +++ b/build/BUILD.simdutf @@ -0,0 +1,7 @@ +cc_library( + name = "simdutf", + srcs = ["simdutf.cpp"], + hdrs = ["simdutf.h"], + visibility = ["//visibility:public"], + copts = ["-w"], +) diff --git a/compile_flags.txt b/compile_flags.txt index 663d297433f..03bb6b7bbd8 100644 --- a/compile_flags.txt +++ b/compile_flags.txt @@ -4,9 +4,11 @@ -nostdinc -Ibazel-bin/external/dawn/include -Ibazel-bin/external/ada-url/_virtual_includes/ada-url/ +-Ibazel-bin/external/simdutf/_virtual_includes/simdutf/ -Ibazel-bin/external/com_cloudflare_lol_html/_virtual_includes/lolhtml -Iexternal/perfetto-sdk/sdk/ -Iexternal/ada-url/ +-Iexternal/simdutf/ 
-Iexternal/com_google_benchmark/include/ -Iexternal/dawn/include -Iexternal/ssl/src/include diff --git a/src/node/buffer.ts b/src/node/buffer.ts index 7ca739a85d7..d1e50122443 100644 --- a/src/node/buffer.ts +++ b/src/node/buffer.ts @@ -8,6 +8,8 @@ import { kStringMaxLength, Buffer, SlowBuffer, + isAscii, + isUtf8, } from 'node-internal:internal_buffer'; // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment @@ -26,6 +28,8 @@ export { Blob, Buffer, SlowBuffer, + isAscii, + isUtf8, }; export default { @@ -40,4 +44,6 @@ export default { Blob, Buffer, SlowBuffer, + isAscii, + isUtf8, }; diff --git a/src/node/internal/buffer.d.ts b/src/node/internal/buffer.d.ts index e06f4a75349..647fefa0268 100644 --- a/src/node/internal/buffer.d.ts +++ b/src/node/internal/buffer.d.ts @@ -35,3 +35,5 @@ export function write(buffer: Uint8Array, encoding: string): void; export function decode(buffer: Uint8Array, state: Uint8Array): string; export function flush(state: Uint8Array): string; +export function isAscii(value: ArrayBufferView): boolean; +export function isUtf8(value: ArrayBufferView): boolean; diff --git a/src/node/internal/internal_buffer.ts b/src/node/internal/internal_buffer.ts index a5adcd51895..2c312d5b9a7 100644 --- a/src/node/internal/internal_buffer.ts +++ b/src/node/internal/internal_buffer.ts @@ -2280,10 +2280,26 @@ function writeU_Int24LE( return offset; } +export function isAscii(value: ArrayBufferView) { + if ((value as any)?.detached || (value as any)?.buffer?.detached) { + throw new Error('Unable to determine if buffer is ASCII when it is detached'); + } + return bufferUtil.isAscii(value); +} + +export function isUtf8(value: ArrayBufferView) { + if ((value as any)?.detached || (value as any)?.buffer?.detached) { + throw new Error('Unable to determine if buffer is UTF8 when it is detached'); + } + return bufferUtil.isUtf8(value); +} + export default { Buffer, constants, kMaxLength, kStringMaxLength, SlowBuffer, + isAscii, + isUtf8, }; diff --git 
a/src/workerd/api/node/buffer-nodejs-test.js b/src/workerd/api/node/buffer-nodejs-test.js index 78c05a67f25..16bb2da0a89 100644 --- a/src/workerd/api/node/buffer-nodejs-test.js +++ b/src/workerd/api/node/buffer-nodejs-test.js @@ -39,6 +39,8 @@ import { kMaxLength, kStringMaxLength, constants, + isAscii, + isUtf8, } from 'node:buffer'; import * as buffer from 'node:buffer'; @@ -5610,3 +5612,96 @@ export const inspect = { ); } }; + +export const isAsciiTest = { + test(ctrl, env, ctx) { + const encoder = new TextEncoder(); + strictEqual(isAscii(encoder.encode('hello')), true); + strictEqual(isAscii(encoder.encode('ğ')), false); + strictEqual(isAscii(Buffer.from([])), true); + + [ + undefined, + '', 'hello', + false, true, + 0, 1, + 0n, 1n, + Symbol(), + () => {}, + {}, [], null, + ].forEach((input) => { + throws( + () => isAscii(input), + ); + }); + } +}; + +export const isUtf8Test = { + test(ctrl, env, ctx) { + const encoder = new TextEncoder(); + + strictEqual(isUtf8(encoder.encode('hello')), true); + strictEqual(isUtf8(encoder.encode('ğ')), true); + strictEqual(isUtf8(Buffer.from([])), true); + + // Taken from test/fixtures/wpt/encoding/textdecoder-fatal.any.js + [ + [0xFF], // 'invalid code' + [0xC0], // 'ends early' + [0xE0], // 'ends early 2' + [0xC0, 0x00], // 'invalid trail' + [0xC0, 0xC0], // 'invalid trail 2' + [0xE0, 0x00], // 'invalid trail 3' + [0xE0, 0xC0], // 'invalid trail 4' + [0xE0, 0x80, 0x00], // 'invalid trail 5' + [0xE0, 0x80, 0xC0], // 'invalid trail 6' + [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // '> 0x10FFFF' + [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], // 'obsolete lead byte' + + // Overlong encodings + [0xC0, 0x80], // 'overlong U+0000 - 2 bytes' + [0xE0, 0x80, 0x80], // 'overlong U+0000 - 3 bytes' + [0xF0, 0x80, 0x80, 0x80], // 'overlong U+0000 - 4 bytes' + [0xF8, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], // 'overlong U+0000 - 6 bytes' + + [0xC1, 0xBF], // 'overlong U+007F - 2 bytes' + [0xE0, 
0x81, 0xBF], // 'overlong U+007F - 3 bytes' + [0xF0, 0x80, 0x81, 0xBF], // 'overlong U+007F - 4 bytes' + [0xF8, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], // 'overlong U+007F - 6 bytes' + + [0xE0, 0x9F, 0xBF], // 'overlong U+07FF - 3 bytes' + [0xF0, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 4 bytes' + [0xF8, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 5 bytes' + [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], // 'overlong U+07FF - 6 bytes' + + [0xF0, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 4 bytes' + [0xF8, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 5 bytes' + [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], // 'overlong U+FFFF - 6 bytes' + + [0xF8, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 5 bytes' + [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], // 'overlong U+10FFFF - 6 bytes' + + // UTF-16 surrogates encoded as code points in UTF-8 + [0xED, 0xA0, 0x80], // 'lead surrogate' + [0xED, 0xB0, 0x80], // 'trail surrogate' + [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], // 'surrogate pair' + ].forEach((input) => { + strictEqual(isUtf8(Buffer.from(input)), false); + }); + + [ + null, + undefined, + 'hello', + true, + false, + ].forEach((input) => { + throws( + () => isUtf8(input), + ); + }); + } +}; diff --git a/src/workerd/api/node/buffer.c++ b/src/workerd/api/node/buffer.c++ index 2be3608bc3a..29305bd461e 100644 --- a/src/workerd/api/node/buffer.c++ +++ b/src/workerd/api/node/buffer.c++ @@ -9,6 +9,7 @@ #include #include #include +#include "simdutf.h" // These are defined by or on some systems. // To avoid warnings, undefine them before redefining them. 
@@ -862,5 +863,15 @@ jsg::JsString BufferUtil::flush(jsg::Lock& js, kj::Array state) { return ret; } +bool BufferUtil::isAscii(kj::Array buffer) { + if (buffer.size() == 0) return true; + return simdutf::validate_ascii(buffer.asChars().begin(), buffer.size()); +} + +bool BufferUtil::isUtf8(kj::Array buffer) { + if (buffer.size() == 0) return true; + return simdutf::validate_utf8(buffer.asChars().begin(), buffer.size()); +} + } // namespace workerd::api::node { diff --git a/src/workerd/api/node/buffer.h b/src/workerd/api/node/buffer.h index 5530b3eaac4..514e86ec35b 100644 --- a/src/workerd/api/node/buffer.h +++ b/src/workerd/api/node/buffer.h @@ -77,6 +77,8 @@ class BufferUtil final: public jsg::Object { kj::Array bytes, kj::Array state); jsg::JsString flush(jsg::Lock& js, kj::Array state); + bool isAscii(kj::Array bytes); + bool isUtf8(kj::Array bytes); JSG_RESOURCE_TYPE(BufferUtil) { JSG_METHOD(byteLength); @@ -88,6 +90,8 @@ class BufferUtil final: public jsg::Object { JSG_METHOD(swap); JSG_METHOD(toString); JSG_METHOD(write); + JSG_METHOD(isAscii); + JSG_METHOD(isUtf8); // For StringDecoder JSG_METHOD(decode); diff --git a/src/workerd/io/BUILD.bazel b/src/workerd/io/BUILD.bazel index b24089b7440..344eb350943 100644 --- a/src/workerd/io/BUILD.bazel +++ b/src/workerd/io/BUILD.bazel @@ -60,6 +60,7 @@ wd_cc_library( implementation_deps = [ "@capnp-cpp//src/kj/compat:kj-brotli", "@capnp-cpp//src/kj/compat:kj-gzip", + "@simdutf", ], visibility = ["//visibility:public"], deps = [