From 8df4bbf0e4356fa7b5590538a5d349189ff45e6a Mon Sep 17 00:00:00 2001
From: Yagiz Nizipli
Date: Sun, 11 Dec 2022 17:28:03 -0500
Subject: [PATCH] src: add node:encoding module

---
 doc/api/encoding.md                           | 63 +++++++++++++++
 doc/api/index.md                              |  1 +
 lib/encoding.js                               | 57 ++++++++++++++
 lib/internal/bootstrap/loaders.js             |  2 +
 node.gyp                                      |  1 +
 src/node_binding.cc                           |  1 +
 src/node_encoding.cc                          | 77 +++++++++++++++++++
 src/node_external_reference.h                 |  1 +
 test/parallel/test-encoding.js                | 21 +++++
 typings/internalBinding/encoding_methods.d.ts |  5 ++
 10 files changed, 229 insertions(+)
 create mode 100644 doc/api/encoding.md
 create mode 100644 lib/encoding.js
 create mode 100644 src/node_encoding.cc
 create mode 100644 test/parallel/test-encoding.js
 create mode 100644 typings/internalBinding/encoding_methods.d.ts

diff --git a/doc/api/encoding.md b/doc/api/encoding.md
new file mode 100644
index 00000000000000..07a09c4d89a944
--- /dev/null
+++ b/doc/api/encoding.md
@@ -0,0 +1,63 @@
+# Encoding
+
+
+
+> Stability: 1 - Experimental
+
+
+
+The `node:encoding` module provides utilities for validating ASCII and UTF-8
+input and for counting UTF-8 code points. To access it:
+
+```mjs
+import encoding from 'node:encoding';
+```
+
+```cjs
+const encoding = require('node:encoding');
+```
+
+This module is only available under the `node:` scheme. The following will not
+work:
+
+```mjs
+import encoding from 'encoding';
+```
+
+```cjs
+const encoding = require('encoding');
+```
+
+## `isAscii(input)`
+
+
+
+* input {Buffer | Uint8Array | string} The input to validate.
+* Returns: {boolean} `true` if and only if the input is valid ASCII.
+
+This function checks whether the input contains only ASCII characters.
+
+## `isUtf8(input)`
+
+
+
+* input {Buffer | Uint8Array} The input to validate.
+* Returns: {boolean} `true` if and only if the input is valid UTF-8.
+
+This function checks whether the input is a well-formed UTF-8 byte sequence.
+
+## `countUtf8(input)`
+
+
+
+* input {Buffer | Uint8Array}
+* Returns: {number}
+
+This function is used to count the number of code points (characters) in the
+input assuming that it is a valid UTF-8 input.
diff --git a/doc/api/index.md b/doc/api/index.md
index 9c35550f5daf81..8e2dc26e3c3841 100644
--- a/doc/api/index.md
+++ b/doc/api/index.md
@@ -29,6 +29,7 @@
 * [DNS](dns.md)
 * [Domain](domain.md)
+* [Encoding](encoding.md)
 * [Errors](errors.md)
 * [Events](events.md)
 * [File system](fs.md)
 * [Globals](globals.md)
diff --git a/lib/encoding.js b/lib/encoding.js
new file mode 100644
index 00000000000000..9f4b5d39b588a5
--- /dev/null
+++ b/lib/encoding.js
@@ -0,0 +1,87 @@
+'use strict';
+
+const {
+  isAscii: _isAscii,
+  isUtf8: _isUtf8,
+  countUtf8: _countUtf8,
+} = internalBinding('encoding_methods');
+
+const {
+  isUint8Array,
+} = require('internal/util/types');
+
+const {
+  emitExperimentalWarning,
+} = require('internal/util');
+
+const { TextEncoder } = require('util');
+const { Buffer } = require('buffer');
+
+const encoder = new TextEncoder();
+
+emitExperimentalWarning('Encoding');
+
+/**
+ * Returns an ArrayBuffer that holds exactly the bytes visible through `view`.
+ * The native bindings read an entire ArrayBuffer, while a Buffer or Uint8Array
+ * may expose only a window of a larger one (pooled Buffers, `subarray()`), so
+ * that window is copied out unless the view already spans the whole buffer.
+ * @param {Buffer | Uint8Array} view
+ * @returns {ArrayBuffer}
+ */
+function toArrayBuffer(view) {
+  const { buffer, byteOffset, byteLength } = view;
+  if (byteOffset === 0 && byteLength === buffer.byteLength) {
+    return buffer;
+  }
+  return buffer.slice(byteOffset, byteOffset + byteLength);
+}
+
+/**
+ * Checks whether `input` contains only ASCII characters.
+ * @param {Buffer | Uint8Array | string} input
+ * @returns {boolean} `false` when `input` is of any other type.
+ */
+function isAscii(input) {
+  if (Buffer.isBuffer(input) || isUint8Array(input)) {
+    return _isAscii(toArrayBuffer(input));
+  }
+
+  if (typeof input === 'string') {
+    return _isAscii(toArrayBuffer(encoder.encode(input)));
+  }
+
+  return false;
+}
+
+/**
+ * Checks whether `input` is valid UTF-8.
+ * @param {Buffer | Uint8Array} input
+ * @returns {boolean} `false` when `input` is of any other type.
+ */
+function isUtf8(input) {
+  if (Buffer.isBuffer(input) || isUint8Array(input)) {
+    return _isUtf8(toArrayBuffer(input));
+  }
+
+  return false;
+}
+
+/**
+ * Counts the code points in `input`, which is assumed to be valid UTF-8.
+ * @param {Buffer | Uint8Array} input
+ * @returns {number} `0` when `input` is of any other type.
+ */
+function countUtf8(input) {
+  if (Buffer.isBuffer(input) || isUint8Array(input)) {
+    return _countUtf8(toArrayBuffer(input));
+  }
+
+  return 0;
+}
+
+module.exports = {
+  isAscii,
+  isUtf8,
+  countUtf8,
+};
diff --git a/lib/internal/bootstrap/loaders.js b/lib/internal/bootstrap/loaders.js
index 47bd830570fa9f..9475b46d05ed03 100644
--- a/lib/internal/bootstrap/loaders.js
+++ b/lib/internal/bootstrap/loaders.js
@@ -85,6 +85,7 @@ const internalBindingAllowlist = new SafeSet([
   'constants',
'contextify', 'crypto', + 'encoding_methods', 'fs', 'fs_event_wrap', 'http_parser', @@ -124,6 +125,7 @@ const legacyWrapperList = new SafeSet([ // Modules that can only be imported via the node: scheme. const schemelessBlockList = new SafeSet([ + 'encoding', 'test', ]); diff --git a/node.gyp b/node.gyp index 95f6abfa309428..d4035d44098793 100644 --- a/node.gyp +++ b/node.gyp @@ -501,6 +501,7 @@ 'src/node_dir.cc', 'src/node_env_var.cc', 'src/node_errors.cc', + 'src/node_encoding.cc', 'src/node_external_reference.cc', 'src/node_file.cc', 'src/node_http_parser.cc', diff --git a/src/node_binding.cc b/src/node_binding.cc index ab25501dcbae96..463acba163e1e4 100644 --- a/src/node_binding.cc +++ b/src/node_binding.cc @@ -43,6 +43,7 @@ V(contextify) \ V(credentials) \ V(errors) \ + V(encoding_methods) \ V(fs) \ V(fs_dir) \ V(fs_event_wrap) \ diff --git a/src/node_encoding.cc b/src/node_encoding.cc new file mode 100644 index 00000000000000..16912246827fab --- /dev/null +++ b/src/node_encoding.cc @@ -0,0 +1,77 @@ +#include "env-inl.h" +#include "node.h" +#include "node_errors.h" +#include "node_external_reference.h" +#include "util-inl.h" + +#include "simdutf.h" + +namespace node { + +using v8::ArrayBuffer; +using v8::BackingStore; +using v8::CFunction; +using v8::Context; +using v8::FastApiTypedArray; +using v8::FunctionCallbackInfo; +using v8::Isolate; +using v8::Local; +using v8::MaybeLocal; +using v8::Object; +using v8::String; +using v8::Uint32Array; +using v8::Uint8Array; +using v8::Value; + +// TODO(anonrig): Replace this with encoding when encoding enum is renamed. 
+namespace encoding_methods {
+
+static void IsAscii(const FunctionCallbackInfo<Value>& args) {
+  CHECK_GE(args.Length(), 1);
+  CHECK(args[0]->IsArrayBuffer());  // a raw ArrayBuffer, not a view
+  Local<ArrayBuffer> input = args[0].As<ArrayBuffer>();
+  auto external_resource = static_cast<const char*>(input->Data());
+  args.GetReturnValue().Set(
+      simdutf::validate_ascii(external_resource, input->ByteLength()));
+}
+
+static void IsUtf8(const FunctionCallbackInfo<Value>& args) {
+  CHECK_GE(args.Length(), 1);
+  CHECK(args[0]->IsArrayBuffer());  // a raw ArrayBuffer, not a view
+  Local<ArrayBuffer> input = args[0].As<ArrayBuffer>();
+  auto external_resource = static_cast<const char*>(input->Data());
+  args.GetReturnValue().Set(
+      simdutf::validate_utf8(external_resource, input->ByteLength()));
+}
+
+static void CountUtf8(const FunctionCallbackInfo<Value>& args) {
+  CHECK_GE(args.Length(), 1);
+  CHECK(args[0]->IsArrayBuffer());  // a raw ArrayBuffer, not a view
+  Local<ArrayBuffer> input = args[0].As<ArrayBuffer>();
+  auto external_resource = static_cast<const char*>(input->Data());
+  size_t count = simdutf::count_utf8(external_resource, input->ByteLength());
+  args.GetReturnValue().Set(static_cast<double>(count));  // no int overflow
+}
+
+static void Initialize(Local<Object> target,
+                       Local<Value> unused,
+                       Local<Context> context,
+                       void* priv) {
+  SetMethodNoSideEffect(context, target, "isAscii", IsAscii);
+  SetMethodNoSideEffect(context, target, "isUtf8", IsUtf8);
+  SetMethodNoSideEffect(context, target, "countUtf8", CountUtf8);
+}
+
+void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
+  registry->Register(IsAscii);
+  registry->Register(IsUtf8);
+  registry->Register(CountUtf8);
+}
+
+}  // namespace encoding_methods
+}  // namespace node
+
+NODE_BINDING_CONTEXT_AWARE_INTERNAL(encoding_methods,
+                                    node::encoding_methods::Initialize)
+NODE_BINDING_EXTERNAL_REFERENCE(
+    encoding_methods, node::encoding_methods::RegisterExternalReferences)
diff --git a/src/node_external_reference.h b/src/node_external_reference.h
index bf4b49670de310..0257c8f8cc3509 100644
--- a/src/node_external_reference.h
+++ b/src/node_external_reference.h
@@ -67,6 +67,7 @@ class ExternalReferenceRegistry {
   V(credentials) \
   V(env_var) \
   V(errors) \
+  V(encoding_methods) \
   V(fs) \
   V(fs_dir) \
   V(fs_event_wrap) \
diff --git a/test/parallel/test-encoding.js b/test/parallel/test-encoding.js
new file mode 100644
index 00000000000000..dfe0878666335f
--- /dev/null
+++ b/test/parallel/test-encoding.js
@@ -0,0 +1,21 @@
+// Flags: --no-warnings
+'use strict';
+require('../common');
+
+const assert = require('assert');
+const encoding = require('node:encoding');
+const { TextEncoder } = require('util');
+
+const encoder = new TextEncoder();
+
+assert.strictEqual(encoding.isAscii(encoder.encode('hello')), true);
+assert.strictEqual(encoding.isAscii(encoder.encode('ğ')), false);
+assert.strictEqual(encoding.isAscii('hello'), true);
+assert.strictEqual(encoding.isAscii('ğ'), false);
+
+assert.strictEqual(encoding.isUtf8(encoder.encode('hello')), true);
+assert.strictEqual(encoding.isUtf8(encoder.encode('ğ')), true);
+assert.strictEqual(encoding.isUtf8(Buffer.from([0xf8])), false);
+
+assert.strictEqual(encoding.countUtf8(encoder.encode('hello')), 5);
+assert.strictEqual(encoding.countUtf8(encoder.encode('Yağız')), 5);
diff --git a/typings/internalBinding/encoding_methods.d.ts b/typings/internalBinding/encoding_methods.d.ts
new file mode 100644
index 00000000000000..39fefddc903db8
--- /dev/null
+++ b/typings/internalBinding/encoding_methods.d.ts
@@ -0,0 +1,5 @@
+declare function InternalBinding(binding: 'encoding_methods'): {
+  isAscii(input: ArrayBuffer): boolean;
+  isUtf8(input: ArrayBuffer): boolean;
+  countUtf8(input: ArrayBuffer): number;
+};