Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ext/node): buffer.transcode() #25972

Merged
merged 8 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ext/node/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ deno_core::extension!(deno_node,

ops::buffer::op_is_ascii,
ops::buffer::op_is_utf8,
ops::buffer::op_transcode,
ops::crypto::op_node_check_prime_async,
ops::crypto::op_node_check_prime_bytes_async,
ops::crypto::op_node_check_prime_bytes,
Expand Down
106 changes: 106 additions & 0 deletions ext/node/ops/buffer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

use deno_core::anyhow::anyhow;
use deno_core::anyhow::Result;
use deno_core::op2;

#[op2(fast)]
Expand All @@ -11,3 +13,107 @@ pub fn op_is_ascii(#[buffer] buf: &[u8]) -> bool {
pub fn op_is_utf8(#[buffer] buf: &[u8]) -> bool {
std::str::from_utf8(buf).is_ok()
}

/// Native op backing the JS-side `transcode()` polyfill: converts `source`
/// from `from_encoding` to `to_encoding` and returns the re-encoded bytes.
///
/// Supported pairs are `utf8` -> `ascii` / `latin1` / `utf16le`,
/// `utf16le` -> `utf8`, and `latin1` / `ascii` -> `utf16le`. Any other pair
/// yields an "Unable to transcode Buffer" error, which the JS caller matches
/// on by message.
#[op2]
#[buffer]
pub fn op_transcode(
  #[buffer] source: &[u8],
  #[string] from_encoding: &str,
  #[string] to_encoding: &str,
) -> Result<Vec<u8>> {
  match from_encoding {
    // Lossy conversions: unrepresentable characters become `?`.
    "utf8" if to_encoding == "ascii" => Ok(utf8_to_ascii(source)),
    "utf8" if to_encoding == "latin1" => Ok(utf8_to_latin1(source)),
    // Fallible conversions: the helpers reject malformed input themselves.
    "utf8" if to_encoding == "utf16le" => utf8_to_utf16le(source),
    "utf16le" if to_encoding == "utf8" => utf16le_to_utf8(source),
    // Single-byte encodings widen to UTF-16 by zero-extending every byte.
    "latin1" | "ascii" if to_encoding == "utf16le" => {
      Ok(latin1_ascii_to_utf16le(source))
    }
    from => {
      let to = to_encoding;
      Err(anyhow!("Unable to transcode Buffer {from}->{to}"))
    }
  }
}

/// Widens single-byte (Latin-1 / ASCII) input to UTF-16LE.
///
/// Every input byte becomes one little-endian 16-bit code unit whose low
/// byte is the input byte and whose high byte is zero, so the output is
/// exactly twice as long as `source`.
fn latin1_ascii_to_utf16le(source: &[u8]) -> Vec<u8> {
  let mut widened = Vec::with_capacity(source.len() * 2);
  widened.extend(source.iter().flat_map(|&unit| [unit, 0u8]));
  widened
}

/// Decodes UTF-16LE bytes and re-encodes them as UTF-8.
///
/// `source` is read as consecutive little-endian 16-bit code units. If
/// `source` has an odd length, the final unpaired byte cannot form a code
/// unit and is ignored.
///
/// # Errors
///
/// Returns an "Invalid UTF-16 sequence" error when the code units are not
/// valid UTF-16 (for example an unpaired surrogate).
fn utf16le_to_utf8(source: &[u8]) -> Result<Vec<u8>> {
  // `chunks_exact` only yields full 2-byte chunks. The previous `chunks(2)`
  // produced a 1-byte tail for odd-length input and then panicked on
  // `chunk[1]`.
  // NOTE(review): Node's ICU-backed implementation appears to truncate to
  // whole code units as well - confirm against upstream.
  let code_units: Vec<u16> = source
    .chunks_exact(2)
    .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
    .collect();
  String::from_utf16(&code_units)
    .map(String::into_bytes)
    .map_err(|e| anyhow!("Invalid UTF-16 sequence: {}", e))
}

/// Re-encodes UTF-8 bytes as UTF-16LE.
///
/// # Errors
///
/// Fails when `source` is not valid UTF-8; the UTF-8 validation error is
/// propagated to the caller.
fn utf8_to_utf16le(source: &[u8]) -> Result<Vec<u8>> {
  let text = std::str::from_utf8(source)?;
  let mut encoded = Vec::new();
  // Emit each 16-bit code unit low byte first.
  for unit in text.encode_utf16() {
    encoded.extend_from_slice(&unit.to_le_bytes());
  }
  Ok(encoded)
}

/// Lossily converts UTF-8 bytes to Latin-1 (ISO-8859-1).
///
/// Code points up to U+00FF map to the single byte of the same value.
/// Everything else - characters above U+00FF, 3- and 4-byte sequences, and
/// malformed UTF-8 - is replaced by a single `?`. This function never fails.
fn utf8_to_latin1(source: &[u8]) -> Vec<u8> {
  let mut latin1_bytes = Vec::with_capacity(source.len());
  let mut i = 0;
  while i < source.len() {
    let lead = source[i];
    if lead <= 0x7F {
      // ASCII character
      latin1_bytes.push(lead);
      i += 1;
      continue;
    }

    // Only treat the next byte as part of this character when it really is
    // a continuation byte (0b10xx_xxxx). Without this check a stray lead
    // byte followed by ASCII (e.g. `[0xC3, b'A']`) would be decoded into a
    // bogus character and the ASCII byte would be swallowed.
    let trail = source
      .get(i + 1)
      .copied()
      .filter(|next| (*next & 0xC0) == 0x80);

    match trail {
      Some(trail) if (0xC2..=0xDF).contains(&lead) => {
        // Well-formed 2-byte UTF-8 sequence
        let codepoint = ((lead as u16 & 0x1F) << 6) | (trail as u16 & 0x3F);
        latin1_bytes.push(if codepoint <= 0xFF {
          codepoint as u8
        } else {
          b'?'
        });
        i += 2;
      }
      _ => {
        // 3-byte or 4-byte UTF-8 sequence, or invalid UTF-8
        latin1_bytes.push(b'?');
        // Skip to the next valid UTF-8 start byte
        i += 1;
        while i < source.len() && (source[i] & 0xC0) == 0x80 {
          i += 1;
        }
      }
    }
  }
  latin1_bytes
}

/// Lossily converts UTF-8 bytes to ASCII.
///
/// ASCII bytes are copied through unchanged. Each non-ASCII byte together
/// with the continuation bytes (0b10xx_xxxx) that immediately follow it is
/// replaced by one `?`. This function never fails.
fn utf8_to_ascii(source: &[u8]) -> Vec<u8> {
  let mut out = Vec::with_capacity(source.len());
  // True while we are inside a non-ASCII run that already produced its `?`.
  let mut replaced = false;
  for &byte in source {
    let is_continuation = (byte & 0xC0) == 0x80;
    if byte <= 0x7F {
      out.push(byte);
      replaced = false;
    } else if !(replaced && is_continuation) {
      // Start of a new non-ASCII run: emit one placeholder for all of it.
      out.push(b'?');
      replaced = true;
    }
  }
  out
}
1 change: 1 addition & 0 deletions ext/node/polyfills/buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ export {
kMaxLength,
kStringMaxLength,
SlowBuffer,
transcode,
} from "ext:deno_node/internal/buffer.mjs";
51 changes: 49 additions & 2 deletions ext/node/polyfills/internal/buffer.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// deno-lint-ignore-file prefer-primordials

import { core } from "ext:core/mod.js";
import { op_is_ascii, op_is_utf8 } from "ext:core/ops";
import { op_is_ascii, op_is_utf8, op_transcode } from "ext:core/ops";

import { TextDecoder, TextEncoder } from "ext:deno_web/08_text_encoding.js";
import { codes } from "ext:deno_node/internal/error_codes.ts";
Expand All @@ -32,7 +32,11 @@ import {
import { normalizeEncoding } from "ext:deno_node/internal/util.mjs";
import { validateBuffer } from "ext:deno_node/internal/validators.mjs";
import { isUint8Array } from "ext:deno_node/internal/util/types.ts";
import { ERR_INVALID_STATE, NodeError } from "ext:deno_node/internal/errors.ts";
import {
ERR_INVALID_STATE,
genericNodeError,
NodeError,
} from "ext:deno_node/internal/errors.ts";
import {
forgivingBase64Encode,
forgivingBase64UrlEncode,
Expand Down Expand Up @@ -2598,6 +2602,48 @@ export function isAscii(input) {
], input);
}

/**
 * Re-encodes `source` from one character encoding to another and returns
 * the result as a new Buffer.
 *
 * @param {Buffer | Uint8Array} source Bytes to convert.
 * @param {string} fromEnco Encoding `source` is currently in.
 * @param {string} toEnco Encoding to convert to.
 * @returns {Buffer} A new Buffer holding the converted bytes.
 * @throws ERR_INVALID_ARG_TYPE when `source` is not a Uint8Array.
 * @throws An error with code `U_ILLEGAL_ARGUMENT_ERROR` when either encoding
 *   is not recognised or the native op reports the pair as unsupported.
 */
export function transcode(source, fromEnco, toEnco) {
  if (!isUint8Array(source)) {
    throw new codes.ERR_INVALID_ARG_TYPE(
      "source",
      ["Buffer", "Uint8Array"],
      source,
    );
  }
  // Empty input short-circuits before the encodings are even looked at.
  if (source.length === 0) {
    return Buffer.alloc(0);
  }

  const icuCode = "U_ILLEGAL_ARGUMENT_ERROR";
  const unsupportedError = genericNodeError(
    `Unable to transcode Buffer [${icuCode}]`,
    { code: icuCode, errno: 1 },
  );

  fromEnco = normalizeEncoding(fromEnco);
  toEnco = normalizeEncoding(toEnco);
  if (!(fromEnco && toEnco)) {
    throw unsupportedError;
  }

  // Pairs that need no conversion: the bytes are copied as they are.
  const asciiPassThrough = fromEnco === "ascii" &&
    (toEnco === "utf8" || toEnco === "latin1");
  if (fromEnco === toEnco || asciiPassThrough) {
    return Buffer.from(source);
  }

  try {
    return Buffer.from(
      op_transcode(new Uint8Array(source), fromEnco, toEnco),
      toEnco,
    );
  } catch (err) {
    // Anything other than the op's "unsupported pair" failure (e.g.
    // malformed input) is surfaced unchanged.
    if (!err.message.includes("Unable to transcode Buffer")) {
      throw err;
    }
    throw unsupportedError;
  }
}

export default {
atob,
btoa,
Expand All @@ -2610,4 +2656,5 @@ export default {
kMaxLength,
kStringMaxLength,
SlowBuffer,
transcode,
};
1 change: 1 addition & 0 deletions tests/node_compat/config.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@
"test-http-outgoing-settimeout.js",
"test-http-url.parse-https.request.js",
"test-http-url.parse-only-support-http-https-protocol.js",
"test-icu-transcode.js",
"test-net-access-byteswritten.js",
"test-net-better-error-messages-listen-path.js",
"test-net-better-error-messages-path.js",
Expand Down
1 change: 0 additions & 1 deletion tests/node_compat/runner/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -1632,7 +1632,6 @@ NOTE: This file should not be manually edited. Please edit `tests/node_compat/co
- [parallel/test-icu-minimum-version.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-minimum-version.js)
- [parallel/test-icu-punycode.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-punycode.js)
- [parallel/test-icu-stringwidth.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-stringwidth.js)
- [parallel/test-icu-transcode.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-icu-transcode.js)
- [parallel/test-inspect-address-in-use.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-address-in-use.js)
- [parallel/test-inspect-async-hook-setup-at-inspect.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-async-hook-setup-at-inspect.js)
- [parallel/test-inspect-publish-uid.js](https://github.com/nodejs/node/tree/v18.12.1/test/parallel/test-inspect-publish-uid.js)
Expand Down
97 changes: 97 additions & 0 deletions tests/node_compat/test/parallel/test-icu-transcode.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// deno-fmt-ignore-file
// deno-lint-ignore-file

// Copyright Joyent and Node contributors. All rights reserved. MIT license.
// Taken from Node 18.12.1
// This file is automatically generated by `tests/node_compat/runner/setup.ts`. Do not modify this file manually.

'use strict';

const common = require('../common');

if (!common.hasIntl)
common.skip('missing Intl');

const buffer = require('buffer');
const assert = require('assert');
const orig = Buffer.from('těst ☕', 'utf8');

// Test Transcoding
const tests = {
'latin1': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f],
'ascii': [0x74, 0x3f, 0x73, 0x74, 0x20, 0x3f],
'ucs2': [0x74, 0x00, 0x1b, 0x01, 0x73,
0x00, 0x74, 0x00, 0x20, 0x00,
0x15, 0x26]
};

for (const test in tests) {
const dest = buffer.transcode(orig, 'utf8', test);
assert.strictEqual(dest.length, tests[test].length, `utf8->${test} length`);
for (let n = 0; n < tests[test].length; n++)
assert.strictEqual(dest[n], tests[test][n], `utf8->${test} char ${n}`);
}

{
const dest = buffer.transcode(Buffer.from(tests.ucs2), 'ucs2', 'utf8');
assert.strictEqual(dest.toString(), orig.toString());
}

{
const utf8 = Buffer.from('€'.repeat(4000), 'utf8');
const ucs2 = Buffer.from('€'.repeat(4000), 'ucs2');
const utf8_to_ucs2 = buffer.transcode(utf8, 'utf8', 'ucs2');
const ucs2_to_utf8 = buffer.transcode(ucs2, 'ucs2', 'utf8');
assert.deepStrictEqual(utf8, ucs2_to_utf8);
assert.deepStrictEqual(ucs2, utf8_to_ucs2);
assert.strictEqual(ucs2_to_utf8.toString('utf8'),
utf8_to_ucs2.toString('ucs2'));
}

assert.throws(
() => buffer.transcode(null, 'utf8', 'ascii'),
{
name: 'TypeError',
code: 'ERR_INVALID_ARG_TYPE',
message: 'The "source" argument must be an instance of Buffer ' +
'or Uint8Array. Received null'
}
);

assert.throws(
() => buffer.transcode(Buffer.from('a'), 'b', 'utf8'),
/^Error: Unable to transcode Buffer \[U_ILLEGAL_ARGUMENT_ERROR\]/
);

assert.throws(
() => buffer.transcode(Buffer.from('a'), 'uf8', 'b'),
/^Error: Unable to transcode Buffer \[U_ILLEGAL_ARGUMENT_ERROR\]$/
);

assert.deepStrictEqual(
buffer.transcode(Buffer.from('hi', 'ascii'), 'ascii', 'utf16le'),
Buffer.from('hi', 'utf16le'));
assert.deepStrictEqual(
buffer.transcode(Buffer.from('hi', 'latin1'), 'latin1', 'utf16le'),
Buffer.from('hi', 'utf16le'));
assert.deepStrictEqual(
buffer.transcode(Buffer.from('hä', 'latin1'), 'latin1', 'utf16le'),
Buffer.from('hä', 'utf16le'));

// Test that Uint8Array arguments are okay.
{
const uint8array = new Uint8Array([...Buffer.from('hä', 'latin1')]);
assert.deepStrictEqual(
buffer.transcode(uint8array, 'latin1', 'utf16le'),
Buffer.from('hä', 'utf16le'));
}

{
const dest = buffer.transcode(new Uint8Array(), 'utf8', 'latin1');
assert.strictEqual(dest.length, 0);
}

// Test that it doesn't crash
{
buffer.transcode(new buffer.SlowBuffer(1), 'utf16le', 'ucs2');
}