Skip to content

Commit

Permalink
add buffer.transcode for nodejs_compat
Browse files Browse the repository at this point in the history
  • Loading branch information
anonrig committed Jul 30, 2024
1 parent 3f94280 commit f171dbd
Show file tree
Hide file tree
Showing 8 changed files with 322 additions and 24 deletions.
3 changes: 3 additions & 0 deletions src/node/buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
SlowBuffer,
isAscii,
isUtf8,
transcode,
} from 'node-internal:internal_buffer';

// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
Expand All @@ -30,6 +31,7 @@ export {
SlowBuffer,
isAscii,
isUtf8,
transcode,
};

export default {
Expand All @@ -46,4 +48,5 @@ export default {
SlowBuffer,
isAscii,
isUtf8,
transcode,
};
1 change: 1 addition & 0 deletions src/node/internal/buffer.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ export function decode(buffer: Uint8Array, state: Uint8Array): string;
export function flush(state: Uint8Array): string;
export function isAscii(value: ArrayBufferView): boolean;
export function isUtf8(value: ArrayBufferView): boolean;
// Native transcoder binding: takes the bytes viewed by `source` plus the names of the
// source and target encodings, and is declared to return the converted bytes as an ArrayBuffer.
export function transcode(source: ArrayBufferView, fromEncoding: string, toEncoding: string): ArrayBuffer;
12 changes: 12 additions & 0 deletions src/node/internal/internal_buffer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2294,6 +2294,18 @@ export function isUtf8(value: ArrayBufferView) {
return bufferUtil.isUtf8(value);
}

/**
 * Re-encodes the bytes viewed by `source` from one character encoding to another.
 *
 * Both encoding names are normalized and validated before the native
 * implementation is called, so aliases accepted by `normalizeEncoding`
 * are handled the same way as their canonical spelling.
 *
 * @param source - View over the bytes to convert.
 * @param fromEncoding - Name of the encoding the bytes are currently in.
 * @param toEncoding - Name of the encoding to convert the bytes to.
 * @returns The value produced by `bufferUtil.transcode` for the converted bytes.
 * @throws ERR_UNKNOWN_ENCODING when either name is not a recognized encoding
 *   (the error carries the name exactly as the caller supplied it).
 */
export function transcode(source: ArrayBufferView, fromEncoding: string, toEncoding: string) {
  const normalizedFromEncoding = normalizeEncoding(fromEncoding);
  if (!Buffer.isEncoding(normalizedFromEncoding)) {
    throw new ERR_UNKNOWN_ENCODING(fromEncoding);
  }
  const normalizedToEncoding = normalizeEncoding(toEncoding);
  if (!Buffer.isEncoding(normalizedToEncoding)) {
    throw new ERR_UNKNOWN_ENCODING(toEncoding);
  }
  // Forward the normalized names, not the raw arguments: the native side compares
  // against exact canonical spellings only, so a raw alias would be rejected there
  // even though it passed validation here.
  return bufferUtil.transcode(source, normalizedFromEncoding, normalizedToEncoding);
}

export default {
Buffer,
constants,
Expand Down
55 changes: 31 additions & 24 deletions src/workerd/api/node/buffer.c++
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@
#include "buffer-string-search.h"
#include <workerd/jsg/buffersource.h>
#include <kj/encoding.h>
#include <algorithm>
#include <kj/array.h>
#include "simdutf.h"
#include "i18n.h"

#include <algorithm>

// These are defined by <sys/byteorder.h> or <netinet/in.h> on some systems.
// To avoid warnings, undefine them before redefining them.
Expand Down Expand Up @@ -85,30 +88,20 @@ void SwapBytes(kj::ArrayPtr<kj::byte> bytes) {
}
}

enum class Encoding {
ASCII,
LATIN1,
UTF8,
UTF16LE,
BASE64,
BASE64URL,
HEX,
};

Encoding getEncoding(kj::StringPtr encoding) {
if (encoding == "utf8"_kj) {
Encoding getEncoding(kj::StringPtr input) {
if (input == "utf8"_kj) {
return Encoding::UTF8;
} else if (encoding == "ascii") {
} else if (input == "ascii"_kj) {
return Encoding::ASCII;
} else if (encoding == "latin1") {
} else if (input == "latin1"_kj) {
return Encoding::LATIN1;
} else if (encoding == "utf16le") {
} else if (input == "utf16le"_kj) {
return Encoding::UTF16LE;
} else if (encoding == "base64") {
} else if (input == "base64"_kj) {
return Encoding::BASE64;
} else if (encoding == "base64url") {
} else if (input == "base64url"_kj) {
return Encoding::BASE64URL;
} else if (encoding == "hex") {
} else if (input == "hex"_kj) {
return Encoding::HEX;
}

Expand Down Expand Up @@ -137,7 +130,7 @@ kj::Array<byte> decodeHexTruncated(kj::ArrayPtr<kj::byte> text, bool strict = fa
}
text = text.slice(0, text.size() - 1);
}
kj::Vector vec = kj::Vector<kj::byte>(text.size() / 2);
auto vec = kj::Vector<kj::byte>(text.size() / 2);

for (size_t i = 0; i < text.size(); i += 2) {
byte b = 0;
Expand Down Expand Up @@ -216,8 +209,9 @@ uint32_t writeInto(
dest.first(amountToCopy).copyFrom(bytes.first(amountToCopy));
return amountToCopy;
}
default:
KJ_UNREACHABLE;
}
KJ_UNREACHABLE;
}

kj::Array<kj::byte> decodeStringImpl(
Expand Down Expand Up @@ -272,8 +266,9 @@ kj::Array<kj::byte> decodeStringImpl(
string.writeInto(js, buf, options);
return decodeHexTruncated(buf, strict);
}
default:
KJ_UNREACHABLE;
}
KJ_UNREACHABLE;
}
} // namespace

Expand Down Expand Up @@ -561,8 +556,9 @@ jsg::JsString toStringImpl(
case Encoding::HEX: {
return js.str(kj::encodeHex(slice));
}
default:
KJ_UNREACHABLE;
}
KJ_UNREACHABLE;
}

} // namespace
Expand Down Expand Up @@ -673,7 +669,7 @@ inline kj::byte* getIncompleteCharacterBuffer(kj::ArrayPtr<kj::byte> state) {
return state.begin() + BufferUtil::kIncompleteCharactersStart;
}

inline Encoding getEncoding(kj::ArrayPtr<kj::byte> state) {
Encoding getEncoding(kj::ArrayPtr<kj::byte> state) {
JSG_REQUIRE(state[BufferUtil::kEncoding] <= static_cast<kj::byte>(Encoding::HEX),
Error, "Invalid StringDecoder state");
return static_cast<Encoding>(state[BufferUtil::kEncoding]);
Expand Down Expand Up @@ -876,5 +872,16 @@ bool BufferUtil::isUtf8(kj::Array<kj::byte> buffer) {
return simdutf::validate_utf8(buffer.asChars().begin(), buffer.size());
}

// Converts `source` from the encoding named by `rawFromEncoding` into the encoding named
// by `rawToEncoding`. Both names are resolved with getEncoding(); a JS Error is raised if
// either resolved encoding is rejected by i18n::canBeTranscoded(). The actual conversion is
// delegated to i18n::transcode().
kj::Array<kj::byte> BufferUtil::transcode(
    kj::Array<kj::byte> source, kj::String rawFromEncoding, kj::String rawToEncoding) {
  const auto from = getEncoding(rawFromEncoding);
  const auto to = getEncoding(rawToEncoding);

  JSG_REQUIRE(i18n::canBeTranscoded(from) && i18n::canBeTranscoded(to),
      Error, "Unable to transcode Buffer");

  return i18n::transcode(source, from, to);
}

} // namespace workerd::api::node {

4 changes: 4 additions & 0 deletions src/workerd/api/node/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ class BufferUtil final: public jsg::Object {
jsg::JsString flush(jsg::Lock& js, kj::Array<kj::byte> state);
bool isAscii(kj::Array<kj::byte> bytes);
bool isUtf8(kj::Array<kj::byte> bytes);
// Re-encodes `source` from the encoding named `rawFromEncoding` to the one named
// `rawToEncoding` and returns the converted bytes. Exposed to JS via JSG_METHOD(transcode).
kj::Array<kj::byte> transcode(kj::Array<kj::byte> source,
kj::String rawFromEncoding,
kj::String rawToEncoding);

JSG_RESOURCE_TYPE(BufferUtil) {
JSG_METHOD(byteLength);
Expand All @@ -94,6 +97,7 @@ class BufferUtil final: public jsg::Object {
JSG_METHOD(write);
JSG_METHOD(isAscii);
JSG_METHOD(isUtf8);
JSG_METHOD(transcode);

// For StringDecoder
JSG_METHOD(decode);
Expand Down
205 changes: 205 additions & 0 deletions src/workerd/api/node/i18n.c++
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
// Copyright (c) 2017-2022 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0
// Copyright Joyent and Node contributors. All rights reserved. MIT license.

#include "i18n.h"

#include <workerd/jsg/exception.h>

#include <unicode/putil.h>
#include <unicode/timezone.h>
#include <unicode/uchar.h>
#include <unicode/uclean.h>
#include <unicode/ucnv.h>
#include <unicode/udata.h>
#include <unicode/uidna.h>
#include <unicode/ulocdata.h>
#include <unicode/urename.h>
#include <unicode/ustring.h>
#include <unicode/utf16.h>
#include <unicode/utf8.h>
#include <unicode/utypes.h>
#include <unicode/uvernum.h>
#include <unicode/uversion.h>

namespace workerd::api::node {

namespace i18n {

namespace {

struct ConverterDisposer : public kj::Disposer {
static const ConverterDisposer INSTANCE;
void disposeImpl(void* pointer) const override {
ucnv_close(reinterpret_cast<UConverter*>(pointer));
}
};

const ConverterDisposer ConverterDisposer::INSTANCE;

// Maps an Encoding value to the converter name handed to ucnv_open().
// Only ASCII, LATIN1, UCS2 and UTF8 are handled; any other value is treated as unreachable.
const char* getEncodingName(Encoding input) {
  if (input == Encoding::ASCII) {
    return "us-ascii";
  }
  if (input == Encoding::LATIN1) {
    return "iso8859-1";
  }
  if (input == Encoding::UCS2) {
    return "utf16le";
  }
  if (input == Encoding::UTF8) {
    return "utf-8";
  }
  KJ_UNREACHABLE;
}

// Signature shared by all transcoding strategies in this file: takes the source bytes and
// the two encodings, and yields the converted bytes or kj::none when conversion failed.
using TranscodeImpl = kj::Maybe<kj::Array<kj::byte>> (*)(
    kj::ArrayPtr<kj::byte> source, Encoding fromEncoding, Encoding toEncoding);

// Generic conversion path: pipes `source` through ICU's ucnv_convertEx() from a converter
// for `fromEncoding` into a converter for `toEncoding`. Characters that cannot be
// represented in the target encoding are replaced with '?' (repeated to the target's
// minimum character width). Returns the converted bytes, or kj::none if ICU reports failure.
kj::Maybe<kj::Array<kj::byte>> TranscodeDefault(kj::ArrayPtr<kj::byte> source,
    Encoding fromEncoding, Encoding toEncoding) {
  Converter to(toEncoding);
  std::string substitude(to.minSize(), '?');
  to.setSubstitudeChars(substitude);
  Converter from(fromEncoding);

  // Worst case every source byte decodes to one UTF-16 code unit and each code unit needs
  // maxSize() output bytes, so the capacity must be a product, not a sum. The extra slot
  // leaves room for ICU's terminator and keeps the buffer non-empty for empty input.
  const size_t limit = (source.size() + 1) * to.maxSize();
  auto out = kj::heapArray<kj::byte>(limit);
  char* target = out.asChars().begin();
  const char* source_ = source.asChars().begin();
  UErrorCode status = U_ZERO_ERROR;
  ucnv_convertEx(to.conv(), from.conv(), &target, target + limit, &source_, source_ + source.size(),
      nullptr, nullptr, nullptr, nullptr, true, true, &status);
  if (U_SUCCESS(status)) {
    // `target` was advanced by ICU to one past the last byte written.
    return out.slice(0, target - out.asChars().begin()).attach(kj::mv(out));
  }

  return kj::none;
}

// Converts `source` (selected by transcode() for ASCII and LATIN1 input) into UTF-16 code
// units using ucnv_toUChars(). `toEncoding` is unused: the output is always UChar data.
// Returns the produced code units viewed as bytes, or kj::none if ICU reports failure.
kj::Maybe<kj::Array<kj::byte>> TranscodeToUCS2(kj::ArrayPtr<kj::byte> source, Encoding fromEncoding,
    Encoding toEncoding) {
  UErrorCode status = U_ZERO_ERROR;
  Converter from(fromEncoding);
  // One UChar per source byte is allocated.
  auto out = kj::heapArray<UChar>(source.size());
  const auto source_ = source.asChars().begin();
  // destCapacity is measured in UChars and must match the allocation exactly; overstating
  // it lets ICU write its NUL terminator past the end of `out`.
  const auto written =
      ucnv_toUChars(from.conv(), out.begin(), out.size(), source_, source.size(), &status);
  if (U_SUCCESS(status)) {
    // Only hand back the code units ICU actually produced.
    return out.slice(0, written).asBytes().attach(kj::mv(out));
  }
  return kj::none;
}

// Converts UTF-16 code units held in `source` into `toEncoding` using ucnv_fromUChars().
// Characters that cannot be represented in the target encoding are replaced with '?'
// (repeated to the target's minimum character width). `fromEncoding` is unused.
// Returns the converted bytes, or kj::none if ICU reports failure.
// NOTE(review): the bytes are reinterpreted as UChar in host byte order — confirm this
// matches the intended little-endian UCS2 semantics on all supported targets.
kj::Maybe<kj::Array<kj::byte>> TranscodeFromUCS2(kj::ArrayPtr<kj::byte> source,
    Encoding fromEncoding, Encoding toEncoding) {
  UErrorCode status = U_ZERO_ERROR;
  Converter to(toEncoding);
  std::string substitude(to.minSize(), '?');
  to.setSubstitudeChars(substitude);

  // Number of complete code units in the input: bytes divided by the code-unit width.
  // A trailing odd byte cannot form a code unit and is ignored.
  const size_t length_in_chars = source.size() / sizeof(UChar);

  // Copy into UChar-typed storage sized to the input so ICU reads properly aligned code
  // units and inputs of any length are supported.
  auto sourcebuf = kj::heapArray<UChar>(length_in_chars);
  sourcebuf.asBytes().copyFrom(source.first(length_in_chars * sizeof(UChar)));

  // Each code unit expands to at most maxSize() bytes in the target encoding.
  const size_t capacity = length_in_chars * to.maxSize();
  auto destbuf = kj::heapArray<kj::byte>(capacity);
  const auto len = ucnv_fromUChars(to.conv(), destbuf.asChars().begin(), capacity,
      sourcebuf.begin(), length_in_chars, &status);

  if (U_SUCCESS(status)) {
    return destbuf.slice(0, len).attach(kj::mv(destbuf));
  }

  return kj::none;
}

// Placeholder for the UTF-8 -> UCS2 conversion path. No conversion is performed: it
// unconditionally returns kj::none, which transcode() reports as a failure to the caller.
// TODO: implement the actual conversion.
kj::Maybe<kj::Array<kj::byte>> TranscodeUcs2FromUtf8(kj::ArrayPtr<kj::byte> source,
Encoding fromEncoding, Encoding toEncoding) {
return kj::none;
}

// Placeholder for the UCS2 -> UTF-8 conversion path. No conversion is performed: it
// unconditionally returns kj::none, which transcode() reports as a failure to the caller.
// TODO: implement the actual conversion.
kj::Maybe<kj::Array<kj::byte>> TranscodeUtf8FromUcs2(kj::ArrayPtr<kj::byte> source,
Encoding fromEncoding, Encoding toEncoding) {
return kj::none;
}

} // namespace

// Opens an ICU converter for `encoding` (asserting that ICU reports success), takes
// ownership of it so it is closed via ConverterDisposer, and installs `substitude` as the
// converter's substitution characters.
Converter::Converter(Encoding encoding, kj::StringPtr substitude) {
  UErrorCode status = U_ZERO_ERROR;
  UConverter* opened = ucnv_open(getEncodingName(encoding), &status);
  KJ_ASSERT(U_SUCCESS(status));

  conv_ = kj::Own<UConverter>(opened, ConverterDisposer::INSTANCE);
  this->setSubstitudeChars(substitude);
}

// Adopts an already-open ICU converter (it will be closed via ConverterDisposer) and
// installs `substitude` as its substitution characters.
Converter::Converter(UConverter* converter, kj::StringPtr substitude)
    : conv_(kj::Own<UConverter>(converter, ConverterDisposer::INSTANCE)) {
  this->setSubstitudeChars(substitude);
}

// Returns the underlying ICU converter handle as a mutable pointer; constness is cast
// away so the handle can be passed to ICU functions from const contexts.
UConverter* Converter::conv() const {
  auto* handle = conv_.get();
  return const_cast<UConverter*>(handle);
}

// Returns the value of ucnv_getMaxCharSize() for the wrapped converter.
// Asserts that a converter is present.
size_t Converter::maxSize() const {
  KJ_ASSERT_NONNULL(conv_.get());
  const auto maxBytes = ucnv_getMaxCharSize(conv_.get());
  return maxBytes;
}

// Returns the value of ucnv_getMinCharSize() for the wrapped converter.
// Asserts that a converter is present.
size_t Converter::minSize() const {
  KJ_ASSERT_NONNULL(conv_.get());
  const auto minBytes = ucnv_getMinCharSize(conv_.get());
  return minBytes;
}

// Resets the wrapped converter by calling ucnv_reset() on it.
// Asserts that a converter is present.
void Converter::reset() {
  KJ_ASSERT_NONNULL(conv_.get());
  auto* handle = conv_.get();
  ucnv_reset(handle);
}

// Installs `sub` as the converter's substitution characters via ucnv_setSubstChars().
// Asserts that a converter is present and that ICU accepted the substitution sequence.
void Converter::setSubstitudeChars(kj::StringPtr sub) {
  KJ_ASSERT_NONNULL(conv_.get());

  UErrorCode status = U_ZERO_ERROR;
  const char* bytes = sub.begin();
  const auto count = sub.size();
  ucnv_setSubstChars(conv_.get(), bytes, count, &status);

  KJ_ASSERT(U_SUCCESS(status));
}

// Converts `source` from `fromEncoding` to `toEncoding`, picking a conversion strategy
// based on the encoding pair:
//   - ASCII / LATIN1 -> UCS2 : TranscodeToUCS2
//   - UTF8 -> UCS2           : TranscodeUcs2FromUtf8
//   - UCS2 -> UTF8           : TranscodeUtf8FromUcs2
//   - UCS2 -> UCS2           : TranscodeDefault
//   - UCS2 -> anything else  : TranscodeFromUCS2
//   - every other pair       : TranscodeDefault
// Source encodings other than ASCII, LATIN1, UTF8 and UCS2 are treated as unreachable.
// Throws a JS Error ("Unable to transcode buffer") when the chosen strategy yields kj::none.
kj::Array<kj::byte> transcode(kj::ArrayPtr<kj::byte> source, Encoding fromEncoding,
    Encoding toEncoding) {
  TranscodeImpl transcode_function = &TranscodeDefault;
  switch (fromEncoding) {
    case Encoding::ASCII:
    case Encoding::LATIN1:
      if (toEncoding == Encoding::UCS2) {
        transcode_function = &TranscodeToUCS2;
      }
      break;
    case Encoding::UTF8:
      if (toEncoding == Encoding::UCS2) {
        transcode_function = &TranscodeUcs2FromUtf8;
      }
      break;
    case Encoding::UCS2:
      switch (toEncoding) {
        case Encoding::UCS2:
          transcode_function = &TranscodeDefault;
          break;
        case Encoding::UTF8:
          transcode_function = &TranscodeUtf8FromUcs2;
          break;
        default:
          transcode_function = &TranscodeFromUCS2;
          break;
      }
      // Leave the outer switch here; without this break control falls through into the
      // unreachable default below for every UCS2 source.
      break;
    default:
      KJ_UNREACHABLE;
  }

  return JSG_REQUIRE_NONNULL(transcode_function(source, fromEncoding, toEncoding), Error,
      "Unable to transcode buffer");
}

} // namespace i18n

} // namespace workerd::api::node
Loading

0 comments on commit f171dbd

Please sign in to comment.