From a30272b33395317fe79d39c42dd2d1e313561080 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Sat, 26 Oct 2019 16:27:51 +0200 Subject: [PATCH] encoding: make TextDecoder handle BOM correctly Do not accept the BOM if it comes from a different encoding, and only discard the BOM after it has actually been read (including when it is spread over multiple chunks in streaming mode). Fixes: https://github.com/nodejs/node/issues/25315 --- lib/internal/encoding.js | 27 ++++++++++++------------- src/node_buffer.cc | 8 ++++---- src/node_i18n.cc | 37 +++++++++++++++++++++++++---------- src/node_internals.h | 6 +++++- test/wpt/status/encoding.json | 5 +---- 5 files changed, 49 insertions(+), 34 deletions(-) diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index dabcd5eaccebe0..16de0c986e6cb1 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -484,25 +484,22 @@ function makeTextDecoderJS() { this[kFlags] |= CONVERTER_FLAGS_FLUSH; } - if (!this[kBOMSeen] && !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) { - if (this[kEncoding] === 'utf-8') { - if (input.length >= 3 && - input[0] === 0xEF && input[1] === 0xBB && input[2] === 0xBF) { - input = input.slice(3); - } - } else if (this[kEncoding] === 'utf-16le') { - if (input.length >= 2 && input[0] === 0xFF && input[1] === 0xFE) { - input = input.slice(2); - } + let result = this[kFlags] & CONVERTER_FLAGS_FLUSH ? + this[kHandle].end(input) : + this[kHandle].write(input); + + if (result.length > 0 && + !this[kBOMSeen] && + !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) { + // If the very first result in the stream is a BOM, and we are not + // explicitly told to ignore it, then we discard it. 
+ if (result[0] === '\ufeff') { + result = result.slice(1); } this[kBOMSeen] = true; } - if (this[kFlags] & CONVERTER_FLAGS_FLUSH) { - return this[kHandle].end(input); - } - - return this[kHandle].write(input); + return result; } } diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 74684110a965dd..3aa1ea2535a47d 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -219,10 +219,10 @@ size_t Length(Local obj) { } -inline MaybeLocal New(Environment* env, - Local ab, - size_t byte_offset, - size_t length) { +MaybeLocal New(Environment* env, + Local ab, + size_t byte_offset, + size_t length) { CHECK(!env->buffer_prototype_object().IsEmpty()); Local ui = Uint8Array::New(ab, byte_offset, length); Maybe mb = diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 162f5fda5d4adb..ecc0528e76f8c6 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -95,6 +95,7 @@ using v8::NewStringType; using v8::Object; using v8::ObjectTemplate; using v8::String; +using v8::Uint8Array; using v8::Value; namespace i18n { @@ -227,14 +228,6 @@ class ConverterObject : public BaseObject, Converter { const char* source = input.data(); size_t source_length = input.length(); - if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) { - int32_t bomOffset = 0; - ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status); - source += bomOffset; - source_length -= bomOffset; - converter->bomSeen_ = true; - } - UChar* target = *result; ucnv_toUnicode(converter->conv, &target, target + (limit * sizeof(UChar)), @@ -242,10 +235,34 @@ class ConverterObject : public BaseObject, Converter { nullptr, flush, &status); if (U_SUCCESS(status)) { - if (limit > 0) + bool omit_initial_bom = false; + if (limit > 0) { result.SetLength(target - &result[0]); + if (result.length() > 0 && + converter->unicode_ && + !converter->ignoreBOM_ && + !converter->bomSeen_) { + // If the very first result in the stream is a BOM, and we are not + // explicitly told to ignore it, then we 
mark it for discarding. + if (result[0] == 0xFEFF) { + omit_initial_bom = true; + } + converter->bomSeen_ = true; + } + } ret = ToBufferEndian(env, &result); - args.GetReturnValue().Set(ret.ToLocalChecked()); + if (omit_initial_bom && !ret.IsEmpty()) { + // Perform `ret = ret.slice(2)`. + CHECK(ret.ToLocalChecked()->IsUint8Array()); + Local orig_ret = ret.ToLocalChecked().As(); + ret = Buffer::New(env, + orig_ret->Buffer(), + orig_ret->ByteOffset() + 2, + orig_ret->ByteLength() - 2) + .FromMaybe(Local()); + } + if (!ret.IsEmpty()) + args.GetReturnValue().Set(ret.ToLocalChecked()); return; } diff --git a/src/node_internals.h b/src/node_internals.h index 0f4c32bb941a31..94d0c3220e05d3 100644 --- a/src/node_internals.h +++ b/src/node_internals.h @@ -158,7 +158,11 @@ v8::MaybeLocal New(Environment* env, char* data, size_t length, bool uses_malloc); - +// Creates a Buffer instance over an existing ArrayBuffer. +v8::MaybeLocal New(Environment* env, + v8::Local ab, + size_t byte_offset, + size_t length); // Construct a Buffer from a MaybeStackBuffer (and also its subclasses like // Utf8Value and TwoByteValue). // If |buf| is invalidated, an empty MaybeLocal is returned, and nothing is diff --git a/test/wpt/status/encoding.json b/test/wpt/status/encoding.json index 088eed802f0fe7..b51dde2aaee479 100644 --- a/test/wpt/status/encoding.json +++ b/test/wpt/status/encoding.json @@ -22,10 +22,7 @@ "fail": "iso-2022-jp decoder state handling bug: https://encoding.spec.whatwg.org/#iso-2022-jp-decoder" }, "textdecoder-byte-order-marks.any.js": { - "fail": "Mismatching BOM should not be ignored" - }, - "textdecoder-copy.any.js": { - "fail": "Should not have output BOM: https://encoding.spec.whatwg.org/#concept-td-serialize" + "requires": ["small-icu"] }, "textdecoder-fatal-single-byte.any.js": { "requires": ["full-icu"],