Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(NODE-6123): utf8 validation is insufficiently strict #676

Merged
merged 8 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 22 additions & 12 deletions etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import MagicString from 'magic-string';

const REQUIRE_POLYFILLS =
`const { TextEncoder, TextDecoder } = require('../vendor/text-encoding');
const REQUIRE_WEB_UTILS_POLYFILLS =
`const { TextEncoder } = require('../vendor/text-encoding');
const { encode: btoa, decode: atob } = require('../vendor/base64');\n`

const REQUIRE_PARSE_UTF8_POLYFILLS =
`const { TextDecoder } = require('../vendor/text-encoding');\n`;

export class RequireVendor {
/**
* Take the compiled source code input; types are expected to already have been removed.
Expand All @@ -14,17 +17,24 @@ export class RequireVendor {
* @returns {{ code: string; map: import('magic-string').SourceMap }}
*/
transform(code, id) {
if (!id.includes('web_byte_utils')) {
return;
}
if (id.includes('parse_utf8')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS);

// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_POLYFILLS);
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
} else if (id.includes('web_byte_utils')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS);

return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
}
}
}
35 changes: 35 additions & 0 deletions src/parse_utf8.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { BSONError } from './error';

type TextDecoder = {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(input?: Uint8Array): string;
};
type TextDecoderConstructor = {
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
};

// parse utf8 globals
declare const TextDecoder: TextDecoderConstructor;
let TextDecoderFatal: TextDecoder;
let TextDecoderNonFatal: TextDecoder;

/**
* Determines if the passed in bytes are valid utf8
* @param bytes - An array of 8-bit bytes. Must be indexable and have length property
* @param start - The index to start validating
* @param end - The index to end validating
*/
export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
if (fatal) {
TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
try {
return TextDecoderFatal.decode(buffer.subarray(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
return TextDecoderNonFatal.decode(buffer.subarray(start, end));
}
8 changes: 1 addition & 7 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
import { Timestamp } from '../timestamp';
import { ByteUtils } from '../utils/byte_utils';
import { NumberUtils } from '../utils/number_utils';
import { validateUtf8 } from '../validate_utf8';

/** @public */
export interface DeserializeOptions {
Expand Down Expand Up @@ -604,12 +603,7 @@ function deserializeObject(
)
throw new BSONError('bad string length in bson');
// Namespace
if (validation != null && validation.utf8) {
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
// Update parse index position
index = index + stringSize;

Expand Down
7 changes: 2 additions & 5 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { BSONError } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { parseUtf8 } from '../parse_utf8';
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';

type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
Expand Down Expand Up @@ -136,12 +136,9 @@ export const nodeJsByteUtils = {

const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
if (fatal) {
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
for (let i = 0; i < string.length; i++) {
if (string.charCodeAt(i) === 0xfffd) {
if (!validateUtf8(buffer, start, end)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
parseUtf8(buffer, start, end, true);
break;
}
}
Expand Down
10 changes: 2 additions & 8 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { BSONError } from '../error';
import { tryReadBasicLatin } from './latin';
import { parseUtf8 } from '../parse_utf8';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -179,14 +180,7 @@ export const webByteUtils = {
return basicLatin;
}

if (fatal) {
try {
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
return parseUtf8(uint8array, start, end, fatal);
},

utf8ByteLength(input: string): number {
Expand Down
47 changes: 0 additions & 47 deletions src/validate_utf8.ts

This file was deleted.

24 changes: 16 additions & 8 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
import * as sinon from 'sinon';
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
import * as crypto from 'node:crypto';
import { utf8WebPlatformSpecTests } from './data/utf8_wpt_error_cases';

type ByteUtilTest<K extends keyof ByteUtils> = {
name: string;
Expand Down Expand Up @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
}
}
];

const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
Expand All @@ -416,22 +418,28 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
expect(output).to.be.a('string').with.lengthOf(0);
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
}
},
...utf8WebPlatformSpecTests.map(t => ({
name: t.name,
inputs: [Uint8Array.from(t.input), 0, t.input.length, true] as [
buffer: Uint8Array,
start: number,
end: number,
fatal: boolean
],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
}))
];

const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
{
name: 'should return zero for empty string',
Expand Down
67 changes: 67 additions & 0 deletions test/node/data/utf8_wpt_error_cases.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
// commit sha: 7c9f867
// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544

export const utf8WebPlatformSpecTests = [
{ encoding: 'utf-8', input: [0xff], name: 'invalid code' },
{ encoding: 'utf-8', input: [0xc0], name: 'ends early' },
{ encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
{ encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
{ encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
{ encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
{ encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
{ encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
{ encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

// Overlong encodings
{ encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
name: 'overlong U+0000 - 6 bytes'
},

{ encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
{ encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
name: 'overlong U+007f - 6 bytes'
},

{ encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
name: 'overlong U+07ff - 6 bytes'
},

{ encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
name: 'overlong U+ffff - 6 bytes'
},

{ encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
name: 'overlong U+10ffff - 6 bytes'
},

// UTf-16 surrogates encoded as code points in UTf-8
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
{ encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
];
46 changes: 45 additions & 1 deletion test/node/parser/deserializer.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as BSON from '../../register-bson';
import { expect } from 'chai';
import { bufferFromHexArray } from '../tools/utils';
import { bufferFromHexArray, int32LEToHex } from '../tools/utils';
import { utf8WebPlatformSpecTests } from '../data/utf8_wpt_error_cases';

describe('deserializer()', () => {
describe('when the fieldsAsRaw options is present and has a value that corresponds to a key in the object', () => {
Expand Down Expand Up @@ -58,4 +59,47 @@ describe('deserializer()', () => {
expect(resultCodeWithScope).to.have.deep.nested.property('a.scope', { b: true });
});
});

describe('utf8 validation', () => {
for (const test of utf8WebPlatformSpecTests) {
const inputStringSize = int32LEToHex(test.input.length + 1); // int32 size of string
const inputHexString = Buffer.from(test.input).toString('hex');
const buffer = bufferFromHexArray([
'02', // string
'6100', // 'a' key with null terminator
inputStringSize,
inputHexString,
'00'
]);
context(`when utf8 validation is on and input is ${test.name}`, () => {
it(`throws error containing 'Invalid UTF-8'`, () => {
// global case
expect(() => BSON.deserialize(buffer, { validation: { utf8: true } })).to.throw(
BSON.BSONError,
/Invalid UTF-8 string in BSON document/i
);

// specific case
expect(() => BSON.deserialize(buffer, { validation: { utf8: { a: true } } })).to.throw(
BSON.BSONError,
/Invalid UTF-8 string in BSON document/i
);
});
});

context(`when utf8 validation is off and input is ${test.name}`, () => {
it('returns a string containing at least 1 replacement character', () => {
// global case
expect(BSON.deserialize(buffer, { validation: { utf8: false } }))
.to.have.property('a')
.that.includes('\uFFFD');

// specific case
expect(BSON.deserialize(buffer, { validation: { utf8: { a: false } } }))
.to.have.property('a')
.that.includes('\uFFFD');
});
});
}
});
});
2 changes: 1 addition & 1 deletion test/node/release.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ const REQUIRED_FILES = [
'src/utils/number_utils.ts',
'src/utils/web_byte_utils.ts',
'src/utils/latin.ts',
'src/validate_utf8.ts',
'src/parse_utf8.ts',
'vendor/base64/base64.js',
'vendor/base64/package.json',
'vendor/base64/LICENSE-MIT.txt',
Expand Down