Skip to content

Commit

Permalink
fix(NODE-6123): utf8 validation is insufficiently strict (#676)
Browse files Browse the repository at this point in the history
  • Loading branch information
aditi-khare-mongoDB authored Apr 30, 2024
1 parent c58d1e2 commit ae8bac7
Show file tree
Hide file tree
Showing 10 changed files with 191 additions and 89 deletions.
34 changes: 22 additions & 12 deletions etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import MagicString from 'magic-string';

const REQUIRE_POLYFILLS =
`const { TextEncoder, TextDecoder } = require('../vendor/text-encoding');
const REQUIRE_WEB_UTILS_POLYFILLS =
`const { TextEncoder } = require('../vendor/text-encoding');
const { encode: btoa, decode: atob } = require('../vendor/base64');\n`

const REQUIRE_PARSE_UTF8_POLYFILLS =
`const { TextDecoder } = require('../vendor/text-encoding');\n`;

export class RequireVendor {
/**
* Take the compiled source code input; types are expected to already have been removed.
Expand All @@ -14,17 +17,24 @@ export class RequireVendor {
* @returns {{ code: string; map: import('magic-string').SourceMap }}
*/
transform(code, id) {
if (!id.includes('web_byte_utils')) {
return;
}
if (id.includes('parse_utf8')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_PARSE_UTF8_POLYFILLS);

// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_POLYFILLS);
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
} else if (id.includes('web_byte_utils')) {
// MagicString lets us edit the source code and still generate an accurate source map
const magicString = new MagicString(code);
magicString.prepend(REQUIRE_WEB_UTILS_POLYFILLS);

return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
return {
code: magicString.toString(),
map: magicString.generateMap({ hires: true })
};
}
}
}
35 changes: 35 additions & 0 deletions src/parse_utf8.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import { BSONError } from './error';

type TextDecoder = {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(input?: Uint8Array): string;
};
type TextDecoderConstructor = {
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
};

// parse utf8 globals
declare const TextDecoder: TextDecoderConstructor;
let TextDecoderFatal: TextDecoder;
let TextDecoderNonFatal: TextDecoder;

/**
* Determines if the passed in bytes are valid utf8
* @param bytes - An array of 8-bit bytes. Must be indexable and have length property
* @param start - The index to start validating
* @param end - The index to end validating
*/
export function parseUtf8(buffer: Uint8Array, start: number, end: number, fatal: boolean): string {
if (fatal) {
TextDecoderFatal ??= new TextDecoder('utf8', { fatal: true });
try {
return TextDecoderFatal.decode(buffer.subarray(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
TextDecoderNonFatal ??= new TextDecoder('utf8', { fatal: false });
return TextDecoderNonFatal.decode(buffer.subarray(start, end));
}
8 changes: 1 addition & 7 deletions src/parser/deserializer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import { BSONSymbol } from '../symbol';
import { Timestamp } from '../timestamp';
import { ByteUtils } from '../utils/byte_utils';
import { NumberUtils } from '../utils/number_utils';
import { validateUtf8 } from '../validate_utf8';

/** @public */
export interface DeserializeOptions {
Expand Down Expand Up @@ -604,12 +603,7 @@ function deserializeObject(
)
throw new BSONError('bad string length in bson');
// Namespace
if (validation != null && validation.utf8) {
if (!validateUtf8(buffer, index, index + stringSize - 1)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
}
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, false);
const namespace = ByteUtils.toUTF8(buffer, index, index + stringSize - 1, shouldValidateKey);
// Update parse index position
index = index + stringSize;

Expand Down
7 changes: 2 additions & 5 deletions src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { BSONError } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { parseUtf8 } from '../parse_utf8';
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';

type NodeJsEncoding = 'base64' | 'hex' | 'utf8' | 'binary';
Expand Down Expand Up @@ -136,12 +136,9 @@ export const nodeJsByteUtils = {

const string = nodeJsByteUtils.toLocalBufferType(buffer).toString('utf8', start, end);
if (fatal) {
// TODO(NODE-4930): Insufficiently strict BSON UTF8 validation
for (let i = 0; i < string.length; i++) {
if (string.charCodeAt(i) === 0xfffd) {
if (!validateUtf8(buffer, start, end)) {
throw new BSONError('Invalid UTF-8 string in BSON document');
}
parseUtf8(buffer, start, end, true);
break;
}
}
Expand Down
10 changes: 2 additions & 8 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { BSONError } from '../error';
import { tryReadBasicLatin } from './latin';
import { parseUtf8 } from '../parse_utf8';

type TextDecoder = {
readonly encoding: string;
Expand Down Expand Up @@ -179,14 +180,7 @@ export const webByteUtils = {
return basicLatin;
}

if (fatal) {
try {
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
}
}
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
return parseUtf8(uint8array, start, end, fatal);
},

utf8ByteLength(input: string): number {
Expand Down
47 changes: 0 additions & 47 deletions src/validate_utf8.ts

This file was deleted.

24 changes: 16 additions & 8 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
import * as sinon from 'sinon';
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
import * as crypto from 'node:crypto';
import { utf8WebPlatformSpecTests } from './data/utf8_wpt_error_cases';

type ByteUtilTest<K extends keyof ByteUtils> = {
name: string;
Expand Down Expand Up @@ -399,6 +400,7 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
}
}
];

const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
Expand All @@ -416,22 +418,28 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
expect(output).to.be.a('string').with.lengthOf(0);
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
}
},
...utf8WebPlatformSpecTests.map(t => ({
name: t.name,
inputs: [Uint8Array.from(t.input), 0, t.input.length, true] as [
buffer: Uint8Array,
start: number,
end: number,
fatal: boolean
],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
}))
];

const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
{
name: 'should return zero for empty string',
Expand Down
67 changes: 67 additions & 0 deletions test/node/data/utf8_wpt_error_cases.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// extra error cases copied from wpt/encoding/textdecoder-fatal.any.js
// commit sha: 7c9f867
// link: https://github.com/web-platform-tests/wpt/commit/7c9f8674d9809731e8919073d957d6233f6e0544

export const utf8WebPlatformSpecTests = [
{ encoding: 'utf-8', input: [0xff], name: 'invalid code' },
{ encoding: 'utf-8', input: [0xc0], name: 'ends early' },
{ encoding: 'utf-8', input: [0xe0], name: 'ends early 2' },
{ encoding: 'utf-8', input: [0xc0, 0x00], name: 'invalid trail' },
{ encoding: 'utf-8', input: [0xc0, 0xc0], name: 'invalid trail 2' },
{ encoding: 'utf-8', input: [0xe0, 0x00], name: 'invalid trail 3' },
{ encoding: 'utf-8', input: [0xe0, 0xc0], name: 'invalid trail 4' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x00], name: 'invalid trail 5' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0xc0], name: 'invalid trail 6' },
{ encoding: 'utf-8', input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10ffff' },
{ encoding: 'utf-8', input: [0xfe, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

// Overlong encodings
{ encoding: 'utf-8', input: [0xc0, 0x80], name: 'overlong U+0000 - 2 bytes' },
{ encoding: 'utf-8', input: [0xe0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x80, 0x80],
name: 'overlong U+0000 - 6 bytes'
},

{ encoding: 'utf-8', input: [0xc1, 0xbf], name: 'overlong U+007f - 2 bytes' },
{ encoding: 'utf-8', input: [0xe0, 0x81, 0xbf], name: 'overlong U+007f - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x81, 0xbf], name: 'overlong U+007f - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x81, 0xbf],
name: 'overlong U+007f - 6 bytes'
},

{ encoding: 'utf-8', input: [0xe0, 0x9f, 0xbf], name: 'overlong U+07ff - 3 bytes' },
{ encoding: 'utf-8', input: [0xf0, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x80, 0x9f, 0xbf], name: 'overlong U+07ff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x80, 0x9f, 0xbf],
name: 'overlong U+07ff - 6 bytes'
},

{ encoding: 'utf-8', input: [0xf0, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 4 bytes' },
{ encoding: 'utf-8', input: [0xf8, 0x80, 0x8f, 0xbf, 0xbf], name: 'overlong U+ffff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x80, 0x8f, 0xbf, 0xbf],
name: 'overlong U+ffff - 6 bytes'
},

{ encoding: 'utf-8', input: [0xf8, 0x84, 0x8f, 0xbf, 0xbf], name: 'overlong U+10ffff - 5 bytes' },
{
encoding: 'utf-8',
input: [0xfc, 0x80, 0x84, 0x8f, 0xbf, 0xbf],
name: 'overlong U+10ffff - 6 bytes'
},

// UTf-16 surrogates encoded as code points in UTf-8
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80], name: 'lead surrogate' },
{ encoding: 'utf-8', input: [0xed, 0xb0, 0x80], name: 'trail surrogate' },
{ encoding: 'utf-8', input: [0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80], name: 'surrogate pair' }
];
46 changes: 45 additions & 1 deletion test/node/parser/deserializer.test.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as BSON from '../../register-bson';
import { expect } from 'chai';
import { bufferFromHexArray } from '../tools/utils';
import { bufferFromHexArray, int32LEToHex } from '../tools/utils';
import { utf8WebPlatformSpecTests } from '../data/utf8_wpt_error_cases';

describe('deserializer()', () => {
describe('when the fieldsAsRaw options is present and has a value that corresponds to a key in the object', () => {
Expand Down Expand Up @@ -58,4 +59,47 @@ describe('deserializer()', () => {
expect(resultCodeWithScope).to.have.deep.nested.property('a.scope', { b: true });
});
});

describe('utf8 validation', () => {
for (const test of utf8WebPlatformSpecTests) {
const inputStringSize = int32LEToHex(test.input.length + 1); // int32 size of string
const inputHexString = Buffer.from(test.input).toString('hex');
const buffer = bufferFromHexArray([
'02', // string
'6100', // 'a' key with null terminator
inputStringSize,
inputHexString,
'00'
]);
context(`when utf8 validation is on and input is ${test.name}`, () => {
it(`throws error containing 'Invalid UTF-8'`, () => {
// global case
expect(() => BSON.deserialize(buffer, { validation: { utf8: true } })).to.throw(
BSON.BSONError,
/Invalid UTF-8 string in BSON document/i
);

// specific case
expect(() => BSON.deserialize(buffer, { validation: { utf8: { a: true } } })).to.throw(
BSON.BSONError,
/Invalid UTF-8 string in BSON document/i
);
});
});

context(`when utf8 validation is off and input is ${test.name}`, () => {
it('returns a string containing at least 1 replacement character', () => {
// global case
expect(BSON.deserialize(buffer, { validation: { utf8: false } }))
.to.have.property('a')
.that.includes('\uFFFD');

// specific case
expect(BSON.deserialize(buffer, { validation: { utf8: { a: false } } }))
.to.have.property('a')
.that.includes('\uFFFD');
});
});
}
});
});
2 changes: 1 addition & 1 deletion test/node/release.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ const REQUIRED_FILES = [
'src/utils/number_utils.ts',
'src/utils/web_byte_utils.ts',
'src/utils/latin.ts',
'src/validate_utf8.ts',
'src/parse_utf8.ts',
'vendor/base64/base64.js',
'vendor/base64/package.json',
'vendor/base64/LICENSE-MIT.txt',
Expand Down

0 comments on commit ae8bac7

Please sign in to comment.