From da9de3a1ded8b5e2b6908ca8da628e668a84ef65 Mon Sep 17 00:00:00 2001 From: Aditi Khare Date: Tue, 23 Apr 2024 17:08:51 -0400 Subject: [PATCH] temp try to detect overlong encoding w/o textDecoder --- .../require_vendor.mjs | 1 + src/error.ts | 10 + src/test.ts | 9 + src/utils/node_byte_utils.ts | 24 +- src/utils/web_byte_utils.ts | 4 +- src/validate_utf8.ts | 27 ++- test/node/byte_utils.test.ts | 225 +++++++++++++++++- 7 files changed, 289 insertions(+), 11 deletions(-) create mode 100644 src/test.ts diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs index 7d4fa4e91..bdfe9c111 100644 --- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs +++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs @@ -14,6 +14,7 @@ export class RequireVendor { * @returns {{ code: string; map: import('magic-string').SourceMap }} */ transform(code, id) { + // TODO(NODE-4930) if (!id.includes('web_byte_utils')) { return; } diff --git a/src/error.ts b/src/error.ts index ef5184a4a..7dd101c57 100644 --- a/src/error.ts +++ b/src/error.ts @@ -103,3 +103,13 @@ export class BSONOffsetError extends BSONError { this.offset = offset; } } + +export class BSONUTF8Error extends BSONError { + public get name(): 'BSONUTF8Error' { + return 'BSONUTF8Error'; + } + + constructor(message: string, options?: { cause?: unknown }) { + super(message, options); + } +} diff --git a/src/test.ts b/src/test.ts new file mode 100644 index 000000000..2ef044cfa --- /dev/null +++ b/src/test.ts @@ -0,0 +1,9 @@ +function parseUtf8Bits(arr: number[]): number { + arr[0] >>= (arr.length - 1); + for (let i = 1; i < arr.length; i++) { + arr[i] >>= 2; + arr[i] <<= i*8; + arr[0] = arr[0] | arr[i] + } + return arr[0]; +} \ No newline at end of file diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts index d6a641a47..8603b45e1 100644 --- a/src/utils/node_byte_utils.ts +++ b/src/utils/node_byte_utils.ts @@ -1,4 +1,4 @@ -import { BSONError } from '../error'; +import { BSONError, BSONUTF8Error } from '../error'; import { validateUtf8 } from '../validate_utf8'; import { tryReadBasicLatin, tryWriteBasicLatin } from './latin'; @@ -27,6 +27,28 @@ type NodeJsBufferConstructor = Omit & { declare const Buffer: NodeJsBufferConstructor; declare const require: (mod: 'crypto') => { randomBytes: (byteLength: number) => Uint8Array }; +type TextDecoder = { + readonly encoding: string; + readonly fatal: boolean; + readonly ignoreBOM: boolean; + decode(input?: Uint8Array): string; +}; +type TextDecoderConstructor = { + new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder; +}; + +type TextEncoder = { + readonly encoding: string; + encode(input?: string): Uint8Array; +}; +type TextEncoderConstructor = { + new (): TextEncoder; +}; + +// Node byte utils global +declare const TextDecoder: TextDecoderConstructor; +declare const TextEncoder: TextEncoderConstructor; + /** @internal */ export function nodejsMathRandomBytes(byteLength: number) { return nodeJsByteUtils.fromNumberArray( diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts index 77a1f0f74..e7d39b62b 100644 --- a/src/utils/web_byte_utils.ts +++ b/src/utils/web_byte_utils.ts @@ -1,4 +1,4 @@ -import { BSONError } from '../error'; +import { BSONError, BSONUTF8Error } from '../error'; import { tryReadBasicLatin } from './latin'; type TextDecoder = { @@ -183,7 +183,7 @@ export const webByteUtils = { try { return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); } catch (cause) { - throw new BSONError('Invalid UTF-8 string in BSON document', { cause }); + throw new BSONUTF8Error('Invalid UTF-8 string in BSON document', { cause }); } } return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end)); diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts index e1da934c6..ba1acb7e8 100644 --- a/src/validate_utf8.ts +++ b/src/validate_utf8.ts @@ -1,3 +1,5 @@ +import { NumberUtils } from "./utils/number_utils"; + const FIRST_BIT = 0x80; const FIRST_TWO_BITS = 0xc0; const FIRST_THREE_BITS = 0xe0; @@ -9,6 +11,12 @@ const THREE_BIT_CHAR = 0xe0; const FOUR_BIT_CHAR = 0xf0; const CONTINUING_CHAR = 0x80; +// max utf8 values representable in given number of bytes +const ONE_BYTE_MAX = 0x7f; +const TWO_BYTE_MAX = 0x7ff; +const THREE_BYTE_MAX = 0xf7ff; + + /** * Determines if the passed in bytes are valid utf8 * @param bytes - An array of 8-bit bytes. Must be indexable and have length property @@ -30,12 +38,15 @@ export function validateUtf8( return false; } continuation -= 1; - } else if (byte & FIRST_BIT) { + } else if (byte & FIRST_BIT && + parseUtf8Bytes([byte, bytes[i+1]]) > ONE_BYTE_MAX) { if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) { continuation = 1; - } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) { + } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR && + parseUtf8Bytes([byte, bytes[i+1], bytes[i+2]]) > TWO_BYTE_MAX) { continuation = 2; - } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) { + } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR && + parseUtf8Bytes([byte, bytes[i+1], bytes[i+2], bytes[i+3]]) > THREE_BYTE_MAX) { continuation = 3; } else { return false; @@ -45,3 +56,13 @@ export function validateUtf8( return !continuation; } + +function parseUtf8Bytes(arr: number[]): number { + arr[0] >>= (arr.length - 1); + for (let i = 1; i < arr.length; i++) { + arr[i] >>= 2; + arr[i] <<= i*8; + arr[0] = arr[0] | arr[i] + } + return arr[0]; +} \ No newline at end of file diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts index fa6d7f893..0a3e75ae0 100644 --- a/test/node/byte_utils.test.ts +++ b/test/node/byte_utils.test.ts @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils'; import * as sinon from 'sinon'; import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson'; import * as crypto from 'node:crypto'; +import { BSONError, BSONUTF8Error } from '../../src/error'; type ByteUtilTest = { name: string; @@ -399,6 +400,8 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [ } } ]; + + const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ { name: 'should create utf8 string from buffer input', @@ -416,6 +419,14 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ expect(output).to.be.a('string').with.lengthOf(0); } }, + { + name: 'should insert replacement character fatal is false and string is invalid', + inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], + expectation({ error, output }) { + expect(error).to.not.exist; + expect(output).to.equal('abc\uFFFD'); + } + }, { name: 'should throw an error if fatal is set and string is invalid', inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true], @@ -424,14 +435,168 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [ } }, { - name: 'should insert replacement character fatal is false and string is invalid', - inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false], - expectation({ error, output }) { - expect(error).to.not.exist; - expect(output).to.equal('abc\uFFFD'); + name: 'throw an error if fatal is set and string contains overlong encoding', + inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { + name: 'throw an error if fatal is set and string contains invalid bytes', + inputs: [Buffer.from('abcff', 'hex'), 0, 2, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { + name: 'throw an error if fatal is set and string contains an unexpected continuation byte', + inputs: [Buffer.from('7F80', 'hex'), 0, 2, true], + expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFF', 'hex'), 0, 1, true], name: 'throws when provided with invalid code' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xC0', 'hex'), 0, 1, true], name: 'throws when provided with ends early' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE0', 'hex'), 0, 1, true], name: 'throws when provided with ends early 2' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xC000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xC0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 2' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 3' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 4' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE08000', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 5' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE080C0', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 6' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with > 0x10FFFF' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFE8080808080', 'hex'), 0, 6, true], name: 'throws when provided with obsolete lead byte' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + + // Overlong encodings + { inputs: [Buffer.from('0xC080', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+0000 - 2 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE08080', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+0000 - 3 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF0808080', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+0000 - 4 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF880808080', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+0000 - 5 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+0000 - 6 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + + { inputs: [Buffer.from('0xC1BF', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+007F - 2 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xE081BF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+007F - 3 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF08081BF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+007F - 4 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF8808081BF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+007F - 5 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFC80808081BF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+007F - 6 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + + { inputs: [Buffer.from('0xE09FBF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+07FF - 3 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF0809FBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+07FF - 4 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF880809FBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+07FF - 5 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFC8080809FBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+07FF - 6 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + + { inputs: [Buffer.from('0xF08FBFBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+FFFF - 4 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xF8808FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+FFFF - 5 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFC80808FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+FFFF - 6 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + + { inputs: [Buffer.from('0xF8848FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+10FFFF - 5 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xFC80848FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+10FFFF - 6 bytes' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + + // UTF-16 surrogates encoded as code points in UTF-8 + { inputs: [Buffer.from('0xEDA080', 'hex'), 0, 3, true], name: 'throws when provided with lead surrogate' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xEDB080', 'hex'), 0, 3, true], name: 'throws when provided with trail surrogate' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); + } + }, + { inputs: [Buffer.from('0xEDA080EDB080', 'hex'), 0, 6, true], name: 'throws when provided with surrogate pair' , expectation({ error }) { + expect(error).to.match(/Invalid UTF-8 string in BSON document/i); } } ]; + const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [ { name: 'should return zero for empty string', @@ -801,4 +966,54 @@ describe('ByteUtils', () => { }); } } + + let bad = [ + { encoding: 'utf-8', input: [0xFF], name: 'invalid code' }, + { encoding: 'utf-8', input: [0xC0], name: 'ends early' }, + { encoding: 'utf-8', input: [0xE0], name: 'ends early 2' }, + { encoding: 'utf-8', input: [0xC0, 0x00], name: 'invalid trail' }, + { encoding: 'utf-8', input: [0xC0, 0xC0], name: 'invalid trail 2' }, + { encoding: 'utf-8', input: [0xE0, 0x00], name: 'invalid trail 3' }, + { encoding: 'utf-8', input: [0xE0, 0xC0], name: 'invalid trail 4' }, + { encoding: 'utf-8', input: [0xE0, 0x80, 0x00], name: 'invalid trail 5' }, + { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0], name: 'invalid trail 6' }, + { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10FFFF' }, + { encoding: 'utf-8', input: [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' }, + + // Overlong encodings + { encoding: 'utf-8', input: [0xC0, 0x80], name: 'overlong U+0000 - 2 bytes' }, + { encoding: 'utf-8', input: [0xE0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' }, + { encoding: 'utf-8', input: [0xF0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' }, + { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' }, + { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' }, + + { encoding: 'utf-8', input: [0xC1, 0xBF], name: 'overlong U+007F - 2 bytes' }, + { encoding: 'utf-8', input: [0xE0, 0x81, 0xBF], name: 'overlong U+007F - 3 bytes' }, + { encoding: 'utf-8', input: [0xF0, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 4 bytes' }, + { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 5 bytes' }, + { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 6 bytes' }, + + { encoding: 'utf-8', input: [0xE0, 0x9F, 0xBF], name: 'overlong U+07FF - 3 bytes' }, + { encoding: 'utf-8', input: [0xF0, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 4 bytes' }, + { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 5 bytes' }, + { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 6 bytes' }, + + { encoding: 'utf-8', input: [0xF0, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 4 bytes' }, + { encoding: 'utf-8', input: [0xF8, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 5 bytes' }, + { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 6 bytes' }, + + { encoding: 'utf-8', input: [0xF8, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 5 bytes' }, + { encoding: 'utf-8', input: [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 6 bytes' }, + + // UTF-16 surrogates encoded as code points in UTF-8 + { encoding: 'utf-8', input: [0xED, 0xA0, 0x80], name: 'lead surrogate' }, + { encoding: 'utf-8', input: [0xED, 0xB0, 0x80], name: 'trail surrogate' }, + { encoding: 'utf-8', input: [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], name: 'surrogate pair' }, + ]; + + for (const test of bad) { + it.only(`${test.name}`, () => { + expect(() => nodeJsByteUtils.toUTF8(Uint8Array.from(test.input), 0, test.input.length, true)).to.throw(BSONError); + }); + } });