Skip to content

Commit

Permalink
temp try to detect overlong encoding w/o textDecoder
Browse files Browse the repository at this point in the history
  • Loading branch information
aditi-khare-mongoDB committed Apr 24, 2024
1 parent c386127 commit da9de3a
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 11 deletions.
1 change: 1 addition & 0 deletions etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export class RequireVendor {
* @returns {{ code: string; map: import('magic-string').SourceMap }}
*/
transform(code, id) {
// TODO(NODE-4930)
if (!id.includes('web_byte_utils')) {
return;
}
Expand Down
10 changes: 10 additions & 0 deletions src/error.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,13 @@ export class BSONOffsetError extends BSONError {
this.offset = offset;
}
}

/**
 * BSONError subtype used to report invalid UTF-8 string data.
 */
export class BSONUTF8Error extends BSONError {
  /**
   * @param message - human readable description of the failure
   * @param options - optional settings forwarded unchanged to BSONError;
   *   `cause` may carry the underlying error
   */
  constructor(message: string, options?: { cause?: unknown }) {
    super(message, options);
  }

  /** Always the literal string 'BSONUTF8Error'. */
  get name(): 'BSONUTF8Error' {
    return 'BSONUTF8Error';
  }
}
9 changes: 9 additions & 0 deletions src/test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Packs the bytes of one candidate UTF-8 sequence into a single integer.
 *
 * The lead byte is shifted right by `arr.length - 1`; every following byte is
 * shifted right by 2 and OR-ed in at bit offset `i * 8`.
 *
 * The input array is only read, never written, so callers can safely pass a
 * view of data they still need afterwards.
 *
 * NOTE(review): the packed value is NOT the Unicode code point of the
 * sequence (for example `[0xC0, 0x80]`, an overlong U+0000, yields `0x2060`).
 * If this is meant to feed overlong-encoding checks, the masks/shifts need to
 * be revisited - confirm the intended semantics before relying on it.
 *
 * @param arr - lead byte followed by its trailing bytes
 * @returns the packed integer described above
 */
function parseUtf8Bits(arr: readonly number[]): number {
  let packed = arr[0] >> (arr.length - 1);
  for (let i = 1; i < arr.length; i++) {
    packed |= (arr[i] >> 2) << (i * 8);
  }
  return packed;
}
24 changes: 23 additions & 1 deletion src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { BSONError } from '../error';
import { BSONError, BSONUTF8Error } from '../error';
import { validateUtf8 } from '../validate_utf8';
import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';

Expand Down Expand Up @@ -27,6 +27,28 @@ type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
declare const Buffer: NodeJsBufferConstructor;
declare const require: (mod: 'crypto') => { randomBytes: (byteLength: number) => Uint8Array };

/** Instance shape of the TextDecoder global, limited to the members declared here. */
type TextDecoder = {
readonly encoding: string;
readonly fatal: boolean;
readonly ignoreBOM: boolean;
decode(input?: Uint8Array): string;
};
/**
 * Constructor shape for TextDecoder: the label is restricted to the literal
 * 'utf8' and an options object with an explicit `fatal` flag is required.
 */
type TextDecoderConstructor = {
new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
};

/** Instance shape of the TextEncoder global, limited to the members declared here. */
type TextEncoder = {
readonly encoding: string;
encode(input?: string): Uint8Array;
};
/** Constructor shape for TextEncoder; takes no arguments. */
type TextEncoderConstructor = {
new (): TextEncoder;
};

// Node byte utils global
// Ambient declarations only (no runtime code is emitted for them): they tell
// the compiler that TextDecoder/TextEncoder exist with the shapes above.
// NOTE(review): presumably declared locally, like Buffer and require above, so
// this module does not depend on ambient lib typings - confirm.
declare const TextDecoder: TextDecoderConstructor;
declare const TextEncoder: TextEncoderConstructor;

/** @internal */
export function nodejsMathRandomBytes(byteLength: number) {
return nodeJsByteUtils.fromNumberArray(
Expand Down
4 changes: 2 additions & 2 deletions src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { BSONError } from '../error';
import { BSONError, BSONUTF8Error } from '../error';
import { tryReadBasicLatin } from './latin';

type TextDecoder = {
Expand Down Expand Up @@ -183,7 +183,7 @@ export const webByteUtils = {
try {
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
} catch (cause) {
throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
throw new BSONUTF8Error('Invalid UTF-8 string in BSON document', { cause });
}
}
return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
Expand Down
27 changes: 24 additions & 3 deletions src/validate_utf8.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { NumberUtils } from "./utils/number_utils";

const FIRST_BIT = 0x80;
const FIRST_TWO_BITS = 0xc0;
const FIRST_THREE_BITS = 0xe0;
Expand All @@ -9,6 +11,12 @@ const THREE_BIT_CHAR = 0xe0;
const FOUR_BIT_CHAR = 0xf0;
const CONTINUING_CHAR = 0x80;

// Largest code point representable by a UTF-8 sequence of the given length.
// Payload bits per length: 1 byte = 7, 2 bytes = 5 + 6 = 11, 3 bytes = 4 + 6 + 6 = 16.
const ONE_BYTE_MAX = 0x7f; // 2^7 - 1
const TWO_BYTE_MAX = 0x7ff; // 2^11 - 1
const THREE_BYTE_MAX = 0xffff; // 2^16 - 1


/**
* Determines if the passed in bytes are valid utf8
* @param bytes - An array of 8-bit bytes. Must be indexable and have length property
Expand All @@ -30,12 +38,15 @@ export function validateUtf8(
return false;
}
continuation -= 1;
} else if (byte & FIRST_BIT) {
} else if (byte & FIRST_BIT &&
parseUtf8Bytes([byte, bytes[i+1]]) > ONE_BYTE_MAX) {
if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
continuation = 1;
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
} else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR &&
parseUtf8Bytes([byte, bytes[i+1], bytes[i+2]]) > TWO_BYTE_MAX) {
continuation = 2;
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
} else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR &&
parseUtf8Bytes([byte, bytes[i+1], bytes[i+2], bytes[i+3]]) > THREE_BYTE_MAX) {
continuation = 3;
} else {
return false;
Expand All @@ -45,3 +56,13 @@ export function validateUtf8(

return !continuation;
}

/**
 * Packs the bytes of one candidate UTF-8 sequence into a single integer.
 *
 * The lead byte is shifted right by `arr.length - 1`; every following byte is
 * shifted right by 2 and OR-ed in at bit offset `i * 8`. Entries that are
 * `undefined` at runtime (reads past the end of the source bytes) contribute 0.
 *
 * The input array is only read, never written.
 *
 * NOTE(review): the packed value is NOT the Unicode code point of the
 * sequence (for example `[0xC0, 0x80]`, an overlong U+0000, yields `0x2060`,
 * which is greater than ONE_BYTE_MAX). Comparing it against the *_MAX
 * code-point limits therefore does not detect overlong encodings; the
 * numeric result is kept as-is here because the comparisons in validateUtf8
 * currently depend on it - revisit both together.
 *
 * @param arr - lead byte followed by its trailing bytes
 * @returns the packed integer described above
 */
function parseUtf8Bytes(arr: readonly number[]): number {
  let packed = arr[0] >> (arr.length - 1);
  for (let i = 1; i < arr.length; i++) {
    packed |= (arr[i] >> 2) << (i * 8);
  }
  return packed;
}
225 changes: 220 additions & 5 deletions test/node/byte_utils.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
import * as sinon from 'sinon';
import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
import * as crypto from 'node:crypto';
import { BSONError, BSONUTF8Error } from '../../src/error';

type ByteUtilTest<K extends keyof ByteUtils> = {
name: string;
Expand Down Expand Up @@ -399,6 +400,8 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
}
}
];


const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
{
name: 'should create utf8 string from buffer input',
Expand All @@ -416,6 +419,14 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
expect(output).to.be.a('string').with.lengthOf(0);
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
}
},
{
name: 'should throw an error if fatal is set and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
Expand All @@ -424,14 +435,168 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
}
},
{
name: 'should insert replacement character fatal is false and string is invalid',
inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
expectation({ error, output }) {
expect(error).to.not.exist;
expect(output).to.equal('abc\uFFFD');
name: 'throw an error if fatal is set and string contains overlong encoding',
inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains invalid bytes',
inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{
name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFF', 'hex'), 0, 1, true], name: 'throws when provided with invalid code' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xC0', 'hex'), 0, 1, true], name: 'throws when provided with ends early' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE0', 'hex'), 0, 1, true], name: 'throws when provided with ends early 2' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xC000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xC0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 2' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 3' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 4' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE08000', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 5' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE080C0', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 6' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with > 0x10FFFF' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFE8080808080', 'hex'), 0, 6, true], name: 'throws when provided with obsolete lead byte' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},

// Overlong encodings
{ inputs: [Buffer.from('0xC080', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+0000 - 2 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE08080', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+0000 - 3 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF0808080', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+0000 - 4 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF880808080', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+0000 - 5 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+0000 - 6 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},

{ inputs: [Buffer.from('0xC1BF', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+007F - 2 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xE081BF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+007F - 3 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF08081BF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+007F - 4 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF8808081BF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+007F - 5 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFC80808081BF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+007F - 6 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},

{ inputs: [Buffer.from('0xE09FBF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+07FF - 3 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF0809FBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+07FF - 4 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF880809FBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+07FF - 5 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFC8080809FBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+07FF - 6 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},

{ inputs: [Buffer.from('0xF08FBFBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+FFFF - 4 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xF8808FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+FFFF - 5 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFC80808FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+FFFF - 6 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},

{ inputs: [Buffer.from('0xF8848FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+10FFFF - 5 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xFC80848FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+10FFFF - 6 bytes' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},

// UTF-16 surrogates encoded as code points in UTF-8
{ inputs: [Buffer.from('0xEDA080', 'hex'), 0, 3, true], name: 'throws when provided with lead surrogate' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xEDB080', 'hex'), 0, 3, true], name: 'throws when provided with trail surrogate' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
},
{ inputs: [Buffer.from('0xEDA080EDB080', 'hex'), 0, 6, true], name: 'throws when provided with surrogate pair' , expectation({ error }) {
expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
}
}
];

const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
{
name: 'should return zero for empty string',
Expand Down Expand Up @@ -801,4 +966,54 @@ describe('ByteUtils', () => {
});
}
}

// Byte sequences that are not well-formed UTF-8. Each one must make
// nodeJsByteUtils.toUTF8 throw when called with fatal = true.
const bad = [
  { encoding: 'utf-8', input: [0xFF], name: 'invalid code' },
  { encoding: 'utf-8', input: [0xC0], name: 'ends early' },
  { encoding: 'utf-8', input: [0xE0], name: 'ends early 2' },
  { encoding: 'utf-8', input: [0xC0, 0x00], name: 'invalid trail' },
  { encoding: 'utf-8', input: [0xC0, 0xC0], name: 'invalid trail 2' },
  { encoding: 'utf-8', input: [0xE0, 0x00], name: 'invalid trail 3' },
  { encoding: 'utf-8', input: [0xE0, 0xC0], name: 'invalid trail 4' },
  { encoding: 'utf-8', input: [0xE0, 0x80, 0x00], name: 'invalid trail 5' },
  { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0], name: 'invalid trail 6' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10FFFF' },
  { encoding: 'utf-8', input: [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },

  // Overlong encodings
  { encoding: 'utf-8', input: [0xC0, 0x80], name: 'overlong U+0000 - 2 bytes' },
  { encoding: 'utf-8', input: [0xE0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
  { encoding: 'utf-8', input: [0xF0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },

  { encoding: 'utf-8', input: [0xC1, 0xBF], name: 'overlong U+007F - 2 bytes' },
  { encoding: 'utf-8', input: [0xE0, 0x81, 0xBF], name: 'overlong U+007F - 3 bytes' },
  { encoding: 'utf-8', input: [0xF0, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 6 bytes' },

  { encoding: 'utf-8', input: [0xE0, 0x9F, 0xBF], name: 'overlong U+07FF - 3 bytes' },
  { encoding: 'utf-8', input: [0xF0, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 6 bytes' },

  { encoding: 'utf-8', input: [0xF0, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 4 bytes' },
  { encoding: 'utf-8', input: [0xF8, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 6 bytes' },

  { encoding: 'utf-8', input: [0xF8, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 5 bytes' },
  { encoding: 'utf-8', input: [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 6 bytes' },

  // UTF-16 surrogates encoded as code points in UTF-8
  { encoding: 'utf-8', input: [0xED, 0xA0, 0x80], name: 'lead surrogate' },
  { encoding: 'utf-8', input: [0xED, 0xB0, 0x80], name: 'trail surrogate' },
  { encoding: 'utf-8', input: [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], name: 'surrogate pair' }
];

// Registered with plain `it` (not the exclusive `it.only`) so these cases run
// together with every other test in the suite instead of replacing them.
for (const test of bad) {
  it(test.name, () => {
    const bytes = Uint8Array.from(test.input);
    expect(() => nodeJsByteUtils.toUTF8(bytes, 0, test.input.length, true)).to.throw(BSONError);
  });
}
});

0 comments on commit da9de3a

Please sign in to comment.