From da9de3a1ded8b5e2b6908ca8da628e668a84ef65 Mon Sep 17 00:00:00 2001
From: Aditi Khare <aditi.khare@mongodb.com>
Date: Tue, 23 Apr 2024 17:08:51 -0400
Subject: [PATCH] temp try to detect overlong encoding w/o textDecoder

---
 .../require_vendor.mjs                        |   1 +
 src/error.ts                                  |  10 +
 src/test.ts                                   |   9 +
 src/utils/node_byte_utils.ts                  |  24 +-
 src/utils/web_byte_utils.ts                   |   4 +-
 src/validate_utf8.ts                          |  27 ++-
 test/node/byte_utils.test.ts                  | 225 +++++++++++++++++-
 7 files changed, 289 insertions(+), 11 deletions(-)
 create mode 100644 src/test.ts

diff --git a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
index 7d4fa4e91..bdfe9c111 100644
--- a/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
+++ b/etc/rollup/rollup-plugin-require-vendor/require_vendor.mjs
@@ -14,6 +14,7 @@ export class RequireVendor {
    * @returns {{ code: string; map: import('magic-string').SourceMap }}
    */
   transform(code, id) {
+    // TODO(NODE-4930)
     if (!id.includes('web_byte_utils')) {
       return;
     }
diff --git a/src/error.ts b/src/error.ts
index ef5184a4a..7dd101c57 100644
--- a/src/error.ts
+++ b/src/error.ts
@@ -103,3 +103,13 @@ export class BSONOffsetError extends BSONError {
     this.offset = offset;
   }
 }
+
+/** Error reported when a BSON string's bytes are not valid UTF-8. */
+export class BSONUTF8Error extends BSONError {
+  constructor(message: string, options?: { cause?: unknown }) {
+    super(message, options);
+  }
+  public get name(): 'BSONUTF8Error' {
+    return 'BSONUTF8Error';
+  }
+}
diff --git a/src/test.ts b/src/test.ts
new file mode 100644
index 000000000..2ef044cfa
--- /dev/null
+++ b/src/test.ts
@@ -0,0 +1,9 @@
+/** Decodes one 2- to 4-byte UTF-8 sequence (lead byte first) to its code point. */
+function parseUtf8Bits(arr: number[]): number {
+  // The lead byte of an n-byte sequence carries 7 - n payload bits.
+  let codePoint = arr[0] & (0xff >> (arr.length + 1));
+  for (let i = 1; i < arr.length; i++) {
+    codePoint = (codePoint << 6) | (arr[i] & 0x3f);
+  }
+  return codePoint;
+}
\ No newline at end of file
diff --git a/src/utils/node_byte_utils.ts b/src/utils/node_byte_utils.ts
index d6a641a47..8603b45e1 100644
--- a/src/utils/node_byte_utils.ts
+++ b/src/utils/node_byte_utils.ts
@@ -1,4 +1,4 @@
-import { BSONError } from '../error';
+import { BSONError, BSONUTF8Error } from '../error';
 import { validateUtf8 } from '../validate_utf8';
 import { tryReadBasicLatin, tryWriteBasicLatin } from './latin';
 
@@ -27,6 +27,28 @@ type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
 declare const Buffer: NodeJsBufferConstructor;
 declare const require: (mod: 'crypto') => { randomBytes: (byteLength: number) => Uint8Array };
 
+type TextDecoder = {
+  readonly encoding: string;
+  readonly fatal: boolean;
+  readonly ignoreBOM: boolean;
+  decode(input?: Uint8Array): string;
+};
+type TextDecoderConstructor = {
+  new (label: 'utf8', options: { fatal: boolean; ignoreBOM?: boolean }): TextDecoder;
+};
+
+type TextEncoder = {
+  readonly encoding: string;
+  encode(input?: string): Uint8Array;
+};
+type TextEncoderConstructor = {
+  new (): TextEncoder;
+};
+
+// Ambient declarations: TextDecoder and TextEncoder are expected to exist as globals at runtime
+declare const TextDecoder: TextDecoderConstructor;
+declare const TextEncoder: TextEncoderConstructor;
+
 /** @internal */
 export function nodejsMathRandomBytes(byteLength: number) {
   return nodeJsByteUtils.fromNumberArray(
diff --git a/src/utils/web_byte_utils.ts b/src/utils/web_byte_utils.ts
index 77a1f0f74..e7d39b62b 100644
--- a/src/utils/web_byte_utils.ts
+++ b/src/utils/web_byte_utils.ts
@@ -1,4 +1,4 @@
-import { BSONError } from '../error';
+import { BSONError, BSONUTF8Error } from '../error';
 import { tryReadBasicLatin } from './latin';
 
 type TextDecoder = {
@@ -183,7 +183,7 @@ export const webByteUtils = {
       try {
         return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
       } catch (cause) {
-        throw new BSONError('Invalid UTF-8 string in BSON document', { cause });
+        throw new BSONUTF8Error('Invalid UTF-8 string in BSON document', { cause });
       }
     }
     return new TextDecoder('utf8', { fatal }).decode(uint8array.slice(start, end));
diff --git a/src/validate_utf8.ts b/src/validate_utf8.ts
index e1da934c6..ba1acb7e8 100644
--- a/src/validate_utf8.ts
+++ b/src/validate_utf8.ts
@@ -1,3 +1,5 @@
+import { NumberUtils } from "./utils/number_utils";
+
 const FIRST_BIT = 0x80;
 const FIRST_TWO_BITS = 0xc0;
 const FIRST_THREE_BITS = 0xe0;
@@ -9,6 +11,12 @@ const THREE_BIT_CHAR = 0xe0;
 const FOUR_BIT_CHAR = 0xf0;
 const CONTINUING_CHAR = 0x80;
 
+// Largest code point encodable in one, two, and three UTF-8 bytes; a longer
+// sequence that decodes to no more than the shorter length's max is overlong.
+const ONE_BYTE_MAX = 0x7f;
+const TWO_BYTE_MAX = 0x7ff;
+const THREE_BYTE_MAX = 0xffff;
+
 /**
  * Determines if the passed in bytes are valid utf8
  * @param bytes - An array of 8-bit bytes. Must be indexable and have length property
@@ -30,12 +38,15 @@ export function validateUtf8(
         return false;
       }
       continuation -= 1;
-    } else if (byte & FIRST_BIT) {
+    } else if (byte & FIRST_BIT &&
+      parseUtf8Bytes([byte, bytes[i+1]]) > ONE_BYTE_MAX) {
       if ((byte & FIRST_THREE_BITS) === TWO_BIT_CHAR) {
         continuation = 1;
-      } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR) {
+      } else if ((byte & FIRST_FOUR_BITS) === THREE_BIT_CHAR &&
+      parseUtf8Bytes([byte, bytes[i+1], bytes[i+2]]) > TWO_BYTE_MAX) {
         continuation = 2;
-      } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR) {
+      } else if ((byte & FIRST_FIVE_BITS) === FOUR_BIT_CHAR &&
+      parseUtf8Bytes([byte, bytes[i+1], bytes[i+2], bytes[i+3]]) > THREE_BYTE_MAX) {
         continuation = 3;
       } else {
         return false;
@@ -45,3 +56,13 @@ export function validateUtf8(
 
   return !continuation;
 }
+
+function parseUtf8Bytes(arr: number[]): number {
+  arr[0] >>= (arr.length - 1);
+  for (let i = 1; i < arr.length; i++) {
+    arr[i] >>= 2;
+    arr[i] <<= i*8;
+    arr[0] = arr[0] | arr[i]
+  }
+  return arr[0];
+}
\ No newline at end of file
diff --git a/test/node/byte_utils.test.ts b/test/node/byte_utils.test.ts
index fa6d7f893..0a3e75ae0 100644
--- a/test/node/byte_utils.test.ts
+++ b/test/node/byte_utils.test.ts
@@ -8,6 +8,7 @@ import { webByteUtils } from '../../src/utils/web_byte_utils';
 import * as sinon from 'sinon';
 import { loadCJSModuleBSON, loadReactNativeCJSModuleBSON, loadESModuleBSON } from '../load_bson';
 import * as crypto from 'node:crypto';
+import { BSONError, BSONUTF8Error } from '../../src/error';
 
 type ByteUtilTest<K extends keyof ByteUtils> = {
   name: string;
@@ -399,6 +400,8 @@ const fromUTF8Tests: ByteUtilTest<'encodeUTF8Into'>[] = [
     }
   }
 ];
+
+
 const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
   {
     name: 'should create utf8 string from buffer input',
@@ -416,6 +419,14 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
       expect(output).to.be.a('string').with.lengthOf(0);
     }
   },
+  {
+    name: 'should insert replacement character fatal is false and string is invalid',
+    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
+    expectation({ error, output }) {
+      expect(error).to.not.exist;
+      expect(output).to.equal('abc\uFFFD');
+    }
+  },
   {
     name: 'should throw an error if fatal is set and string is invalid',
     inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, true],
@@ -424,14 +435,168 @@ const toUTF8Tests: ByteUtilTest<'toUTF8'>[] = [
     }
   },
   {
-    name: 'should insert replacement character fatal is false and string is invalid',
-    inputs: [Buffer.from('616263f09fa4', 'hex'), 0, 7, false],
-    expectation({ error, output }) {
-      expect(error).to.not.exist;
-      expect(output).to.equal('abc\uFFFD');
+    name: 'throw an error if fatal is set and string contains overlong encoding',
+    inputs: [Buffer.from('11000000025f0005000000f08282ac0000', 'hex'), 0, 18, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'throw an error if fatal is set and string contains invalid bytes',
+    inputs: [Buffer.from('abcff', 'hex'), 0, 2, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  {
+    name: 'throw an error if fatal is set and string contains an unexpected continuation byte',
+    inputs: [Buffer.from('7F80', 'hex'), 0, 2, true],
+    expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  }, 
+  // Hex inputs carry no '0x' prefix: Buffer.from(str, 'hex') stops decoding at the first non-hex character.
+  { inputs: [Buffer.from('FF', 'hex'), 0, 1, true], name: 'throws when provided with invalid code', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('C0', 'hex'), 0, 1, true], name: 'throws when provided with ends early', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E0', 'hex'), 0, 1, true], name: 'throws when provided with ends early 2', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('C000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('C0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 2', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E000', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 3', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E0C0', 'hex'), 0, 2, true], name: 'throws when provided with invalid trail 4', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E08000', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 5', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E080C0', 'hex'), 0, 3, true], name: 'throws when provided with invalid trail 6', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with > 0x10FFFF', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FE8080808080', 'hex'), 0, 6, true], name: 'throws when provided with obsolete lead byte', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+
+  // Overlong encodings
+  { inputs: [Buffer.from('C080', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+0000 - 2 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E08080', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+0000 - 3 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F0808080', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+0000 - 4 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F880808080', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+0000 - 5 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FC8080808080', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+0000 - 6 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+
+  { inputs: [Buffer.from('C1BF', 'hex'), 0, 2, true], name: 'throws when provided with overlong U+007F - 2 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('E081BF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+007F - 3 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F08081BF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+007F - 4 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F8808081BF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+007F - 5 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FC80808081BF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+007F - 6 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+
+  { inputs: [Buffer.from('E09FBF', 'hex'), 0, 3, true], name: 'throws when provided with overlong U+07FF - 3 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F0809FBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+07FF - 4 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F880809FBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+07FF - 5 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FC8080809FBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+07FF - 6 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+
+  { inputs: [Buffer.from('F08FBFBF', 'hex'), 0, 4, true], name: 'throws when provided with overlong U+FFFF - 4 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F8808FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+FFFF - 5 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FC80808FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+FFFF - 6 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('F8848FBFBF', 'hex'), 0, 5, true], name: 'throws when provided with overlong U+10FFFF - 5 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('FC80848FBFBF', 'hex'), 0, 6, true], name: 'throws when provided with overlong U+10FFFF - 6 bytes', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+
+  // UTF-16 surrogates encoded as code points in UTF-8
+  { inputs: [Buffer.from('EDA080', 'hex'), 0, 3, true], name: 'throws when provided with lead surrogate', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('EDB080', 'hex'), 0, 3, true], name: 'throws when provided with trail surrogate', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
+    }
+  },
+  { inputs: [Buffer.from('EDA080EDB080', 'hex'), 0, 6, true], name: 'throws when provided with surrogate pair', expectation({ error }) {
+      expect(error).to.match(/Invalid UTF-8 string in BSON document/i);
     }
   }
 ];
+
 const utf8ByteLengthTests: ByteUtilTest<'utf8ByteLength'>[] = [
   {
     name: 'should return zero for empty string',
@@ -801,4 +966,54 @@ describe('ByteUtils', () => {
       });
     }
   }
+
+  const bad = [
+    { encoding: 'utf-8', input: [0xFF], name: 'invalid code' },
+    { encoding: 'utf-8', input: [0xC0], name: 'ends early' },
+    { encoding: 'utf-8', input: [0xE0], name: 'ends early 2' },
+    { encoding: 'utf-8', input: [0xC0, 0x00], name: 'invalid trail' },
+    { encoding: 'utf-8', input: [0xC0, 0xC0], name: 'invalid trail 2' },
+    { encoding: 'utf-8', input: [0xE0, 0x00], name: 'invalid trail 3' },
+    { encoding: 'utf-8', input: [0xE0, 0xC0], name: 'invalid trail 4' },
+    { encoding: 'utf-8', input: [0xE0, 0x80, 0x00], name: 'invalid trail 5' },
+    { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0], name: 'invalid trail 6' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: '> 0x10FFFF' },
+    { encoding: 'utf-8', input: [0xFE, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'obsolete lead byte' },
+
+    // Overlong encodings
+    { encoding: 'utf-8', input: [0xC0, 0x80], name: 'overlong U+0000 - 2 bytes' },
+    { encoding: 'utf-8', input: [0xE0, 0x80, 0x80], name: 'overlong U+0000 - 3 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], name: 'overlong U+0000 - 6 bytes' },
+
+    { encoding: 'utf-8', input: [0xC1, 0xBF], name: 'overlong U+007F - 2 bytes' },
+    { encoding: 'utf-8', input: [0xE0, 0x81, 0xBF], name: 'overlong U+007F - 3 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x81, 0xBF], name: 'overlong U+007F - 6 bytes' },
+
+    { encoding: 'utf-8', input: [0xE0, 0x9F, 0xBF], name: 'overlong U+07FF - 3 bytes' },
+    { encoding: 'utf-8', input: [0xF0, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x9F, 0xBF], name: 'overlong U+07FF - 6 bytes' },
+
+    { encoding: 'utf-8', input: [0xF0, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 4 bytes' },
+    { encoding: 'utf-8', input: [0xF8, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x8F, 0xBF, 0xBF], name: 'overlong U+FFFF - 6 bytes' },
+
+    { encoding: 'utf-8', input: [0xF8, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 5 bytes' },
+    { encoding: 'utf-8', input: [0xFC, 0x80, 0x84, 0x8F, 0xBF, 0xBF], name: 'overlong U+10FFFF - 6 bytes' },
+
+    // UTF-16 surrogates encoded as code points in UTF-8
+    { encoding: 'utf-8', input: [0xED, 0xA0, 0x80], name: 'lead surrogate' },
+    { encoding: 'utf-8', input: [0xED, 0xB0, 0x80], name: 'trail surrogate' },
+    { encoding: 'utf-8', input: [0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80], name: 'surrogate pair' },
+  ];
+  // Every malformed sequence above must make a fatal decode throw; no `.only`, so the rest of the suite still runs.
+  for (const test of bad) {
+    it(`${test.name}`, () => {
+      expect(() => nodeJsByteUtils.toUTF8(Uint8Array.from(test.input), 0, test.input.length, true)).to.throw(BSONError);
+    });
+  }
 });