From 2dde54135b5bb157b93382722c28756486207459 Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Mon, 10 Aug 2015 22:54:44 -0400 Subject: [PATCH 1/7] add failing toString tests --- test/to-string.js | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/test/to-string.js b/test/to-string.js index 172c9e90..662f8c9e 100644 --- a/test/to-string.js +++ b/test/to-string.js @@ -115,3 +115,87 @@ test('utf8 to binary', function (t) { /* jshint +W100 */ t.end() }) + +test('utf8 replacement chars (1 byte sequence)', function (t) { + t.equal( + new B([ 0x80 ]).toString(), + '\uFFFD' + ) + t.equal( + new B([ 0x7F ]).toString(), + '\u007F' + ) + t.end() +}) + +test('utf8 replacement chars (2 byte sequences)', function (t) { + t.equal( + new B([ 0xC7 ]).toString(), + '\uFFFD' + ) + t.equal( + new B([ 0xC7, 0xB1 ]).toString(), + '\u01F1' + ) + t.equal( + new B([ 0xC0, 0xB1 ]).toString(), + '\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xC1, 0xB1 ]).toString(), + '\uFFFD\uFFFD' + ) + t.end() +}) + +test('utf8 replacement chars (3 byte sequences)', function (t) { + t.equal( + new B([ 0xE0 ]).toString(), + '\uFFFD' + ) + t.equal( + new B([ 0xE0, 0xAC ]).toString(), + '\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xE0, 0xAC, 0xB9 ]).toString(), + '\u0B39' + ) + t.end() +}) + +test('utf8 replacement chars (4 byte sequences)', function (t) { + t.equal( + new B([ 0xF4 ]).toString(), + '\uFFFD' + ) + t.equal( + new B([ 0xF4, 0x8F ]).toString(), + '\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xF4, 0x8F, 0x80 ]).toString(), + '\uFFFD\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xF4, 0x8F, 0x80, 0x84 ]).toString(), + '\uDBFC\uDC04' + ) + t.equal( + new B([ 0xFF ]).toString(), + '\uFFFD' + ) + t.equal( + new B([ 0xFF, 0x8F, 0x80, 0x84 ]).toString(), + '\uFFFD\uFFFD\uFFFD\uFFFD' + ) + t.end() +}) + +test('utf8 replacement chars on 256 random bytes', function (t) { + t.equal( + new B([ 152, 130, 206, 23, 243, 238, 197, 44, 27, 86, 208, 36, 163, 184, 164, 21, 94, 242, 178, 46, 25, 26, 253, 178, 72, 147, 207, 112, 236, 68, 179, 190, 29, 83, 239, 147, 125, 55, 143, 19, 157, 68, 157, 58, 212, 224, 150, 39, 128, 24, 94, 225, 120, 121, 75, 192, 112, 19, 184, 142, 203, 36, 43, 85, 26, 147, 227, 139, 242, 186, 57, 78, 11, 102, 136, 117, 180, 210, 241, 92, 3, 215, 54, 167, 249, 1, 44, 225, 146, 86, 2, 42, 68, 21, 47, 238, 204, 153, 216, 252, 183, 66, 222, 255, 15, 202, 16, 51, 134, 1, 17, 19, 209, 76, 238, 38, 76, 19, 7, 103, 249, 5, 107, 137, 64, 62, 170, 57, 16, 85, 179, 193, 97, 86, 166, 196, 36, 148, 138, 193, 210, 69, 187, 38, 242, 97, 195, 219, 252, 244, 38, 1, 197, 18, 31, 246, 53, 47, 134, 52, 105, 72, 43, 239, 128, 203, 73, 93, 199, 75, 222, 220, 166, 34, 63, 236, 11, 212, 76, 243, 171, 110, 78, 39, 205, 204, 6, 177, 233, 212, 243, 0, 33, 41, 122, 118, 92, 252, 0, 157, 108, 120, 70, 137, 100, 223, 243, 171, 232, 66, 126, 111, 142, 33, 3, 39, 117, 27, 107, 54, 1, 217, 227, 132, 13, 166, 3, 73, 53, 127, 225, 236, 134, 219, 98, 214, 125, 148, 24, 64, 142, 111, 231, 194, 42, 150, 185, 10, 182, 163, 244, 19, 4, 59, 135, 16 ]).toString(), + '\uFFFD\uFFFD\uFFFD\u0017\uFFFD\uFFFD\uFFFD\u002C\u001B\u0056\uFFFD\u0024\uFFFD\uFFFD\uFFFD\u0015\u005E\uFFFD\uFFFD\u002E\u0019\u001A\uFFFD\uFFFD\u0048\uFFFD\uFFFD\u0070\uFFFD\u0044\uFFFD\uFFFD\u001D\u0053\uFFFD\uFFFD\u007D\u0037\uFFFD\u0013\uFFFD\u0044\uFFFD\u003A\uFFFD\uFFFD\uFFFD\u0027\uFFFD\u0018\u005E\uFFFD\u0078\u0079\u004B\uFFFD\u0070\u0013\uFFFD\uFFFD\uFFFD\u0024\u002B\u0055\u001A\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\u0039\u004E\u000B\u0066\uFFFD\u0075\uFFFD\uFFFD\uFFFD\u005C\u0003\uFFFD\u0036\uFFFD\uFFFD\u0001\u002C\uFFFD\uFFFD\u0056\u0002\u002A\u0044\u0015\u002F\uFFFD\u0319\uFFFD\uFFFD\uFFFD\u0042\uFFFD\uFFFD\u000F\uFFFD\u0010\u0033\uFFFD\u0001\u0011\u0013\uFFFD\u004C\uFFFD\u0026\u004C\u0013\u0007\u0067\uFFFD\u0005\u006B\uFFFD\u0040\u003E\uFFFD\u0039\u0010\u0055\uFFFD\uFFFD\u0061\u0056\uFFFD\uFFFD\u0024\uFFFD\uFFFD\uFFFD\uFFFD\u0045\uFFFD\u0026\uFFFD\u0061\uFFFD\uFFFD\uFFFD\uFFFD\u0026\u0001\uFFFD\u0012\u001F\uFFFD\u0035\u002F\uFFFD\u0034\u0069\u0048\u002B\uFFFD\uFFFD\uFFFD\u0049\u005D\uFFFD\u004B\uFFFD\u0726\u0022\u003F\uFFFD\u000B\uFFFD\u004C\uFFFD\uFFFD\u006E\u004E\u0027\uFFFD\uFFFD\u0006\uFFFD\uFFFD\uFFFD\uFFFD\u0000\u0021\u0029\u007A\u0076\u005C\uFFFD\u0000\uFFFD\u006C\u0078\u0046\uFFFD\u0064\uFFFD\uFFFD\uFFFD\uFFFD\u0042\u007E\u006F\uFFFD\u0021\u0003\u0027\u0075\u001B\u006B\u0036\u0001\uFFFD\uFFFD\uFFFD\u000D\uFFFD\u0003\u0049\u0035\u007F\uFFFD\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u007D\uFFFD\u0018\u0040\uFFFD\u006F\uFFFD\uFFFD\u002A\uFFFD\uFFFD\u000A\uFFFD\uFFFD\uFFFD\u0013\u0004\u003B\uFFFD\u0010' + ) + t.end() +}) From 56223c5ce6276bcee8dac2402dce7cd3b5ef4ea2 Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Mon, 10 Aug 2015 22:55:17 -0400 Subject: [PATCH 2/7] add readUtf8 benchmark --- perf/readUtf8.js | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 perf/readUtf8.js diff --git a/perf/readUtf8.js b/perf/readUtf8.js new file mode 100644 index 00000000..a0109ff8 --- /dev/null +++ b/perf/readUtf8.js @@ -0,0 +1,19 @@ +var BrowserBuffer = require('../').Buffer // (this module) +var util = require('./util') +var suite = util.suite() + +// 256 random bytes +var array = [ 152, 130, 206, 23, 243, 238, 197, 44, 27, 86, 208, 36, 163, 184, 164, 21, 94, 242, 178, 46, 25, 26, 253, 178, 72, 147, 207, 112, 236, 68, 179, 190, 29, 83, 239, 147, 125, 55, 143, 19, 157, 68, 157, 58, 212, 224, 150, 39, 128, 24, 94, 225, 120, 121, 75, 192, 112, 19, 184, 142, 203, 36, 43, 85, 26, 147, 227, 139, 242, 186, 57, 78, 11, 102, 136, 117, 180, 210, 241, 92, 3, 215, 54, 167, 249, 1, 44, 225, 146, 86, 2, 42, 68, 21, 47, 238, 204, 153, 216, 252, 183, 66, 222, 255, 15, 202, 16, 51, 134, 1, 17, 19, 209, 76, 238, 38, 76, 19, 7, 103, 249, 5, 107, 137, 64, 62, 170, 57, 16, 85, 179, 193, 97, 86, 166, 196, 36, 148, 138, 193, 210, 69, 187, 38, 242, 97, 195, 219, 252, 244, 38, 1, 197, 18, 31, 246, 53, 47, 134, 52, 105, 72, 43, 239, 128, 203, 73, 93, 199, 75, 222, 220, 166, 34, 63, 236, 11, 212, 76, 243, 171, 110, 78, 39, 205, 204, 6, 177, 233, 212, 243, 0, 33, 41, 122, 118, 92, 252, 0, 157, 108, 120, 70, 137, 100, 223, 243, 171, 232, 66, 126, 111, 142, 33, 3, 39, 117, 27, 107, 54, 1, 217, 227, 132, 13, 166, 3, 73, 53, 127, 225, 236, 134, 219, 98, 214, 125, 148, 24, 64, 142, 111, 231, 194, 42, 150, 185, 10, 182, 163, 244, 19, 4, 59, 135, 16 ] + +var browserBuffer = new BrowserBuffer(array) +var nodeBuffer = new Buffer(array) + +suite + .add('BrowserBuffer#readUtf8', function () { + browserBuffer.toString() + }) + +if (!process.browser) suite + .add('NodeBuffer#readUtf8', function () { + nodeBuffer.toString() + }) From 0b3bc9aa8da69c96770912afbd316abc2893f2d9 Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Mon, 10 Aug 2015 22:56:19 -0400 Subject: [PATCH 3/7] do utf8 -> unicode -> utf16 the way node does --- index.js | 83 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/index.js b/index.js index 6c6be996..b9a511b5 100644 --- a/index.js +++ b/index.js @@ -623,20 +623,77 @@ function base64Slice (buf, start, end) { } function utf8Slice (buf, start, end) { - var res = '' - var tmp = '' end = Math.min(buf.length, end) - - for (var i = start; i < end; i++) { - if (buf[i] <= 0x7F) { - res += decodeUtf8Char(tmp) + String.fromCharCode(buf[i]) - tmp = '' + var firstByte, secondByte, thirdByte, fourthByte, bytesPerSequence, tempCodePoint, codePoint, res = [] + var i = start + + for (; i < end; i += bytesPerSequence) { + firstByte = buf[i] + codePoint = 0xFFFD + + if (firstByte > 0xEF) { + bytesPerSequence = 4 + } else if (firstByte > 0xDF) { + bytesPerSequence = 3 + } else if (firstByte > 0xBF) { + bytesPerSequence = 2 } else { - tmp += '%' + buf[i].toString(16) + bytesPerSequence = 1 } + + if (i + bytesPerSequence <= end) { + switch (bytesPerSequence) { + case 1: + if (firstByte < 0x80) { + codePoint = firstByte + } + break + case 2: + secondByte = buf[i + 1] + if ((secondByte & 0xC0) === 0x80) { + tempCodePoint = (firstByte & 0x1F) << 0x6 | (secondByte & 0x3F) + if (tempCodePoint > 0x7F) { + codePoint = tempCodePoint + } + } + break + case 3: + secondByte = buf[i + 1] + thirdByte = buf[i + 2] + if ((secondByte & 0xC0) === 0x80 && (thirdByte & 0xC0) === 0x80) { + tempCodePoint = (firstByte & 0xF) << 0xC | (secondByte & 0x3F) << 0x6 | (thirdByte & 0x3F) + if (tempCodePoint > 0x7FF) { + codePoint = tempCodePoint + } + } + break + case 4: + secondByte = buf[i + 1] + thirdByte = buf[i + 2] + fourthByte = buf[i + 3] + if ((secondByte & 0xC0) === 0x80 && (thirdByte & 0xC0) === 0x80 && (fourthByte & 0xC0) === 0x80) { + tempCodePoint = (firstByte & 0xF) << 0x12 | (secondByte & 0x3F) << 0xC | (thirdByte & 0x3F) << 0x6 | (fourthByte & 0x3F) + if (tempCodePoint > 0xFFFF && tempCodePoint < 0x110000) { + codePoint = tempCodePoint + } + } + } + } + + if (codePoint === 0xFFFD) { + // we generated an invalid codePoint so make sure to only advance by 1 byte + bytesPerSequence = 1 + } else if (codePoint > 0xFFFF) { + // encode to utf16 (surrogate pair dance) + codePoint -= 0x10000 + res.push(codePoint >>> 10 & 0x3FF | 0xD800) + codePoint = 0xDC00 | codePoint & 0x3FF + } + + res.push(codePoint) } - return res + decodeUtf8Char(tmp) + return String.fromCharCode.apply(String, res) } function asciiSlice (buf, start, end) { @@ -1453,11 +1510,3 @@ function blitBuffer (src, dst, offset, length) { } return i } - -function decodeUtf8Char (str) { - try { - return decodeURIComponent(str) - } catch (err) { - return String.fromCharCode(0xFFFD) // UTF 8 invalid char - } -} From 4af68f5b34e5e78271c7e24fcd1ecbca3690b69d Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Mon, 10 Aug 2015 23:36:15 -0400 Subject: [PATCH 4/7] unicode top should be 0x10FFFF --- index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.js b/index.js index b9a511b5..cdd3092a 100644 --- a/index.js +++ b/index.js @@ -1458,7 +1458,7 @@ function utf8ToBytes (string, units) { codePoint >> 0x6 & 0x3F | 0x80, codePoint & 0x3F | 0x80 ) - } else if (codePoint < 0x200000) { + } else if (codePoint < 0x110000) { if ((units -= 4) < 0) break bytes.push( codePoint >> 0x12 | 0xF0, From 72fc9d6834ecd334505d6c73aef160b4837320c9 Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Mon, 10 Aug 2015 23:45:07 -0400 Subject: [PATCH 5/7] standardize --- index.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index cdd3092a..623e7cfc 100644 --- a/index.js +++ b/index.js @@ -624,7 +624,14 @@ function base64Slice (buf, start, end) { function utf8Slice (buf, start, end) { end = Math.min(buf.length, end) - var firstByte, secondByte, thirdByte, fourthByte, bytesPerSequence, tempCodePoint, codePoint, res = [] + var firstByte + var secondByte + var thirdByte + var fourthByte + var bytesPerSequence + var tempCodePoint + var codePoint + var res = [] var i = start for (; i < end; i += bytesPerSequence) { From 095ac43d808a3f8a063e2af685a78fb381db2169 Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Tue, 11 Aug 2015 00:43:36 -0400 Subject: [PATCH 6/7] add failing test for missing replacement characters in the surrogate pair range --- test/to-string.js | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/to-string.js b/test/to-string.js index 662f8c9e..8fd9db1f 100644 --- a/test/to-string.js +++ b/test/to-string.js @@ -199,3 +199,27 @@ test('utf8 replacement chars on 256 random bytes', function (t) { ) t.end() }) + +test('utf8 replacement chars for anything in the surrogate pair range', function (t) { + t.equal( + new B([ 0xED, 0x9F, 0xBF ]).toString(), + '\uD7FF' + ) + t.equal( + new B([ 0xED, 0xA0, 0x80 ]).toString(), + '\uFFFD\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xED, 0xBE, 0x8B ]).toString(), + '\uFFFD\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xED, 0xBF, 0xBF ]).toString(), + '\uFFFD\uFFFD\uFFFD' + ) + t.equal( + new B([ 0xEE, 0x80, 0x80 ]).toString(), + '\uE000' + ) + t.end() +}) From 3e1bdd0dae6d387c7ec3ab6d800ede055814d20f Mon Sep 17 00:00:00 2001 From: Jesse Tane Date: Tue, 11 Aug 2015 00:44:05 -0400 Subject: [PATCH 7/7] replace unicode in the surrogate pair range the way node@3 does --- index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.js b/index.js index 623e7cfc..b5f4588d 100644 --- a/index.js +++ b/index.js @@ -669,7 +669,7 @@ function utf8Slice (buf, start, end) { thirdByte = buf[i + 2] if ((secondByte & 0xC0) === 0x80 && (thirdByte & 0xC0) === 0x80) { tempCodePoint = (firstByte & 0xF) << 0xC | (secondByte & 0x3F) << 0x6 | (thirdByte & 0x3F) - if (tempCodePoint > 0x7FF) { + if (tempCodePoint > 0x7FF && (tempCodePoint < 0xD800 || tempCodePoint > 0xDFFF)) { codePoint = tempCodePoint } }