Skip to content

Commit

Permalink
perf: faster utf8ToBytes (#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
Alan Shaw authored Sep 12, 2023
1 parent aff9d25 commit a7e62cb
Showing 1 changed file with 29 additions and 86 deletions.
115 changes: 29 additions & 86 deletions lib/byte-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -275,101 +275,44 @@ export function compare (b1, b2) {
return 0
}

// The below code is mostly taken from https://github.com/feross/buffer
// Licensed MIT. Copyright (c) Feross Aboukhadijeh
// The below code is taken from https://github.com/google/closure-library/blob/8598d87242af59aac233270742c8984e2b2bdbe0/closure/goog/crypt/crypt.js#L117-L143
// Licensed Apache-2.0.

/**
 * Encode a JavaScript string (a sequence of UTF-16 code units) as UTF-8.
 *
 * Valid surrogate pairs are combined into a single supplementary code point
 * and written as four bytes. An unpaired surrogate (a lead without a
 * following trail, or a trail without a preceding lead) has no well-formed
 * UTF-8 representation, so it is written as U+FFFD REPLACEMENT CHARACTER
 * (0xEF 0xBF 0xBD), which is also what `TextEncoder` produces.
 *
 * @param {string} str the string to encode
 * @returns {number[]} the UTF-8 bytes, one array element per byte
 */
function utf8ToBytes (str) {
  const out = []
  // Write cursor into `out`; indexed assignment avoids per-byte push() calls.
  let p = 0
  for (let i = 0; i < str.length; i++) {
    let c = str.charCodeAt(i)
    if (c < 0x80) {
      // U+0000..U+007F: one byte
      out[p++] = c
    } else if (c < 0x800) {
      // U+0080..U+07FF: two bytes
      out[p++] = (c >> 6) | 0xc0
      out[p++] = (c & 0x3f) | 0x80
    } else if ((c & 0xf800) !== 0xd800) {
      // Any other BMP code unit outside the surrogate range
      // 0xD800..0xDFFF: three bytes
      out[p++] = (c >> 12) | 0xe0
      out[p++] = ((c >> 6) & 0x3f) | 0x80
      out[p++] = (c & 0x3f) | 0x80
    } else if (
      (c & 0xfc00) === 0xd800 &&
      i + 1 < str.length &&
      (str.charCodeAt(i + 1) & 0xfc00) === 0xdc00
    ) {
      // Lead surrogate followed by a trail surrogate: combine the pair into
      // one code point (consuming the trail via ++i) and write four bytes
      c = 0x10000 + ((c & 0x03ff) << 10) + (str.charCodeAt(++i) & 0x03ff)
      out[p++] = (c >> 18) | 0xf0
      out[p++] = ((c >> 12) & 0x3f) | 0x80
      out[p++] = ((c >> 6) & 0x3f) | 0x80
      out[p++] = (c & 0x3f) | 0x80
    } else {
      // Unpaired surrogate: emit U+FFFD rather than an ill-formed sequence
      out[p++] = 0xef
      out[p++] = 0xbf
      out[p++] = 0xbd
    }
  }
  return out
}

// The below code is mostly taken from https://github.com/feross/buffer
// Licensed MIT. Copyright (c) Feross Aboukhadijeh

/**
* @param {Uint8Array} buf
* @param {number} offset
Expand Down

0 comments on commit a7e62cb

Please sign in to comment.