Skip to content

Commit

Permalink
buffer: optimize Buffer.byteLength
Browse files Browse the repository at this point in the history
Buffer.byteLength is called whenever a new string Buffer is created.
UTF8 is used as the default encoding, and base64 is also popular. Computing
the byte length for these encodings must be fast, because it takes up a
relatively significant part of Buffer instantiation.

This commit moves the Buffer.byteLength calculations for base64 and UTF8
out of C++ land and entirely into JS-land.

It also adds a benchmark for both encodings; the improvements hover
around 40-60% for UTF8 strings and 170% for base64.
  • Loading branch information
brendanashworth committed May 16, 2015
1 parent 0a48a8b commit cb04c10
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 5 deletions.
36 changes: 36 additions & 0 deletions benchmark/buffers/buffer-bytelength.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
var common = require('../common');

// Parameter lists handed to the benchmark harness; main() receives one
// value for each key through its `conf` argument.
var benchOptions = {
  encoding: ['utf8', 'base64'],
  len: [4, 16, 64, 256, 1024],
  n: [1e7]
};

var bench = common.createBenchmark(main, benchOptions);

// Sample inputs for each benchmarked encoding, keyed 0-3.
// utf8: 16 characters per sample, grouped by how many UTF8 bytes a single
//       character of that sample encodes to.
// base64: samples that differ in the amount of trailing '=' padding.
var encodings = {
  utf8: {
    0: 'hello brendan!!!', // 1 byte per character
    1: 'ΰαβγδεζηθικλμνξο', // 2 bytes per character
    2: '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes per character
    3: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes per character
  },
  base64: {
    0: 'aGVsbG8gd29ybGQh', // no padding
    1: 'YW55IGNhcm5hbCBwbGVhc3VyZS4=', // one '=' of padding
    2: 'YWJjZGVmZ2hpamtsbW5eYw==', // two '=' of padding
    3: 'YXN1cmUu' // no padding
  }
};

/**
 * Benchmark body: calls Buffer.byteLength `conf.n` times, rotating through
 * the four sample strings registered for `conf.encoding`.
 *
 * @param {Object} conf - One benchmark configuration.
 * @param {number} conf.n - Number of iterations (coerced to an int32).
 * @param {string} conf.encoding - Key into `encodings`; also passed to
 *     Buffer.byteLength as the encoding argument.
 */
function main(conf) {
  var n = conf.n | 0;
  var encoding = conf.encoding;
  var chars = encodings[encoding];

  // NOTE(review): conf.len is part of the benchmark options but is not used
  // to size the inputs here; confirm whether samples should scale with it.

  bench.start();
  for (var i = 0; i < n; i++) {
    // Rotate on the loop counter. Indexing with `n % 4` is constant for the
    // whole run, so only a single sample would ever be measured.
    Buffer.byteLength(chars[i % 4], encoding);
  }
  bench.end(n);
}
63 changes: 58 additions & 5 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -272,19 +272,66 @@ Buffer.concat = function(list, length) {
};


/**
 * Estimates the decoded size of a base64 string from its length alone;
 * the characters themselves are not validated.
 *
 * @param {string} str - Base64 text.
 * @returns {number} Decoded length in bytes.
 */
function base64ByteLength(str) {
  var total = str.length;
  var padding = 0;

  // Each of the last two positions counts as padding when it holds '='.
  if (str[total - 1] === '=')
    padding++;
  if (total > 2 && str[total - 2] === '=')
    padding++;

  // Four base64 characters carry three bytes; a partial group rounds down.
  return Math.floor(((total - padding) / 4) * 3);
}

/**
 * Computes the number of bytes `str` occupies when encoded as UTF8.
 *
 * JavaScript strings are sequences of UTF-16 code units, so a character
 * outside the Basic Multilingual Plane shows up as a surrogate pair (two
 * code units) that together encode to four UTF8 bytes. A surrogate without
 * its partner is counted as three bytes.
 *
 * @param {string} str - The string to measure.
 * @returns {number} Byte length of the UTF8 encoding.
 */
function utf8ByteLength(str) {
  // One byte per code unit is covered by str.length; the loop only adds the
  // extra bytes needed beyond that.
  var bytes = str.length;

  for (var i = 0, l = str.length; i < l; i++) {
    var code = str.charCodeAt(i);

    // U+0000..U+007F: one byte, nothing to add.
    if (code <= 0x7F)
      continue;

    // U+0080..U+07FF: two bytes.
    if (code <= 0x7FF) {
      bytes += 1;
      continue;
    }

    // High surrogate followed by a low surrogate: the pair is one code point
    // in U+10000..U+10FFFF, four bytes for two code units.
    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < l) {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        bytes += 2;
        i++; // the low surrogate has been consumed with its partner
        continue;
      }
    }

    // Any other BMP code unit (including a lone surrogate): three bytes.
    bytes += 2;
  }

  return bytes;
}


function byteLength(string, encoding) {
if (typeof(string) !== 'string')
string = String(string);
if (typeof string !== 'string')
string = '' + string;
if (typeof encoding !== 'string')
encoding = 'utf8';

if (string.length === 0)
return 0;

switch (encoding) {
switch (encoding.toLowerCase()) {
case 'ascii':
case 'binary':
// Deprecated
case 'raw':
case 'raws':
return string.length;

case 'utf8':
case 'utf-8':
return utf8ByteLength(string);

case 'ucs2':
case 'ucs-2':
case 'utf16le':
Expand All @@ -293,9 +340,15 @@ function byteLength(string, encoding) {

case 'hex':
return string.length >>> 1;
}

return binding.byteLength(string, encoding);
case 'base64':
return base64ByteLength(string);

// Previously, the C++ binding did not error on an unrecognized encoding,
// instead opting to use UTF8.
default:
return utf8ByteLength(string);
}
}

Buffer.byteLength = byteLength;
Expand Down
1 change: 1 addition & 0 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,7 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
}


// Not called by JS, left for C++ API
void ByteLength(const FunctionCallbackInfo<Value> &args) {
Environment* env = Environment::GetCurrent(args);

Expand Down
4 changes: 4 additions & 0 deletions test/parallel/test-buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,10 @@ assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
// Single-byte encodings report one byte per character of the input.
['ascii', 'binary'].forEach(function(enc) {
  assert.equal(12, Buffer.byteLength('Il était tué', enc));
});

// An unrecognized encoding name is measured as UTF8.
assert.equal(11, Buffer.byteLength('hello world', 'abc'));
assert.equal(10, Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'));

// Slicing an empty range yields a zero-length buffer.
assert.equal(0, Buffer('hello').slice(0, 0).length);

Expand Down

0 comments on commit cb04c10

Please sign in to comment.