Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

buffer: optimize Buffer.byteLength #1713

Closed
55 changes: 55 additions & 0 deletions benchmark/buffers/buffer-bytelength.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
var common = require('../common');

var bench = common.createBenchmark(main, {
encoding: ['utf8', 'base64'],
len: [1, 2, 4, 16, 64, 256], // x16
n: [5e6]
});

// 16 chars each
var chars = [
'hello brendan!!!', // 1 byte
'ΰαβγδεζηθικλμνξο', // 2 bytes
'挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes
'𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes
];

function main(conf) {
var n = conf.n | 0;
var len = conf.len | 0;
var encoding = conf.encoding;

var strings = [];
for (var string of chars) {
// Strings must be built differently, depending on encoding
var data = buildString(string, len);
if (encoding === 'utf8') {
strings.push(data);
} else if (encoding === 'base64') {
// Base64 strings will be much longer than their UTF8 counterparts
strings.push(new Buffer(data, 'utf8').toString('base64'));
}
}

// Check the result to ensure it is *properly* optimized
var results = strings.map(function(val) {
return Buffer.byteLength(val, encoding);
});

bench.start();
for (var i = 0; i < n; i++) {
var index = n % strings.length;
// Go!
var r = Buffer.byteLength(strings[index], encoding);

if (r !== results[index])
throw Error('incorrect return value');
}
bench.end(n);
}

function buildString(str, times) {
if (times == 1) return str;

return str + buildString(str, times - 1);
}
68 changes: 51 additions & 17 deletions lib/buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -272,30 +272,64 @@ Buffer.concat = function(list, length) {
};


function base64ByteLength(str, len) {
var bytes = len;

// Handle padding
if (str[len - 1] === '=')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comparing numbers is faster than strings: str.charCodeAt(len - 1) === 0x3D

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Better perf. Learn something new every day 😄

bytes--;
if (len > 2 && str[len - 2] === '=')
bytes--;

// Base64 ratio: 3/4
return (bytes * 3) >>> 2;
}


function byteLength(string, encoding) {
if (typeof(string) !== 'string')
string = String(string);
if (typeof string !== 'string')
string = '' + string;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a change in functionality? ByteLength() would throw if a non string was passed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, String(val) also does coercion. We seem to prefer '' + val though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bnoordhuis do we care if node aborts if someone does:

process.binding('buffer').byteLengthUtf8(42);

Or do we want to continue throwing?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. Fooling around with process.binding() voids the warranty.


if (string.length === 0)
var len = string.length;
if (len === 0)
return 0;

switch (encoding) {
case 'ascii':
case 'binary':
case 'raw':
return string.length;
// Use a for loop to avoid recursion
var loweredCase = false;
for (;;) {
switch (encoding) {
case 'ascii':
case 'binary':
// Deprecated
case 'raw':
case 'raws':
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was supported by the C++ version, but isn't currently supported by Buffer.isEncoding.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW, I think 'raw' and 'raws' have been deprecated since v0.2 or v0.3. I've never seen them being used in the wild either.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can follow this PR up with a deprecation commit (to keep it semver-patch) - technically speaking only raws was deprecated, not raw, for this function:

> Buffer.byteLength('abc', 'raw')
3
> Buffer.byteLength('abc', 'raws')
'raws' encoding has been renamed to 'binary'. Please update your code.
3

return len;

case 'ucs2':
case 'ucs-2':
case 'utf16le':
case 'utf-16le':
return string.length * 2;
case 'utf8':
case 'utf-8':
return binding.byteLengthUTF8(string);

case 'hex':
return string.length >>> 1;
}
case 'ucs2':
case 'ucs-2':
case 'utf16le':
case 'utf-16le':
return len * 2;

case 'hex':
return len >>> 1;

return binding.byteLength(string, encoding);
case 'base64':
return base64ByteLength(string, len);

default:
// The C++ binding defaulted to UTF8, we should too.
if (loweredCase)
return binding.byteLengthUTF8(string);

encoding = ('' + encoding).toLowerCase();
loweredCase = true;
}
}
}

Buffer.byteLength = byteLength;
Expand Down
12 changes: 6 additions & 6 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -541,17 +541,17 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
}


void ByteLength(const FunctionCallbackInfo<Value> &args) {
void ByteLengthUTF8(const FunctionCallbackInfo<Value> &args) {
Environment* env = Environment::GetCurrent(args);

if (!args[0]->IsString())
return env->ThrowTypeError("Argument must be a string");

Local<String> s = args[0]->ToString(env->isolate());
enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8);
// Fast case: avoid StringBytes on UTF8 string. Jump to v8.
Local<String> str = args[0]->ToString(env->isolate());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's guaranteed to be a string so args[0].As<String>(); should be used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can I drop the Environment::GetCurrent above then, or must that be kept?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea the string check and env can be dropped, this is always called with a string from js.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would leave in a CHECK(args[0]->IsString()) to catch bugs.

int len = str->Utf8Length();

uint32_t size = StringBytes::Size(env->isolate(), s, e);
args.GetReturnValue().Set(size);
args.GetReturnValue().Set(len);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can reduce this function to just:

void ByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
  CHECK(args[0]->IsString());
  args.GetReturnValue().Set(args[0].As<String>()->Utf8Length());
}

EDIT: s/UTF8/Utf8/ for consistency with other functions/types operating on UTF-8.



Expand Down Expand Up @@ -745,7 +745,7 @@ void Initialize(Handle<Object> target,

env->SetMethod(target, "setupBufferJS", SetupBufferJS);

env->SetMethod(target, "byteLength", ByteLength);
env->SetMethod(target, "byteLengthUTF8", ByteLengthUTF8);
env->SetMethod(target, "compare", Compare);
env->SetMethod(target, "fill", Fill);
env->SetMethod(target, "indexOfBuffer", IndexOfBuffer);
Expand Down
43 changes: 43 additions & 0 deletions test/parallel/test-buffer-bytelength.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
var common = require('../common');
var assert = require('assert');
var Buffer = require('buffer').Buffer;

// coerce values to string
assert.equal(Buffer.byteLength(32, 'raw'), 2);
assert.equal(Buffer.byteLength(NaN, 'utf8'), 3);
assert.equal(Buffer.byteLength({}, 'raws'), 15);
assert.equal(Buffer.byteLength(), 9);

// special case: zero length string
assert.equal(Buffer.byteLength('', 'ascii'), 0);
assert.equal(Buffer.byteLength('', 'HeX'), 0);

// utf8
assert.equal(Buffer.byteLength('∑éllö wørl∂!', 'utf-8'), 19);
assert.equal(Buffer.byteLength('κλμνξο', 'utf8'), 12);
assert.equal(Buffer.byteLength('挵挶挷挸挹', 'utf-8'), 15);
assert.equal(Buffer.byteLength('𠝹𠱓𠱸', 'UTF8'), 12);
// without an encoding, utf8 should be assumed
assert.equal(Buffer.byteLength('hey there'), 9);
assert.equal(Buffer.byteLength('𠱸挶νξ#xx :)'), 17);
assert.equal(Buffer.byteLength('hello world', ''), 11);
// it should also be assumed with unrecognized encoding
assert.equal(Buffer.byteLength('hello world', 'abc'), 11);
assert.equal(Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'), 10);

// base64
assert.equal(Buffer.byteLength('aGVsbG8gd29ybGQ=', 'base64'), 11);
assert.equal(Buffer.byteLength('bm9kZS5qcyByb2NrcyE=', 'base64'), 14);
assert.equal(Buffer.byteLength('aGkk', 'base64'), 3);
assert.equal(Buffer.byteLength('bHNrZGZsa3NqZmtsc2xrZmFqc2RsZmtqcw==', 'base64'), 25);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Long line.

// special padding
assert.equal(Buffer.byteLength('aaa=', 'base64'), 2);
assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3);

assert.equal(Buffer.byteLength('Il était tué'), 14);
assert.equal(Buffer.byteLength('Il était tué', 'utf8'), 14);
assert.equal(Buffer.byteLength('Il était tué', 'ascii'), 12);
assert.equal(Buffer.byteLength('Il était tué', 'binary'), 12);
['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) {
assert.equal(24, Buffer.byteLength('Il était tué', encoding));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"He was killed"?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just copied it from the old test-buffer.js.

});
13 changes: 0 additions & 13 deletions test/parallel/test-buffer.js
Original file line number Diff line number Diff line change
Expand Up @@ -560,15 +560,6 @@ assert.equal(sb, s);
b = new Buffer('abcde');
assert.equal('bcde', b.slice(1).toString());

// byte length
assert.equal(14, Buffer.byteLength('Il était tué'));
assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) {
assert.equal(24, Buffer.byteLength('Il était tué', encoding));
});
assert.equal(12, Buffer.byteLength('Il était tué', 'ascii'));
assert.equal(12, Buffer.byteLength('Il était tué', 'binary'));

// slice(0,0).length === 0
assert.equal(0, Buffer('hello').slice(0, 0).length);

Expand Down Expand Up @@ -1073,10 +1064,6 @@ assert.equal(buf.readInt8(0), -1);
assert.ok(typeof Buffer(5).slice(0, 5).parent === 'object');
})();

// Make sure byteLength properly checks for base64 padding
assert.equal(Buffer.byteLength('aaa=', 'base64'), 2);
assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3);

// Regression test for #5482: should throw but not assert in C++ land.
assert.throws(function() {
Buffer('', 'buffer');
Expand Down