-
Notifications
You must be signed in to change notification settings - Fork 30.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
buffer: optimize Buffer.byteLength #1713
Changes from 6 commits
7b5655c
447fe0b
35eeaea
30dea88
69c0889
3c05601
84bb8a4
5156d31
47a5209
2f1d3b5
0610eb8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
var common = require('../common'); | ||
|
||
var bench = common.createBenchmark(main, { | ||
encoding: ['utf8', 'base64'], | ||
len: [1, 2, 4, 16, 64, 256], // x16 | ||
n: [5e6] | ||
}); | ||
|
||
// 16 chars each | ||
var chars = [ | ||
'hello brendan!!!', // 1 byte | ||
'ΰαβγδεζηθικλμνξο', // 2 bytes | ||
'挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes | ||
'𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes | ||
]; | ||
|
||
function main(conf) { | ||
var n = conf.n | 0; | ||
var len = conf.len | 0; | ||
var encoding = conf.encoding; | ||
|
||
var strings = []; | ||
for (var string of chars) { | ||
// Strings must be built differently, depending on encoding | ||
var data = buildString(string, len); | ||
if (encoding === 'utf8') { | ||
strings.push(data); | ||
} else if (encoding === 'base64') { | ||
// Base64 strings will be much longer than their UTF8 counterparts | ||
strings.push(new Buffer(data, 'utf8').toString('base64')); | ||
} | ||
} | ||
|
||
// Check the result to ensure it is *properly* optimized | ||
var results = strings.map(function(val) { | ||
return Buffer.byteLength(val, encoding); | ||
}); | ||
|
||
bench.start(); | ||
for (var i = 0; i < n; i++) { | ||
var index = n % strings.length; | ||
// Go! | ||
var r = Buffer.byteLength(strings[index], encoding); | ||
|
||
if (r !== results[index]) | ||
throw Error('incorrect return value'); | ||
} | ||
bench.end(n); | ||
} | ||
|
||
function buildString(str, times) { | ||
if (times == 1) return str; | ||
|
||
return str + buildString(str, times - 1); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -272,30 +272,64 @@ Buffer.concat = function(list, length) { | |
}; | ||
|
||
|
||
function base64ByteLength(str, len) { | ||
var bytes = len; | ||
|
||
// Handle padding | ||
if (str[len - 1] === '=') | ||
bytes--; | ||
if (len > 2 && str[len - 2] === '=') | ||
bytes--; | ||
|
||
// Base64 ratio: 3/4 | ||
return (bytes * 3) >>> 2; | ||
} | ||
|
||
|
||
function byteLength(string, encoding) { | ||
if (typeof(string) !== 'string') | ||
string = String(string); | ||
if (typeof string !== 'string') | ||
string = '' + string; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a change in functionality? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nope, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @bnoordhuis do we care if node aborts if someone does: process.binding('buffer').byteLengthUtf8(42); Or do we want to continue throwing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so. Fooling around with process.binding() voids the warranty. |
||
|
||
if (string.length === 0) | ||
var len = string.length; | ||
if (len === 0) | ||
return 0; | ||
|
||
switch (encoding) { | ||
case 'ascii': | ||
case 'binary': | ||
case 'raw': | ||
return string.length; | ||
// Use a for loop to avoid recursion | ||
var loweredCase = false; | ||
for (;;) { | ||
switch (encoding) { | ||
case 'ascii': | ||
case 'binary': | ||
// Deprecated | ||
case 'raw': | ||
case 'raws': | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was supported by the C++ version, but isn't currently supported by There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW, I think 'raw' and 'raws' have been deprecated since v0.2 or v0.3. I've never seen them being used in the wild either. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can follow this PR up with a deprecation commit (to keep it semver-patch) - technically speaking only > Buffer.byteLength('abc', 'raw')
3
> Buffer.byteLength('abc', 'raws')
'raws' encoding has been renamed to 'binary'. Please update your code.
3 |
||
return len; | ||
|
||
case 'ucs2': | ||
case 'ucs-2': | ||
case 'utf16le': | ||
case 'utf-16le': | ||
return string.length * 2; | ||
case 'utf8': | ||
case 'utf-8': | ||
return binding.byteLengthUTF8(string); | ||
|
||
case 'hex': | ||
return string.length >>> 1; | ||
} | ||
case 'ucs2': | ||
case 'ucs-2': | ||
case 'utf16le': | ||
case 'utf-16le': | ||
return len * 2; | ||
|
||
case 'hex': | ||
return len >>> 1; | ||
|
||
return binding.byteLength(string, encoding); | ||
case 'base64': | ||
return base64ByteLength(string, len); | ||
|
||
default: | ||
// The C++ binding defaulted to UTF8, we should too. | ||
if (loweredCase) | ||
return binding.byteLengthUTF8(string); | ||
|
||
encoding = ('' + encoding).toLowerCase(); | ||
loweredCase = true; | ||
} | ||
} | ||
} | ||
|
||
Buffer.byteLength = byteLength; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -541,17 +541,17 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) { | |
} | ||
|
||
|
||
void ByteLength(const FunctionCallbackInfo<Value> &args) { | ||
void ByteLengthUTF8(const FunctionCallbackInfo<Value> &args) { | ||
Environment* env = Environment::GetCurrent(args); | ||
|
||
if (!args[0]->IsString()) | ||
return env->ThrowTypeError("Argument must be a string"); | ||
|
||
Local<String> s = args[0]->ToString(env->isolate()); | ||
enum encoding e = ParseEncoding(env->isolate(), args[1], UTF8); | ||
// Fast case: avoid StringBytes on UTF8 string. Jump to v8. | ||
Local<String> str = args[0]->ToString(env->isolate()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's guaranteed to be a string so There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can I drop the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea the string check and env can be dropped, this is always called with a string from js. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would leave in a |
||
int len = str->Utf8Length(); | ||
|
||
uint32_t size = StringBytes::Size(env->isolate(), s, e); | ||
args.GetReturnValue().Set(size); | ||
args.GetReturnValue().Set(len); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can reduce this function to just: void ByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
CHECK(args[0]->IsString());
args.GetReturnValue().Set(args[0].As<String>()->Utf8Length());
} EDIT: s/UTF8/Utf8/ for consistency with other functions/types operating on UTF-8. |
||
|
||
|
||
|
@@ -745,7 +745,7 @@ void Initialize(Handle<Object> target, | |
|
||
env->SetMethod(target, "setupBufferJS", SetupBufferJS); | ||
|
||
env->SetMethod(target, "byteLength", ByteLength); | ||
env->SetMethod(target, "byteLengthUTF8", ByteLengthUTF8); | ||
env->SetMethod(target, "compare", Compare); | ||
env->SetMethod(target, "fill", Fill); | ||
env->SetMethod(target, "indexOfBuffer", IndexOfBuffer); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
var common = require('../common'); | ||
var assert = require('assert'); | ||
var Buffer = require('buffer').Buffer; | ||
|
||
// coerce values to string | ||
assert.equal(Buffer.byteLength(32, 'raw'), 2); | ||
assert.equal(Buffer.byteLength(NaN, 'utf8'), 3); | ||
assert.equal(Buffer.byteLength({}, 'raws'), 15); | ||
assert.equal(Buffer.byteLength(), 9); | ||
|
||
// special case: zero length string | ||
assert.equal(Buffer.byteLength('', 'ascii'), 0); | ||
assert.equal(Buffer.byteLength('', 'HeX'), 0); | ||
|
||
// utf8 | ||
assert.equal(Buffer.byteLength('∑éllö wørl∂!', 'utf-8'), 19); | ||
assert.equal(Buffer.byteLength('κλμνξο', 'utf8'), 12); | ||
assert.equal(Buffer.byteLength('挵挶挷挸挹', 'utf-8'), 15); | ||
assert.equal(Buffer.byteLength('𠝹𠱓𠱸', 'UTF8'), 12); | ||
// without an encoding, utf8 should be assumed | ||
assert.equal(Buffer.byteLength('hey there'), 9); | ||
assert.equal(Buffer.byteLength('𠱸挶νξ#xx :)'), 17); | ||
assert.equal(Buffer.byteLength('hello world', ''), 11); | ||
// it should also be assumed with unrecognized encoding | ||
assert.equal(Buffer.byteLength('hello world', 'abc'), 11); | ||
assert.equal(Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'), 10); | ||
|
||
// base64 | ||
assert.equal(Buffer.byteLength('aGVsbG8gd29ybGQ=', 'base64'), 11); | ||
assert.equal(Buffer.byteLength('bm9kZS5qcyByb2NrcyE=', 'base64'), 14); | ||
assert.equal(Buffer.byteLength('aGkk', 'base64'), 3); | ||
assert.equal(Buffer.byteLength('bHNrZGZsa3NqZmtsc2xrZmFqc2RsZmtqcw==', 'base64'), 25); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Long line. |
||
// special padding | ||
assert.equal(Buffer.byteLength('aaa=', 'base64'), 2); | ||
assert.equal(Buffer.byteLength('aaaa==', 'base64'), 3); | ||
|
||
assert.equal(Buffer.byteLength('Il était tué'), 14); | ||
assert.equal(Buffer.byteLength('Il était tué', 'utf8'), 14); | ||
assert.equal(Buffer.byteLength('Il était tué', 'ascii'), 12); | ||
assert.equal(Buffer.byteLength('Il était tué', 'binary'), 12); | ||
['ucs2', 'ucs-2', 'utf16le', 'utf-16le'].forEach(function(encoding) { | ||
assert.equal(24, Buffer.byteLength('Il était tué', encoding)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "He was killed"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just copied it from the old |
||
}); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comparing numbers is faster than strings:
str.charCodeAt(len - 1) === 0x3D
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! Better perf. Learn something new every day 😄