buffer: optimize Buffer.byteLength

Buffer.byteLength is called whenever a new Buffer is created from a string. UTF8 is the default encoding, and base64 is also popular, so the length calculation for both must be fast: it accounts for a relatively significant part of Buffer instantiation time. This commit moves the Buffer.byteLength calculations for base64 and UTF8 out of C++ land and entirely into JS-land. It also adds a benchmark for both encodings; the improvements hover around 40-60% for UTF8 strings and 170% for base64.
nodejs · May 16, 2015 · cb04c10 · cb04c10
1 parent 0a48a8b
commit cb04c10
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 5 deletions.
diff --git a/benchmark/buffers/buffer-bytelength.js b/benchmark/buffers/buffer-bytelength.js
@@ -0,0 +1,36 @@
+// Benchmark for Buffer.byteLength() with the 'utf8' and 'base64' encodings.
+var common = require('../common');
+
+var bench = common.createBenchmark(main, {
+  encoding: ['utf8', 'base64'],
+  len: [4, 16, 64, 256, 1024],
+  n: [1e7]
+});
+
+// Four sample strings per encoding, keyed 0-3 so the benchmark loop can
+// rotate through them.
+// utf8: 16 chars of each byte length
+var encodings = {
+  'utf8': {
+    0: 'hello brendan!!!', // 1 byte
+    1: 'ΰαβγδεζηθικλμνξο', // 2 bytes
+    2: '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', // 3 bytes
+    3: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢' // 4 bytes
+  },
+  'base64': {
+    0: 'aGVsbG8gd29ybGQh', // no padding
+    1: 'YW55IGNhcm5hbCBwbGVhc3VyZS4=', // one padding character
+    2: 'YWJjZGVmZ2hpamtsbW5eYw==', // two padding characters
+    3: 'YXN1cmUu', // no padding
+  }
+};
+
+// Runs Buffer.byteLength() `conf.n` times for `conf.encoding`, cycling
+// through the four sample strings of that encoding.
+// NOTE(review): the `len` option is declared in the benchmark config but is
+// not used to size the sample strings, so every `len` value measures the
+// same inputs.
+function main(conf) {
+  var n = conf.n | 0;
+  var encoding = conf.encoding;
+  var chars = encodings[encoding];
+
+  bench.start();
+  for (var i = 0; i < n; i++) {
+    // Index by the loop counter. `n % 4` is loop-invariant, so it would
+    // measure the same sample string on every iteration.
+    Buffer.byteLength(chars[i % 4], encoding);
+  }
+  bench.end(n);
+}
diff --git a/lib/buffer.js b/lib/buffer.js
@@ -272,19 +272,66 @@ Buffer.concat = function(list, length) {
 };
 
 
+// Computes the number of bytes a base64 string decodes to, using only its
+// length and trailing padding. The characters themselves are not validated.
+function base64ByteLength(str) {
+  var total = str.length;
+  var significant = total;

+  // Up to two trailing '=' characters are padding and carry no data
+  if (str[total - 1] === '=')
+    significant -= 1;
+  if (total > 2 && str[total - 2] === '=')
+    significant -= 1;

+  // Every 4 base64 characters encode 3 bytes; partial groups round down
+  return Math.floor((significant / 4) * 3);
+}
+
+// Returns the number of bytes needed to encode `str` as UTF-8.
+//
+// charCodeAt() yields UTF-16 code units (0x0000-0xFFFF), so a character
+// outside the BMP shows up as a high/low surrogate pair. A valid pair is
+// counted once as 4 bytes; an unpaired surrogate is counted as 3 bytes.
+function utf8ByteLength(str) {
+  var bytes = str.length;

+  for (var i = 0, l = str.length; i < l; i++) {
+    var code = str.charCodeAt(i);

+    // One byte for each code unit is already covered by str.length, so
+    // only the extra bytes are added here.
+    if (code <= 0x7F)
+      continue;

+    if (code <= 0x7FF) {
+      bytes += 1;
+      continue;
+    }

+    // 0x800-0xFFFF: 3 bytes for this code unit.
+    bytes += 2;

+    // A high surrogate followed by a low surrogate is one 4-byte character.
+    // The two code units plus the 2 bytes added above already total 4, so
+    // skip the low surrogate instead of counting it as another 3 bytes.
+    if (code >= 0xD800 && code <= 0xDBFF && i + 1 < l) {
+      var next = str.charCodeAt(i + 1);
+      if (next >= 0xDC00 && next <= 0xDFFF)
+        i++;
+    }
+  }

+  return bytes;
+}
+
+
 function byteLength(string, encoding) {
-  if (typeof(string) !== 'string')
-    string = String(string);
+  if (typeof string !== 'string')
+    string = '' + string;
+  if (typeof encoding !== 'string')
+    encoding = 'utf8';
 
   if (string.length === 0)
     return 0;
 
-  switch (encoding) {
+  switch (encoding.toLowerCase()) {
     case 'ascii':
     case 'binary':
+    // Deprecated
     case 'raw':
+    case 'raws':
       return string.length;
 
+    case 'utf8':
+    case 'utf-8':
+      return utf8ByteLength(string);
+
     case 'ucs2':
     case 'ucs-2':
     case 'utf16le':
@@ -293,9 +340,15 @@ function byteLength(string, encoding) {
 
     case 'hex':
       return string.length >>> 1;
-  }
 
-  return binding.byteLength(string, encoding);
+    case 'base64':
+      return base64ByteLength(string);
+
+    // Previously, the C++ binding did not error on an unrecognized encoding,
+    // instead opting to use UTF8.
+    default:
+      return utf8ByteLength(string);
+  }
 }
 
 Buffer.byteLength = byteLength;

diff --git a/src/node_buffer.cc b/src/node_buffer.cc
@@ -541,6 +541,7 @@ void WriteDoubleBE(const FunctionCallbackInfo<Value>& args) {
 }
 
 
+// Not called by JS, left for C++ API
 void ByteLength(const FunctionCallbackInfo<Value> &args) {
   Environment* env = Environment::GetCurrent(args);
 

diff --git a/test/parallel/test-buffer.js b/test/parallel/test-buffer.js
@@ -569,6 +569,10 @@ assert.equal(14, Buffer.byteLength('Il était tué', 'utf8'));
 assert.equal(12, Buffer.byteLength('Il était tué', 'ascii'));
 assert.equal(12, Buffer.byteLength('Il était tué', 'binary'));
 
+// should use UTF8 with an unrecognized encoding
+assert.equal(11, Buffer.byteLength('hello world', 'abc'));
+assert.equal(10, Buffer.byteLength('ßœ∑≈', 'unkn0wn enc0ding'));
+
 // slice(0,0).length === 0
 assert.equal(0, Buffer('hello').slice(0, 0).length);
-Original file line number
+Diff line change
@@ Expand Up @@
     }
+    // Not called by JS, left for C++ API
     void ByteLength(const FunctionCallbackInfo<Value> &args) {
       Environment* env = Environment::GetCurrent(args);
@@ Expand Down @@