From efae4bfd1357d0f81d3a04fee7bb1914a0a2401e Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 21 Nov 2023 10:42:02 -0500 Subject: [PATCH 1/4] src: implement FastByteLengthUtf8 with simdutf::utf8_length_from_latin1 --- src/node_buffer.cc | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index ff041274f90d24..ab7647bc87b197 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -743,14 +743,7 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) { uint32_t FastByteLengthUtf8(Local<Value> receiver, const v8::FastOneByteString& source) { - uint32_t result = 0; - uint32_t length = source.length; - const uint8_t* data = reinterpret_cast<const uint8_t*>(source.data); - for (uint32_t i = 0; i < length; ++i) { - result += (data[i] >> 7); - } - result += length; - return result; + return simdutf::utf8_length_from_latin1(source.data, source.length); } static v8::CFunction fast_byte_length_utf8( From 1eb654e274ce48c2ada2cba6f93e07b554222b1f Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 26 Nov 2023 17:10:50 -0500 Subject: [PATCH 2/4] guard simdutf usage and adds latin1 string to bench --- benchmark/buffers/buffer-bytelength-string.js | 3 ++- src/node_buffer.cc | 13 ++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/benchmark/buffers/buffer-bytelength-string.js b/benchmark/buffers/buffer-bytelength-string.js index fc0c005e7f9e6a..9f5c3f744c7222 100644 --- a/benchmark/buffers/buffer-bytelength-string.js +++ b/benchmark/buffers/buffer-bytelength-string.js @@ -2,7 +2,7 @@ const common = require('../common'); const bench = common.createBenchmark(main, { - type: ['one_byte', 'two_bytes', 'three_bytes', 'four_bytes'], + type: ['one_byte', 'two_bytes', 'three_bytes', 'four_bytes', 'latin1'], encoding: ['utf8', 'base64'], repeat: [1, 2, 16, 256], // x16 n: [4e6], @@ -14,6 +14,7 @@ const chars = { two_bytes: 'ΰαβγδεζηθικλμνξο', three_bytes: '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', four_bytes: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢', 
+ latin1: 'Un homme sage est supérieur à toutes les insultes qui peuvent lui être adressées, et la meilleure réponse est la patience et la modération.' }; function getInput(type, repeat, encoding) { diff --git a/src/node_buffer.cc b/src/node_buffer.cc index ab7647bc87b197..02470d07bcb27e 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -743,7 +743,18 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) { uint32_t FastByteLengthUtf8(Local<Value> receiver, const v8::FastOneByteString& source) { - return simdutf::utf8_length_from_latin1(source.data, source.length); + // For short inputs, the function call overhead to simdutf is maybe + // not worth it, reserve simdutf for long strings. + if(source.length > 128) { + return simdutf::utf8_length_from_latin1(source.data, source.length); + } + uint32_t length = source.length; + uint32_t result = length; + const uint8_t* data = reinterpret_cast<const uint8_t*>(source.data); + for (uint32_t i = 0; i < length; ++i) { + result += (data[i] >> 7); + } + return result; } static v8::CFunction fast_byte_length_utf8( From 074dfbbe5e8ed53aae275d65e59ec80197b2280f Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 27 Nov 2023 12:10:20 -0500 Subject: [PATCH 3/4] lint --- src/node_buffer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 02470d07bcb27e..300060f9d24290 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -745,7 +745,7 @@ uint32_t FastByteLengthUtf8(Local<Value> receiver, const v8::FastOneByteString& source) { // For short inputs, the function call overhead to simdutf is maybe // not worth it, reserve simdutf for long strings. 
- if(source.length > 128) { + if (source.length > 128) { return simdutf::utf8_length_from_latin1(source.data, source.length); } uint32_t length = source.length; From eb021106099319f2b538aebdb8996a51a3a8ceff Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Mon, 4 Dec 2023 18:13:53 -0500 Subject: [PATCH 4/4] fix: lint --- benchmark/buffers/buffer-bytelength-string.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmark/buffers/buffer-bytelength-string.js b/benchmark/buffers/buffer-bytelength-string.js index 9f5c3f744c7222..143da0215a613b 100644 --- a/benchmark/buffers/buffer-bytelength-string.js +++ b/benchmark/buffers/buffer-bytelength-string.js @@ -2,7 +2,8 @@ const common = require('../common'); const bench = common.createBenchmark(main, { - type: ['one_byte', 'two_bytes', 'three_bytes', 'four_bytes', 'latin1'], + type: ['one_byte', 'two_bytes', 'three_bytes', + 'four_bytes', 'latin1'], encoding: ['utf8', 'base64'], repeat: [1, 2, 16, 256], // x16 n: [4e6], @@ -14,7 +15,8 @@ const chars = { two_bytes: 'ΰαβγδεζηθικλμνξο', three_bytes: '挰挱挲挳挴挵挶挷挸挹挺挻挼挽挾挿', four_bytes: '𠜎𠜱𠝹𠱓𠱸𠲖𠳏𠳕𠴕𠵼𠵿𠸎𠸏𠹷𠺝𠺢', - latin1: 'Un homme sage est supérieur à toutes les insultes qui peuvent lui être adressées, et la meilleure réponse est la patience et la modération.' + latin1: 'Un homme sage est supérieur à toutes ' + + 'les insultes qui peuvent lui être adressées, et la meilleure réponse est la patience et la modération.', }; function getInput(type, repeat, encoding) {