diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 67b469511b0f2d..bb77387e8f87b7 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -994,10 +994,12 @@ void IndexOfString(const FunctionCallbackInfo& args) { bool is_forward = args[4]->IsTrue(); const char* haystack = ts_obj_data; - const size_t haystack_length = ts_obj_length; - // Extended latin-1 characters are 2 bytes in Utf8. + // Round down to the nearest multiple of 2 in case of UCS2. + const size_t haystack_length = (enc == UCS2) ? + ts_obj_length &~ 1 : ts_obj_length; // NOLINT(whitespace/operators) + const size_t needle_length = - enc == BINARY ? needle->Length() : needle->Utf8Length(); + StringBytes::Size(args.GetIsolate(), needle, enc); if (needle_length == 0 || haystack_length == 0) { return args.GetReturnValue().Set(-1); @@ -1009,7 +1011,8 @@ void IndexOfString(const FunctionCallbackInfo& args) { } size_t offset = static_cast(opt_offset); CHECK_LT(offset, haystack_length); - if (is_forward && needle_length + offset > haystack_length) { + if ((is_forward && needle_length + offset > haystack_length) || + needle_length > haystack_length) { return args.GetReturnValue().Set(-1); } @@ -1111,7 +1114,8 @@ void IndexOfBuffer(const FunctionCallbackInfo& args) { } size_t offset = static_cast(opt_offset); CHECK_LT(offset, haystack_length); - if (is_forward && needle_length + offset > haystack_length) { + if ((is_forward && needle_length + offset > haystack_length) || + needle_length > haystack_length) { return args.GetReturnValue().Set(-1); } diff --git a/src/string_search.h b/src/string_search.h index bf246702d7e75e..0426fd840c0bd1 100644 --- a/src/string_search.h +++ b/src/string_search.h @@ -252,7 +252,7 @@ inline const void* MemrchrFill(const void* haystack, uint8_t needle, // `subject`. Does not check that the whole pattern matches. template inline size_t FindFirstCharacter(Vector pattern, - Vector subject, size_t index) { + Vector subject, size_t index) { const Char pattern_first_char = pattern[0]; const size_t max_n = (subject.length() - pattern.length() + 1); @@ -261,19 +261,19 @@ inline size_t FindFirstCharacter(Vector pattern, const uint8_t search_byte = GetHighestValueByte(pattern_first_char); size_t pos = index; do { - size_t bytes_to_search; + const size_t bytes_to_search = (max_n - pos) * sizeof(Char); const void* void_pos; if (subject.forward()) { // Assert that bytes_to_search won't overflow CHECK_LE(pos, max_n); CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); - bytes_to_search = (max_n - pos) * sizeof(Char); void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); } else { CHECK_LE(pos, subject.length()); CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); - bytes_to_search = (subject.length() - pos) * sizeof(Char); - void_pos = MemrchrFill(subject.start(), search_byte, bytes_to_search); + void_pos = MemrchrFill(subject.start() + pattern.length() - 1, + search_byte, + bytes_to_search); } const Char* char_pos = static_cast(void_pos); if (char_pos == nullptr) @@ -308,7 +308,9 @@ inline size_t FindFirstCharacter(Vector pattern, if (subject.forward()) { pos = memchr(subject.start() + index, pattern_first_char, max_n - index); } else { - pos = MemrchrFill(subject.start(), pattern_first_char, subj_len - index); + pos = MemrchrFill(subject.start() + pattern.length() - 1, + pattern_first_char, + max_n - index); } const uint8_t* char_pos = static_cast(pos); if (char_pos == nullptr) { diff --git a/test/parallel/test-buffer-indexof.js b/test/parallel/test-buffer-indexof.js index 0a20d2ce9af021..7d54ea0d47d0a1 100644 --- a/test/parallel/test-buffer-indexof.js +++ b/test/parallel/test-buffer-indexof.js @@ -222,6 +222,15 @@ var allCharsBufferUcs2 = Buffer.from(allCharsString, 'ucs2'); assert.equal(-1, allCharsBufferUtf8.indexOf('notfound')); assert.equal(-1, allCharsBufferUcs2.indexOf('notfound')); +// Needle is longer than haystack, but only because it's encoded as UTF-16 +assert.strictEqual(Buffer.from('aaaa').indexOf('a'.repeat(4), 'ucs2'), -1); + +assert.strictEqual(Buffer.from('aaaa').indexOf('a'.repeat(4), 'utf8'), 0); +assert.strictEqual(Buffer.from('aaaa').indexOf('你好', 'ucs2'), -1); + +// Haystack has odd length, but the needle is UCS2. +assert.strictEqual(Buffer.from('aaaaa').indexOf('b', 'ucs2'), -1); + { // Find substrings in Utf8. const lengths = [1, 3, 15]; // Single char, simple and complex. @@ -345,12 +354,35 @@ assert.equal(b.lastIndexOf('b', {}), 1); assert.equal(b.lastIndexOf('b', []), -1); assert.equal(b.lastIndexOf('b', [2]), 1); +// Test needles longer than the haystack. +assert.strictEqual(b.lastIndexOf('aaaaaaaaaaaaaaa', 'ucs2'), -1); +assert.strictEqual(b.lastIndexOf('aaaaaaaaaaaaaaa', 'utf8'), -1); +assert.strictEqual(b.lastIndexOf('aaaaaaaaaaaaaaa', 'binary'), -1); +assert.strictEqual(b.lastIndexOf(Buffer.from('aaaaaaaaaaaaaaa')), -1); +assert.strictEqual(b.lastIndexOf('aaaaaaaaaaaaaaa', 2, 'ucs2'), -1); +assert.strictEqual(b.lastIndexOf('aaaaaaaaaaaaaaa', 3, 'utf8'), -1); +assert.strictEqual(b.lastIndexOf('aaaaaaaaaaaaaaa', 5, 'binary'), -1); +assert.strictEqual(b.lastIndexOf(Buffer.from('aaaaaaaaaaaaaaa'), 7), -1); + +// 你好 expands to a total of 6 bytes using UTF-8 and 4 bytes using UTF-16 +assert.strictEqual(buf_bc.lastIndexOf('你好', 'ucs2'), -1); +assert.strictEqual(buf_bc.lastIndexOf('你好', 'utf8'), -1); +assert.strictEqual(buf_bc.lastIndexOf('你好', 'binary'), -1); +assert.strictEqual(buf_bc.lastIndexOf(Buffer.from('你好')), -1); +assert.strictEqual(buf_bc.lastIndexOf('你好', 2, 'ucs2'), -1); +assert.strictEqual(buf_bc.lastIndexOf('你好', 3, 'utf8'), -1); +assert.strictEqual(buf_bc.lastIndexOf('你好', 5, 'binary'), -1); +assert.strictEqual(buf_bc.lastIndexOf(Buffer.from('你好'), 7), -1); + // Test lastIndexOf on a longer buffer: var bufferString = new Buffer('a man a plan a canal panama'); assert.equal(15, bufferString.lastIndexOf('canal')); assert.equal(21, bufferString.lastIndexOf('panama')); assert.equal(0, bufferString.lastIndexOf('a man a plan a canal panama')); assert.equal(-1, bufferString.lastIndexOf('a man a plan a canal mexico')); +assert.equal(-1, bufferString.lastIndexOf('a man a plan a canal mexico city')); +assert.equal(-1, bufferString.lastIndexOf(Buffer.from('a'.repeat(1000)))); +assert.equal(0, bufferString.lastIndexOf('a man a plan', 4)); assert.equal(13, bufferString.lastIndexOf('a ')); assert.equal(13, bufferString.lastIndexOf('a ', 13)); assert.equal(6, bufferString.lastIndexOf('a ', 12)); @@ -359,6 +391,23 @@ assert.equal(13, bufferString.lastIndexOf('a ', -1)); assert.equal(0, bufferString.lastIndexOf('a ', -27)); assert.equal(-1, bufferString.lastIndexOf('a ', -28)); +// Test lastIndexOf for the case that the first character can be found, +// but in a part of the buffer that does not make search to search +// due do length constraints. +const abInUCS2 = Buffer.from('ab', 'ucs2'); +assert.strictEqual(-1, Buffer.from('µaaaa¶bbbb', 'binary').lastIndexOf('µ')); +assert.strictEqual(-1, Buffer.from('bc').lastIndexOf('ab')); +assert.strictEqual(-1, Buffer.from('abc').lastIndexOf('qa')); +assert.strictEqual(-1, Buffer.from('abcdef').lastIndexOf('qabc')); +assert.strictEqual(-1, Buffer.from('bc').lastIndexOf(Buffer.from('ab'))); +assert.strictEqual(-1, Buffer.from('bc', 'ucs2').lastIndexOf('ab', 'ucs2')); +assert.strictEqual(-1, Buffer.from('bc', 'ucs2').lastIndexOf(abInUCS2)); + +assert.strictEqual(0, Buffer.from('abc').lastIndexOf('ab')); +assert.strictEqual(0, Buffer.from('abc').lastIndexOf('ab', 1)); +assert.strictEqual(0, Buffer.from('abc').lastIndexOf('ab', 2)); +assert.strictEqual(0, Buffer.from('abc').lastIndexOf('ab', 3)); + // The above tests test the LINEAR and SINGLE-CHAR strategies. // Now, we test the BOYER-MOORE-HORSPOOL strategy. // Test lastIndexOf on a long buffer w multiple matches: