From ff785fd51792a95281014b5e9b526c6619450dc9 Mon Sep 17 00:00:00 2001
From: Brian White <mscdex@mscdex.net>
Date: Wed, 8 Feb 2017 03:23:03 -0500
Subject: [PATCH] querystring: fix empty pairs and optimize parse()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit fixes handling of empty pairs that occur before the end
of the query string so that they are also ignored.

Additionally, some optimizations have been made, including:

* Avoid unnecessary code execution where possible
* Use a lookup table when checking for hex characters
* Avoid forced decoding when '+' characters are encountered and we
are using the default decoder

Fixes: https://github.com/nodejs/node/issues/10454
PR-URL: https://github.com/nodejs/node/pull/11234
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Nicu Micleușanu <micnic90@gmail.com>
---
 benchmark/querystring/querystring-parse.js |  40 ++--
 lib/querystring.js                         | 246 +++++++++++----------
 test/parallel/test-querystring.js          |   9 +
 3 files changed, 166 insertions(+), 129 deletions(-)

diff --git a/benchmark/querystring/querystring-parse.js b/benchmark/querystring/querystring-parse.js
index fe14d95a53f0a0..0ed677751280c0 100644
--- a/benchmark/querystring/querystring-parse.js
+++ b/benchmark/querystring/querystring-parse.js
@@ -12,7 +12,9 @@ var inputs = {
   multivalue: 'foo=bar&foo=baz&foo=quux&quuy=quuz',
   multivaluemany: 'foo=bar&foo=baz&foo=quux&quuy=quuz&foo=abc&foo=def&' +
                   'foo=ghi&foo=jkl&foo=mno&foo=pqr&foo=stu&foo=vwxyz',
-  manypairs: 'a&b&c&d&e&f&g&h&i&j&k&l&m&n&o&p&q&r&s&t&u&v&w&x&y&z'
+  manypairs: 'a&b&c&d&e&f&g&h&i&j&k&l&m&n&o&p&q&r&s&t&u&v&w&x&y&z',
+  manyblankpairs: '&&&&&&&&&&&&&&&&&&&&&&&&',
+  altspaces: 'foo+bar=baz+quux&xyzzy+thud=quuy+quuz&abc=def+ghi'
 };
 
 var bench = common.createBenchmark(main, {
@@ -20,34 +22,38 @@ var bench = common.createBenchmark(main, {
   n: [1e6],
 });
 
+// A deopt followed by a reopt of main() can happen right when the timed loop
+// starts, which seems to have a noticeable effect on the benchmark results.
+// So we explicitly disable optimization of main() to avoid this potential
+// issue.
+v8.setFlagsFromString('--allow_natives_syntax');
+eval('%NeverOptimizeFunction(main)');
+
 function main(conf) {
   var type = conf.type;
   var n = conf.n | 0;
   var input = inputs[type];
+  var i;
 
-  // Force-optimize querystring.parse() so that the benchmark doesn't get
-  // disrupted by the optimizer kicking in halfway through.
-  v8.setFlagsFromString('--allow_natives_syntax');
-  if (type !== 'multicharsep') {
-    querystring.parse(input);
-    eval('%OptimizeFunctionOnNextCall(querystring.parse)');
-    querystring.parse(input);
-  } else {
-    querystring.parse(input, '&&&&&&&&&&');
-    eval('%OptimizeFunctionOnNextCall(querystring.parse)');
-    querystring.parse(input, '&&&&&&&&&&');
-  }
+  // Note: we do *not* use OptimizeFunctionOnNextCall() here because currently
+  // it causes a deopt followed by a reopt, which could make its way into the
+  // timed loop. Instead, just execute the function a "sufficient" number of
+  // times before the timed loop to ensure the function is optimized just once.
+  if (type === 'multicharsep') {
+    for (i = 0; i < n; i += 1)
+      querystring.parse(input, '&&&&&&&&&&');
 
-  var i;
-  if (type !== 'multicharsep') {
     bench.start();
     for (i = 0; i < n; i += 1)
-      querystring.parse(input);
+      querystring.parse(input, '&&&&&&&&&&');
     bench.end(n);
   } else {
+    for (i = 0; i < n; i += 1)
+      querystring.parse(input);
+
     bench.start();
     for (i = 0; i < n; i += 1)
-      querystring.parse(input, '&&&&&&&&&&');
+      querystring.parse(input);
     bench.end(n);
   }
 }
diff --git a/lib/querystring.js b/lib/querystring.js
index 9b4ca27640aa71..32ba30712090a4 100644
--- a/lib/querystring.js
+++ b/lib/querystring.js
@@ -250,6 +250,25 @@ function stringify(obj, sep, eq, options) {
   return '';
 }
 
+const isHexTable = [
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 - 15
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 - 31
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 32 - 47
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 48 - 63
+  0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64 - 79
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80 - 95
+  0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 96 - 111
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 112 - 127
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 128 ...
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // ... 256
+];
+
 function charCodes(str) {
   if (str.length === 0) return [];
   if (str.length === 1) return [str.charCodeAt(0)];
@@ -292,7 +311,6 @@ function parse(qs, sep, eq, options) {
   const customDecode = (decode !== qsUnescape);
 
   const keys = [];
-  var posIdx = 0;
   var lastPos = 0;
   var sepIdx = 0;
   var eqIdx = 0;
@@ -300,6 +318,7 @@ function parse(qs, sep, eq, options) {
   var value = '';
   var keyEncoded = customDecode;
   var valEncoded = customDecode;
+  const plusChar = (customDecode ? '%20' : ' ');
   var encodeCheck = 0;
   for (var i = 0; i < qs.length; ++i) {
     const code = qs.charCodeAt(i);
@@ -310,142 +329,145 @@ function parse(qs, sep, eq, options) {
         // Key/value pair separator match!
         const end = i - sepIdx + 1;
         if (eqIdx < eqLen) {
-          // If we didn't find the key/value separator, treat the substring as
-          // part of the key instead of the value
-          if (lastPos < end)
+          // We didn't find the (entire) key/value separator
+          if (lastPos < end) {
+            // Treat the substring as part of the key instead of the value
             key += qs.slice(lastPos, end);
-        } else if (lastPos < end)
-          value += qs.slice(lastPos, end);
-        if (keyEncoded)
-          key = decodeStr(key, decode);
-        if (valEncoded)
-          value = decodeStr(value, decode);
-
-        if (key || value || lastPos - posIdx > sepLen || i === 0) {
-          // Use a key array lookup instead of using hasOwnProperty(), which is
-          // slower
-          if (keys.indexOf(key) === -1) {
-            obj[key] = value;
-            keys[keys.length] = key;
+            if (keyEncoded)
+              key = decodeStr(key, decode);
           } else {
-            const curValue = obj[key] || '';
-            // A simple Array-specific property check is enough here to
-            // distinguish from a string value and is faster and still safe
-            // since we are generating all of the values being assigned.
-            if (curValue.pop)
-              curValue[curValue.length] = value;
-            else if (curValue)
-              obj[key] = [curValue, value];
+            // We saw an empty substring between separators
+            if (--pairs === 0)
+              return obj;
+            lastPos = i + 1;
+            sepIdx = eqIdx = 0;
+            continue;
+          }
+        } else {
+          if (lastPos < end) {
+            value += qs.slice(lastPos, end);
+            if (valEncoded)
+              value = decodeStr(value, decode);
           }
-        } else if (i === 1) {
-          // A pair with repeated sep could be added into obj in the first loop
-          // and it should be deleted
-          delete obj[key];
+          if (keyEncoded)
+            key = decodeStr(key, decode);
+        }
+
+        // Use a key array lookup instead of using hasOwnProperty(), which is
+        // slower
+        if (keys.indexOf(key) === -1) {
+          obj[key] = value;
+          keys[keys.length] = key;
+        } else {
+          const curValue = obj[key];
+          // A simple Array-specific property check is enough here to
+          // distinguish from a string value and is faster and still safe
+          // since we are generating all of the values being assigned.
+          if (curValue.pop)
+            curValue[curValue.length] = value;
+          else
+            obj[key] = [curValue, value];
         }
         if (--pairs === 0)
-          break;
+          return obj;
         keyEncoded = valEncoded = customDecode;
-        encodeCheck = 0;
         key = value = '';
-        posIdx = lastPos;
+        encodeCheck = 0;
         lastPos = i + 1;
         sepIdx = eqIdx = 0;
       }
-      continue;
     } else {
       sepIdx = 0;
-      if (!valEncoded) {
-        // Try to match an (valid) encoded byte (once) to minimize unnecessary
-        // calls to string decoding functions
-        if (code === 37/*%*/) {
-          encodeCheck = 1;
-        } else if (encodeCheck > 0 &&
-                   ((code >= 48/*0*/ && code <= 57/*9*/) ||
-                    (code >= 65/*A*/ && code <= 70/*F*/) ||
-                    (code >= 97/*a*/ && code <= 102/*f*/))) {
-          if (++encodeCheck === 3)
-            valEncoded = true;
+      // Try matching key/value separator (e.g. '=') if we haven't already
+      if (eqIdx < eqLen) {
+        if (code === eqCodes[eqIdx]) {
+          if (++eqIdx === eqLen) {
+            // Key/value separator match!
+            const end = i - eqIdx + 1;
+            if (lastPos < end)
+              key += qs.slice(lastPos, end);
+            encodeCheck = 0;
+            lastPos = i + 1;
+          }
+          continue;
         } else {
-          encodeCheck = 0;
+          eqIdx = 0;
+          if (!keyEncoded) {
+            // Try to match an (valid) encoded byte once to minimize unnecessary
+            // calls to string decoding functions
+            if (code === 37/*%*/) {
+              encodeCheck = 1;
+              continue;
+            } else if (encodeCheck > 0) {
+              // eslint-disable-next-line no-extra-boolean-cast
+              if (!!isHexTable[code]) {
+                if (++encodeCheck === 3)
+                  keyEncoded = true;
+                continue;
+              } else {
+                encodeCheck = 0;
+              }
+            }
+          }
         }
-      }
-    }
-
-    // Try matching key/value separator (e.g. '=') if we haven't already
-    if (eqIdx < eqLen) {
-      if (code === eqCodes[eqIdx]) {
-        if (++eqIdx === eqLen) {
-          // Key/value separator match!
-          const end = i - eqIdx + 1;
-          if (lastPos < end)
-            key += qs.slice(lastPos, end);
-          encodeCheck = 0;
+        if (code === 43/*+*/) {
+          if (lastPos < i)
+            key += qs.slice(lastPos, i);
+          key += plusChar;
           lastPos = i + 1;
+          continue;
         }
-        continue;
-      } else {
-        eqIdx = 0;
-        if (!keyEncoded) {
-          // Try to match an (valid) encoded byte once to minimize unnecessary
-          // calls to string decoding functions
-          if (code === 37/*%*/) {
-            encodeCheck = 1;
-          } else if (encodeCheck > 0 &&
-                     ((code >= 48/*0*/ && code <= 57/*9*/) ||
-                      (code >= 65/*A*/ && code <= 70/*F*/) ||
-                      (code >= 97/*a*/ && code <= 102/*f*/))) {
+      }
+      if (code === 43/*+*/) {
+        if (lastPos < i)
+          value += qs.slice(lastPos, i);
+        value += plusChar;
+        lastPos = i + 1;
+      } else if (!valEncoded) {
+        // Try to match an (valid) encoded byte (once) to minimize unnecessary
+        // calls to string decoding functions
+        if (code === 37/*%*/) {
+          encodeCheck = 1;
+        } else if (encodeCheck > 0) {
+          // eslint-disable-next-line no-extra-boolean-cast
+          if (!!isHexTable[code]) {
             if (++encodeCheck === 3)
-              keyEncoded = true;
+              valEncoded = true;
           } else {
             encodeCheck = 0;
           }
         }
       }
     }
-
-    if (code === 43/*+*/) {
-      if (eqIdx < eqLen) {
-        if (lastPos < i)
-          key += qs.slice(lastPos, i);
-        key += '%20';
-        keyEncoded = true;
-      } else {
-        if (lastPos < i)
-          value += qs.slice(lastPos, i);
-        value += '%20';
-        valEncoded = true;
-      }
-      lastPos = i + 1;
-    }
   }
 
-  // Check if we have leftover key or value data
-  if (pairs !== 0 && (lastPos < qs.length || eqIdx > 0)) {
-    if (lastPos < qs.length) {
-      if (eqIdx < eqLen)
-        key += qs.slice(lastPos);
-      else if (sepIdx < sepLen)
-        value += qs.slice(lastPos);
-    }
-    if (keyEncoded)
-      key = decodeStr(key, decode);
-    if (valEncoded)
-      value = decodeStr(value, decode);
-    // Use a key array lookup instead of using hasOwnProperty(), which is
-    // slower
-    if (keys.indexOf(key) === -1) {
-      obj[key] = value;
-      keys[keys.length] = key;
-    } else {
-      const curValue = obj[key];
-      // A simple Array-specific property check is enough here to
-      // distinguish from a string value and is faster and still safe since
-      // we are generating all of the values being assigned.
-      if (curValue.pop)
-        curValue[curValue.length] = value;
-      else
-        obj[key] = [curValue, value];
-    }
+  // Deal with any leftover key or value data
+  if (lastPos < qs.length) {
+    if (eqIdx < eqLen)
+      key += qs.slice(lastPos);
+    else if (sepIdx < sepLen)
+      value += qs.slice(lastPos);
+  } else if (eqIdx === 0) {
+    // We ended on an empty substring
+    return obj;
+  }
+  if (keyEncoded)
+    key = decodeStr(key, decode);
+  if (valEncoded)
+    value = decodeStr(value, decode);
+  // Use a key array lookup instead of using hasOwnProperty(), which is slower
+  if (keys.indexOf(key) === -1) {
+    obj[key] = value;
+    keys[keys.length] = key;
+  } else {
+    const curValue = obj[key];
+    // A simple Array-specific property check is enough here to
+    // distinguish from a string value and is faster and still safe since
+    // we are generating all of the values being assigned.
+    if (curValue.pop)
+      curValue[curValue.length] = value;
+    else
+      obj[key] = [curValue, value];
   }
 
   return obj;
diff --git a/test/parallel/test-querystring.js b/test/parallel/test-querystring.js
index baa426094c77c6..04034377fea76d 100644
--- a/test/parallel/test-querystring.js
+++ b/test/parallel/test-querystring.js
@@ -59,6 +59,15 @@ const qsTestCases = [
   ['&&&&', '', {}],
   ['&=&', '=', { '': '' }],
   ['&=&=', '=&=', { '': [ '', '' ]}],
+  ['a&&b', 'a=&b=', { 'a': '', 'b': '' }],
+  ['a=a&&b=b', 'a=a&b=b', { 'a': 'a', 'b': 'b' }],
+  ['&a', 'a=', { 'a': '' }],
+  ['&=', '=', { '': '' }],
+  ['a&a&', 'a=&a=', { a: [ '', '' ] }],
+  ['a&a&a&', 'a=&a=&a=', { a: [ '', '', '' ] }],
+  ['a&a&a&a&', 'a=&a=&a=&a=', { a: [ '', '', '', '' ] }],
+  ['a=&a=value&a=', 'a=&a=value&a=', { a: [ '', 'value', '' ] }],
+  ['foo+bar=baz+quux', 'foo%20bar=baz%20quux', { 'foo bar': 'baz quux' }],
   [null, '', {}],
   [undefined, '', {}]
 ];