Add serialization and deserialization of numerals larger than `Number…

….MAX_SAFE_INTEGER` Signed-off-by: Miki <miki@amazon.com>
opensearch-project · Jul 12, 2023 · d697969 · d697969
1 parent 4e176cd
commit d697969
Show file tree

Hide file tree

Showing 5 changed files with 361 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ## [Unreleased]
 
 ### Added
+- Add serialization and deserialization of numerals larger than `Number.MAX_SAFE_INTEGER` ([#544](https://github.com/opensearch-project/opensearch-js/pull/544))
 ### Dependencies
 - Bumps `prettier` from 2.8.7 to 2.8.8
 - Bumps `ora` from 6.1.2 to 6.3.1

diff --git a/lib/Serializer.js b/lib/Serializer.js
@@ -35,6 +35,60 @@ const sjson = require('secure-json-parse');
 const { SerializationError, DeserializationError } = require('./errors');
 const kJsonOptions = Symbol('secure json parse options');
 
+/* In JavaScript, a `Number` is a 64-bit floating-point value which can store 16 digits. However, the
+ * serializer and deserializer will need to cater to numeric values generated by other languages which
+ * can have up to 19 digits. Native JSON parser and stringifier, incapable of handling the extra
+ * digits, corrupt the values, making them unusable.
+ *
+ * To work around this limitation, the deserializer converts long sequences of digits into strings and
+ * marks them before applying the parser. During the parsing, string values that begin with the mark
+ * are converted to `BigInt` values.
+ * Similarly, during stringification, the serializer converts `BigInt` values to marked strings and
+ * when done, it replaces them with plain numerals.
+ *
+ * `Number.MAX_SAFE_INTEGER`, 9,007,199,254,740,991, is the largest number that the native methods can
+ * parse and stringify, and any numeral greater than that would need to be translated using the
+ * workaround; all 17-digits or longer and only tail-end of the 16-digits need translation. It would
+ * be unfair to all the 16-digit numbers if the translation applied to `\d{16,}` only to cover the
+ * less than 10%. Hence, a RegExp is created to only match numerals too long to be a number.
+ *
+ * To make the explanation simpler, let's assume that MAX_SAFE_INTEGER is 8921 which has 4 digits.
+ * Starting from the right, we take each digit onwards, `[<start>-9]`:
+ *    1) 7922 - 7929: 792[2-9]\d{0}
+ *    2) 7930 - 7999: 79[3-9]\d{1}
+ *    9) 9 + 1 = 10 which results in a rollover; no need to do anything.
+ *    8) 9000 - 9999: [9-9]\d{3}
+ *    Finally we add anything 5 digits or longer: `\d{5,}
+ *
+ * PS, a better solution would use AST but considering its performance penalty, RegExp is the next
+ * the best solution.
+ */
+const isBigIntSupported = typeof BigInt !== 'undefined';
+const maxIntAsString = String(Number.MAX_SAFE_INTEGER);
+const maxIntLength = maxIntAsString.length;
+// Sub-patterns for each digit
+const bigIntMatcherTokens = [`\\d{${maxIntAsString.length + 1},}`];
+for (let i = 0; i < maxIntLength; i++) {
+  if (maxIntAsString[i] !== '9') {
+    bigIntMatcherTokens.push(
+      maxIntAsString.substring(0, i) +
+        `[${parseInt(maxIntAsString[i], 10) + 1}-9]` +
+        `\\d{${maxIntLength - i - 1}}`
+    );
+  }
+}
+
+/* The matcher that looks for `": <numerals>, ...}` and `[..., <numeral>, ...]`
+ *
+ * The pattern starts by looking for `":` not immediately preceded by a `\`. That should be
+ * followed by any of the numeric sub-patterns. A comma, end of an array, end of an object, or
+ * the end of the input are the only acceptable elements after it.
+ */
+const bigIntMatcher = new RegExp(
+  `((?:\\[|,|(?<!\\\\)"\\s*:)\\s*)(-?(?:${bigIntMatcherTokens.join('|')}))(\\s*)(?=,|}|]|$)`,
+  'g'
+);
+
 class Serializer {
   constructor(opts = {}) {
     const disable = opts.disablePrototypePoisoningProtection;
@@ -44,11 +98,173 @@ class Serializer {
     };
   }
 
+  /* The characters with a highly unlikely chance of occurrence in strings, alone or in combination.
+   *
+   * ToDo: When support for ancient versions of Node.js are dropped, replace with
+   * _bigIntMarkChars = ['෴', '߷', '֍'];
+   */
+  get _bigIntMarkChars() {
+    return ['෴', '߷', '֍'];
+  }
+
+  /* Generates an array of all combinations of `_bigIntMarkChars` with the requested length. */
+  _bigIntMarkerCombinations(length = 3) {
+    const results = [];
+    const arr = this._bigIntMarkChars;
+    const arrLength = arr.length;
+    const temp = Array(length);
+
+    (function fill(pos, start) {
+      if (pos === length) return results.push(temp.join(''));
+
+      for (let i = start; i < arrLength; i++) {
+        temp[pos] = arr[i];
+        fill(pos + 1, i);
+      }
+    })(0, 0);
+
+    return results;
+  }
+
+  /* Experiments with different combinations of various lengths, until one is found to not be in
+   * the input string.
+   */
+  _getSuitableBigIntMarker(json) {
+    let bigIntMarker;
+    let length = 0;
+    do {
+      length++;
+      this._bigIntMarkerCombinations(length).some((marker) => {
+        if (json.indexOf(marker) === -1) {
+          bigIntMarker = marker;
+          return true;
+        }
+      });
+    } while (!bigIntMarker);
+
+    return {
+      bigIntMarker,
+      length,
+    };
+  }
+
+  _parseWithBigInt(json) {
+    const { bigIntMarker, length } = this._getSuitableBigIntMarker(json);
+
+    let hadException;
+    let markedJSON = json.replace(bigIntMatcher, `$1"${bigIntMarker}$2"$3`);
+
+    /* RegExp cannot replace AST and the process of marking adds quotes. So, any false-positive hit
+     * will make the JSON string unparseable.
+     *
+     * To find those instances, we try to parse and watch for the location of any errors. If an error
+     * is caused by the marking, we remove that single marking and try again.
+     */
+    do {
+      try {
+        hadException = false;
+        JSON.parse(markedJSON);
+      } catch (e) {
+        hadException = true;
+        /* There are two types of exception objects that can be raised:
+         *  1) a proper object with lineNumber and columnNumber which we can use
+         *  2) a textual message with the position that we need to parse
+         */
+        let { lineNumber, columnNumber } = e;
+        if (!lineNumber || !columnNumber) {
+          const match =
+            // ToDo: When support for ancient versions of Node.js are dropped, replace with
+            // e?.message?.match?.()
+            e &&
+            e.message &&
+            typeof e.message.match === 'function' &&
+            e.message.match(/^Unexpected token.*at position (\d+)$/);
+          if (match) {
+            lineNumber = 1;
+            // The position is zero-indexed; adding 1 to normalize it for the -2 that comes later
+            columnNumber = parseInt(match[1], 10) + 1;
+          }
+        }
+
+        if (lineNumber < 1 || columnNumber < 2) {
+          // The problem is not with this replacement; just return a failure.
+          return;
+        }
+
+        /* We need to skip e.lineNumber - 1 number of `\n` occurrences.
+         * Then, we need to go to e.columnNumber - 2 to look for `"<mark>\d+"`; we need to `-1` to
+         * account for the quote but an additional `-1` is needed because columnNumber starts from 1.
+         */
+        const re = new RegExp(
+          `^((?:.*\\n){${lineNumber - 1}}[^\\n]{${columnNumber - 2}})"${bigIntMarker}(-?\\d+)"`
+        );
+        if (!re.test(markedJSON)) {
+          // The exception is not caused by adding the marker
+          return;
+        }
+
+        // We have found a bad replacement; let's remove it.
+        markedJSON = markedJSON.replace(re, '$1$2');
+      }
+    } while (hadException);
+
+    const bigIntMarkFinder = new RegExp(`^${bigIntMarker}-?\\d+$`);
+
+    // Exceptions will trickle up to the caller
+    return sjson.parse(
+      markedJSON,
+      (key, val) =>
+        /* Convert marked values to BigInt values.
+         * The `startsWith` is purely for performance, to avoid running `test` if not needed.
+         */
+        typeof val === 'string' && val.startsWith(bigIntMarker) && bigIntMarkFinder.test(val)
+          ? BigInt(val.substring(length)) // eslint-disable-line no-undef
+          : val,
+      this[kJsonOptions]
+    );
+  }
+
+  _stringifyWithBigInt(object, candidate) {
+    const { bigIntMarker } = this._getSuitableBigIntMarker(candidate);
+
+    /* The matcher that looks for "<marker><numerals>"
+     * Because we have made sure that `bigIntMarker` was never present in the original object, we can
+     * carelessly assume every "<marker><numerals>" is due to our marking.
+     */
+    const markedBigIntMatcher = new RegExp(`"${bigIntMarker}(-?\\d+)"`, 'g');
+
+    return (
+      JSON.stringify(
+        object,
+        /* Convert BigInt values to a string and mark them.
+         * Can't be bothered with Number values beyond safe values because they are already corrupted.
+         */
+        (key, val) => (typeof val === 'bigint' ? `${bigIntMarker}${val.toString()}` : val)
+      )
+        // Replace marked substrings with just the numerals
+        .replace(markedBigIntMatcher, '$1')
+    );
+  }
+
   serialize(object) {
     debug('Serializing', object);
     let json;
+    let numeralsAreNumbers = true;
+    const checkForBigInts = (key, val) => {
+      if (typeof val === 'bigint') {
+        numeralsAreNumbers = false;
+        // Number() is much faster than parseInt() on BigInt values
+        return Number(val);
+      }
+      return val;
+    };
     try {
-      json = JSON.stringify(object);
+      json = JSON.stringify(object, isBigIntSupported ? checkForBigInts : null);
+
+      if (isBigIntSupported && !numeralsAreNumbers) {
+        const temp = this._stringifyWithBigInt(object, json);
+        if (temp) json = temp;
+      }
     } catch (err) {
       throw new SerializationError(err.message, object);
     }
@@ -58,8 +274,31 @@ class Serializer {
   deserialize(json) {
     debug('Deserializing', json);
     let object;
+    let numeralsAreNumbers = true;
+    const checkForLargeNumerals = (key, val) => {
+      if (
+        numeralsAreNumbers &&
+        typeof val === 'number' &&
+        (val < Number.MAX_SAFE_INTEGER || val > Number.MAX_SAFE_INTEGER)
+      ) {
+        numeralsAreNumbers = false;
+      }
+
+      return val;
+    };
     try {
-      object = sjson.parse(json, this[kJsonOptions]);
+      object = sjson.parse(
+        json,
+        isBigIntSupported ? checkForLargeNumerals : null,
+        this[kJsonOptions]
+      );
+
+      if (isBigIntSupported && !numeralsAreNumbers) {
+        const temp = this._parseWithBigInt(json);
+        if (temp) {
+          object = temp;
+        }
+      }
     } catch (err) {
       throw new DeserializationError(err.message, json);
     }

diff --git a/test/fixtures/longnumerals-dataset.ndjson b/test/fixtures/longnumerals-dataset.ndjson
@@ -0,0 +1,3 @@
+{"number":18014398509481982,"description":"-18014398509481982 , -1 , 1 , 18014398509481982"}
+{"number":-18014398509481982,"description":"෴18014398509481982"}
+{"number":9007199254740891,"description":"Safer than [18014398509481982]"}
diff --git a/test/integration/serializer/longnumerals.test.js b/test/integration/serializer/longnumerals.test.js
@@ -0,0 +1,90 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ *
+ */
+
+/*
+ * Licensed to Elasticsearch B.V. under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch B.V. licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+'use strict';
+
+const { createReadStream } = require('fs');
+const { join } = require('path');
+const split = require('split2');
+const { test, beforeEach, afterEach } = require('tap');
+
+const { Client } = require('../../../');
+
+const INDEX = `test-serializer-${process.pid}`;
+const client = new Client({
+  node: process.env.TEST_OPENSEARCH_SERVER || 'http://localhost:9200',
+});
+
+beforeEach(async () => {
+  await client.indices.create({ index: INDEX });
+  const stream = createReadStream(
+    join(__dirname, '..', '..', 'fixtures', 'longnumerals-dataset.ndjson')
+  );
+  const result = await client.helpers.bulk({
+    datasource: stream.pipe(split()),
+    refreshOnCompletion: true,
+    onDocument() {
+      return {
+        index: { _index: INDEX },
+      };
+    },
+  });
+  if (result.failed > 0) {
+    throw new Error('Failed bulk indexing docs');
+  }
+});
+
+afterEach(async () => {
+  await client.indices.delete({ index: INDEX }, { ignore: 404 });
+});
+
+test('long numerals', async (t) => {
+  const results = await client.helpers.search({
+    index: INDEX,
+    body: {
+      query: {
+        range: {
+          number: {
+            lt: 999999999999999999n,
+          },
+        },
+      },
+    },
+  });
+  t.equal(results.length, 3);
+  const object = {};
+  for (const result of results) {
+    object[result.description] = result.number;
+  }
+  t.same(object, {
+    '-18014398509481982 , -1 , 1 , 18014398509481982': 18014398509481982n,
+    '෴18014398509481982': -18014398509481982n,
+    'Safer than [18014398509481982]': 9007199254740891,
+  });
+});