From 6dce811105e2d417234034d4e0baccd9376479ef Mon Sep 17 00:00:00 2001 From: streamich Date: Sun, 6 Oct 2024 14:33:40 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20add=20json-size=20implem?= =?UTF-8?q?entation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/json-size/README.md | 38 ++++++++ src/json-size/__bench__/json-size.ts | 80 ++++++++++++++++ src/json-size/__tests__/fuzz.spec.ts | 16 ++++ src/json-size/__tests__/json.spec.ts | 10 ++ src/json-size/__tests__/jsonSizeFast.spec.ts | 61 ++++++++++++ .../__tests__/maxEncodingCapacity.spec.ts | 47 ++++++++++ src/json-size/__tests__/testJsonSize.ts | 66 +++++++++++++ src/json-size/index.ts | 3 + src/json-size/json.ts | 94 +++++++++++++++++++ src/json-size/jsonSizeFast.ts | 56 +++++++++++ src/json-size/maxEncodingCapacity.ts | 55 +++++++++++ 11 files changed, 526 insertions(+) create mode 100644 src/json-size/README.md create mode 100644 src/json-size/__bench__/json-size.ts create mode 100644 src/json-size/__tests__/fuzz.spec.ts create mode 100644 src/json-size/__tests__/json.spec.ts create mode 100644 src/json-size/__tests__/jsonSizeFast.spec.ts create mode 100644 src/json-size/__tests__/maxEncodingCapacity.spec.ts create mode 100644 src/json-size/__tests__/testJsonSize.ts create mode 100644 src/json-size/index.ts create mode 100644 src/json-size/json.ts create mode 100644 src/json-size/jsonSizeFast.ts create mode 100644 src/json-size/maxEncodingCapacity.ts diff --git a/src/json-size/README.md b/src/json-size/README.md new file mode 100644 index 0000000..1868202 --- /dev/null +++ b/src/json-size/README.md @@ -0,0 +1,38 @@ +# `json-size` + +This library implements methods to calculate the size of JSON objects. +It calculates the number of bytes necessary to store the final serialized JSON +in UTF-8 encoding. 
+ +## Usage + +```ts +import {jsonSize} from 'json-joy/{lib,es6}/json-size'; + +jsonSize({1: 2, foo: 'bar'}); // 19 +``` + +## Reference + +- `jsonSize` — calculates exact JSON size, as `JSON.stringify()` would return. +- `jsonSizeApprox` — a faster version, which uses string nominal length for calculation. +- `jsonSizeFast` — the fastest version, which uses nominal values for all JSON types. See + source code for description. +- `msgpackSizeFast` — same as `jsonSizeFast`, but for MessagePack values. In addition + to regular JSON values it also supports binary data (by `Buffer` or `Uint8Array`), + `JsonPackExtension`, and `JsonPackValue`. + +## Performance + +In most cases `json-size` will be faster than `JSON.stringify`. + +``` +node benchmarks/json-size.js +json-joy/json-size jsonSize() x 377,980 ops/sec ±0.12% (100 runs sampled), 2646 ns/op +json-joy/json-size jsonSizeApprox() x 377,841 ops/sec ±0.09% (98 runs sampled), 2647 ns/op +json-joy/json-size jsonSizeFast() x 2,229,344 ops/sec ±0.30% (101 runs sampled), 449 ns/op +json-joy/json-size msgpackSizeFast() x 1,260,284 ops/sec ±0.10% (96 runs sampled), 793 ns/op +JSON.stringify x 349,696 ops/sec ±0.08% (100 runs sampled), 2860 ns/op +JSON.stringify + utf8Count x 182,977 ops/sec ±0.10% (100 runs sampled), 5465 ns/op +Fastest is json-joy/json-size jsonSizeFast() +``` diff --git a/src/json-size/__bench__/json-size.ts b/src/json-size/__bench__/json-size.ts new file mode 100644 index 0000000..845a2d0 --- /dev/null +++ b/src/json-size/__bench__/json-size.ts @@ -0,0 +1,80 @@ +/* tslint:disable no-console */ + +// npx ts-node src/json-size/__bench__/json-size.ts + +import * as Benchmark from 'benchmark'; +import {utf8Size} from '@jsonjoy.com/util/lib/strings/utf8'; +import {jsonSize, jsonSizeApprox} from '../json'; +import {jsonSizeFast} from '../jsonSizeFast'; +import {msgpackSizeFast} from '../msgpackSizeFast'; + +const json = [ + {op: 'add', path: '/foo/baz', value: 666}, + {op: 'add', path: '/foo/bx', value: 666}, 
+ {op: 'add', path: '/asdf', value: 'asdfadf asdf'}, + {op: 'move', path: '/arr/0', from: '/arr/1'}, + {op: 'replace', path: '/foo/baz', value: 'lorem ipsum'}, + { + op: 'add', + path: '/docs/latest', + value: { + name: 'blog post', + json: { + id: 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx', + longString: + 'lorem ipsum dolorem, alamorem colomorem, ipsum pipsum, lorem ipsum dolorem, alamorem colomorem, ipsum pipsum, lorem ipsum dolorem, alamorem colomorem, ipsum pipsum, lorem ipsum dolorem, alamorem colomorem, ipsum pipsum, lorem ipsum dolorem, alamorem colomorem, ipsum pipsum', + author: { + name: 'John 💪', + handle: '@johny', + }, + lastSeen: -12345, + tags: [null, 'Sports 🏀', 'Personal', 'Travel'], + pins: [ + { + id: 1239494, + }, + ], + marks: [ + { + x: 1, + y: 1.234545, + w: 0.23494, + h: 0, + }, + ], + hasRetweets: false, + approved: true, + '👍': 33, + }, + }, + }, +]; + +const suite = new Benchmark.Suite(); + +suite + .add(`json-joy/json-size jsonSize()`, () => { + jsonSize(json); + }) + .add(`json-joy/json-size jsonSizeApprox()`, () => { + jsonSizeApprox(json); + }) + .add(`json-joy/json-size jsonSizeFast()`, () => { + jsonSizeFast(json); + }) + .add(`json-joy/json-size msgpackSizeFast()`, () => { + msgpackSizeFast(json); + }) + .add(`JSON.stringify`, () => { + JSON.stringify(json).length; + }) + .add(`JSON.stringify + utf8Count`, () => { + utf8Size(JSON.stringify(json)); + }) + .on('cycle', (event: any) => { + console.log(String(event.target) + `, ${Math.round(1000000000 / event.target.hz)} ns/op`); + }) + .on('complete', () => { + console.log('Fastest is ' + suite.filter('fastest').map('name')); + }) + .run(); diff --git a/src/json-size/__tests__/fuzz.spec.ts b/src/json-size/__tests__/fuzz.spec.ts new file mode 100644 index 0000000..310e527 --- /dev/null +++ b/src/json-size/__tests__/fuzz.spec.ts @@ -0,0 +1,16 @@ +import {jsonSize} from '..'; +import {RandomJson} from '../../json-random/RandomJson'; +import {utf8Size} from '../../strings/utf8'; + +const 
random = new RandomJson(); +const iterations = 100; + +for (let i = 0; i < iterations; i++) { + test(`calculates json size - ${i + 1}`, () => { + const json = random.create(); + // console.log(json); + const size1 = jsonSize(json); + const size2 = utf8Size(JSON.stringify(json)); + expect(size1).toBe(size2); + }); +} diff --git a/src/json-size/__tests__/json.spec.ts b/src/json-size/__tests__/json.spec.ts new file mode 100644 index 0000000..9382295 --- /dev/null +++ b/src/json-size/__tests__/json.spec.ts @@ -0,0 +1,10 @@ +import {jsonSize, jsonSizeApprox} from '../json'; +import {testJsonSize} from './testJsonSize'; + +describe('jsonSize', () => { + testJsonSize(jsonSize); +}); + +describe('jsonSizeApprox', () => { + testJsonSize(jsonSizeApprox, {simpleStringsOnly: true}); +}); diff --git a/src/json-size/__tests__/jsonSizeFast.spec.ts b/src/json-size/__tests__/jsonSizeFast.spec.ts new file mode 100644 index 0000000..784427a --- /dev/null +++ b/src/json-size/__tests__/jsonSizeFast.spec.ts @@ -0,0 +1,61 @@ +import {jsonSizeFast} from '../jsonSizeFast'; + +test('computes size of single values', () => { + expect(jsonSizeFast(null)).toBe(1); + expect(jsonSizeFast(true)).toBe(1); + expect(jsonSizeFast(false)).toBe(1); + expect(jsonSizeFast(1)).toBe(9); + expect(jsonSizeFast(1.1)).toBe(9); + expect(jsonSizeFast('123')).toBe(7); + expect(jsonSizeFast('')).toBe(4); + expect(jsonSizeFast('A')).toBe(5); + expect(jsonSizeFast([])).toBe(2); + expect(jsonSizeFast({})).toBe(2); +}); + +test('computes size complex object', () => { + // prettier-ignore + const json = { // 2 + a: 1, // 2 + 1 + 9 + b: true, // 2 + 1 + 1 + c: false, // 2 + 1 + 1 + d: null, // 2 + 1 + 1 + 'e.e': 2.2, // 2 + 3 + 9 + f: '', // 2 + 1 + 4 + 0 + g: 'asdf', // 2 + 1 + 4 + 4 + h: {}, // 2 + 1 + 2 + i: [ // 2 + 1 + 2 + 1, // 9 + true, // 1 + false, // 1 + null, // 1 + 2.2, // 9 + '', // 4 + 0 + 'asdf', // 4 + 4 + {}, // 2 + ], + }; + const size = jsonSizeFast(json); + + // prettier-ignore + expect(size).toBe( + 
2 + + 2 + 1 + 9 + + 2 + 1 + 1 + + 2 + 1 + 1 + + 2 + 1 + 1 + + 2 + 3 + 9 + + 2 + 1 + 4 + 0 + + 2 + 1 + 4 + 4 + + 2 + 1 + 2 + + 2 + 1 + 2 + + 9 + + 1 + + 1 + + 1 + + 9 + + 4 + 0 + + 4 + 4 + + 2 + ); +}); diff --git a/src/json-size/__tests__/maxEncodingCapacity.spec.ts b/src/json-size/__tests__/maxEncodingCapacity.spec.ts new file mode 100644 index 0000000..19344b0 --- /dev/null +++ b/src/json-size/__tests__/maxEncodingCapacity.spec.ts @@ -0,0 +1,47 @@ +import {maxEncodingCapacity} from '../maxEncodingCapacity'; + +test('computes size of single values', () => { + expect(maxEncodingCapacity(null)).toBe(4); + expect(maxEncodingCapacity(true)).toBe(5); + expect(maxEncodingCapacity(false)).toBe(5); + expect(maxEncodingCapacity(1)).toBe(22); + expect(maxEncodingCapacity(1.1)).toBe(22); + expect(maxEncodingCapacity('123')).toBe(20); + expect(maxEncodingCapacity('')).toBe(5); + expect(maxEncodingCapacity('A')).toBe(10); + expect(maxEncodingCapacity([])).toBe(5); + expect(maxEncodingCapacity({})).toBe(5); + expect(maxEncodingCapacity({foo: 1})).toBe(49); + expect(maxEncodingCapacity({foo: [1]})).toBe(55); +}); + +test('a larger value', () => { + expect( + maxEncodingCapacity({ + name: 'cooking receipt', + json: { + id: '0001', + type: 'donut', + name: 'Cake', + ppu: 0.55, + batters: { + batter: [ + {id: '1001', type: 'Regular'}, + {id: '1002', type: 'Chocolate'}, + {id: '1003', type: 'Blueberry'}, + {id: '1004', type: "Devil's Food"}, + ], + }, + topping: [ + {id: '5001', type: 'None'}, + {id: '5002', type: 'Glazed'}, + {id: '5005', type: 'Sugar'}, + {id: '5007', type: 'Powdered Sugar'}, + {id: '5006', type: 'Chocolate with Sprinkles'}, + {id: '5003', type: 'Chocolate'}, + {id: '5004', type: 'Maple'}, + ], + }, + }), + ).toBe(1875); +}); diff --git a/src/json-size/__tests__/testJsonSize.ts b/src/json-size/__tests__/testJsonSize.ts new file mode 100644 index 0000000..1342410 --- /dev/null +++ b/src/json-size/__tests__/testJsonSize.ts @@ -0,0 +1,66 @@ +import {utf8Size} from 
'../../strings/utf8'; + +export const testJsonSize = ( + jsonSize: (val: unknown) => number, + {simpleStringsOnly = false}: {simpleStringsOnly?: boolean} = {}, +) => { + test('calculates null size', () => { + expect(jsonSize(null)).toBe(4); + }); + + test('calculates boolean sizes', () => { + expect(jsonSize(true)).toBe(4); + expect(jsonSize(false)).toBe(5); + }); + + test('calculates number sizes', () => { + expect(jsonSize(1)).toBe(1); + expect(jsonSize(1.1)).toBe(3); + expect(jsonSize(0)).toBe(1); + expect(jsonSize(1.123)).toBe(5); + expect(jsonSize(-1.123)).toBe(6); + }); + + if (!simpleStringsOnly) { + test('calculates string sizes', () => { + expect(jsonSize('')).toBe(2); + expect(jsonSize('a')).toBe(3); + expect(jsonSize('abc')).toBe(5); + expect(jsonSize('👨‍👩‍👦‍👦')).toBe(27); + expect(jsonSize('büro')).toBe(7); + expect(jsonSize('office')).toBe(8); + }); + } + + if (!simpleStringsOnly) { + test('calculates string sizes with escaped characters', () => { + expect(jsonSize('\\')).toBe(4); + expect(jsonSize('"')).toBe(4); + expect(jsonSize('\b')).toBe(4); + expect(jsonSize('\f')).toBe(4); + expect(jsonSize('\n')).toBe(4); + expect(jsonSize('\r')).toBe(4); + expect(jsonSize('\t')).toBe(4); + }); + } + + test('calculates array sizes', () => { + expect(jsonSize([])).toBe(2); + expect(jsonSize([1])).toBe(3); + expect(jsonSize([1, 2, 3])).toBe(7); + expect(jsonSize([1, 'büro', 3])).toBe(13); + }); + + test('calculates object sizes', () => { + expect(jsonSize({})).toBe(2); + expect(jsonSize({a: 1})).toBe(2 + 3 + 1 + 1); + expect(jsonSize({1: 2, foo: 'bar'})).toBe(2 + 3 + 1 + 1 + 1 + 5 + 1 + 5); + }); + + test('calculates size of array of length 2 that begins with empty string', () => { + const json = ['', -1]; + const size1 = jsonSize(json); + const size2 = utf8Size(JSON.stringify(json)); + expect(size1).toBe(size2); + }); +}; diff --git a/src/json-size/index.ts b/src/json-size/index.ts new file mode 100644 index 0000000..4613fbc --- /dev/null +++ 
// -----------------------------------------------------------------------------
// json-size implementation: the contents of `json.ts`, `jsonSizeFast.ts` and
// `maxEncodingCapacity.ts`, i.e. everything the `index.ts` barrel re-exports.
// -----------------------------------------------------------------------------

/**
 * Shared reference to `Object.prototype.hasOwnProperty`. Calling it through
 * `.call()` keeps key iteration safe for objects that shadow `hasOwnProperty`
 * with their own property (e.g. `JSON.parse('{"hasOwnProperty":1}')`) and for
 * objects created with `Object.create(null)`.
 */
const hasOwn = Object.prototype.hasOwnProperty;

// ---------------------------------------------------------------- json.ts ----

/**
 * Number of characters `JSON.stringify()` emits for a number.
 *
 * Safe integers are measured by counting decimal digits against exact powers
 * of ten, which avoids both string allocation and the rounding error of
 * `Math.log10()` near powers of ten. Every other value (fractions, integers
 * outside the safe range, which may print in exponent notation, `NaN` and
 * `±Infinity`, which print as `null`) is measured by serializing it.
 *
 * @param num Number to measure.
 * @returns Length of the JSON text of `num`.
 */
const numberSize = (num: number): number => {
  if (Number.isSafeInteger(num)) {
    const abs = num < 0 ? -num : num;
    let digits = 1;
    // Powers of ten up to 1e16 are exactly representable, so this comparison
    // chain is exact for every safe integer.
    for (let limit = 10; abs >= limit; limit *= 10) digits++;
    return num < 0 ? digits + 1 : digits;
  }
  return JSON.stringify(num).length;
};

/**
 * Number of UTF-8 bytes `JSON.stringify()` emits for a string, including the
 * two surrounding quotes.
 *
 * Per UTF-16 code unit:
 *
 * - `"` and `\` and the short escapes `\b \t \n \f \r`: 2 bytes.
 * - Any other control character below U+0020: 6 bytes (`\u00XX`).
 * - Remaining ASCII: 1 byte.
 * - U+0080..U+07FF: 2 bytes; other non-surrogate BMP code units: 3 bytes.
 * - A valid surrogate pair: 4 bytes for the pair.
 * - A lone surrogate: 6 bytes, as well-formed `JSON.stringify()` escapes it
 *   as `\uDXXX`.
 *
 * @param str String to measure.
 * @returns Size in bytes of the serialized string.
 */
const stringSize = (str: string): number => {
  const length = str.length;
  let size = 2; // Opening and closing quotes.
  for (let pos = 0; pos < length; pos++) {
    const code = str.charCodeAt(pos);
    if (code < 0x80) {
      if (code < 0x20) {
        switch (code) {
          case 8: // \b
          case 9: // \t
          case 10: // \n
          case 12: // \f
          case 13: // \r
            size += 2;
            break;
          default: // \u00XX
            size += 6;
        }
      } else if (code === 34 || code === 92) size += 2; // \" and \\
      else size += 1;
    } else if (code < 0x800) size += 2;
    else if (code < 0xd800 || code > 0xdfff) size += 3;
    else if (code < 0xdc00 && pos + 1 < length) {
      // High surrogate: it forms a pair only when a low surrogate follows.
      const next = str.charCodeAt(pos + 1);
      if (next >= 0xdc00 && next <= 0xdfff) {
        size += 4;
        pos++;
      } else size += 6;
    } else size += 6; // Unpaired low surrogate, or high surrogate at the end.
  }
  return size;
};

/** Length of the literal `true` (4) or `false` (5). */
const booleanSize = (bool: boolean): number => (bool ? 4 : 5);

/**
 * Size of a serialized array: two brackets, one comma between each pair of
 * neighbouring elements, plus the exact size of every element.
 */
const arraySize = (arr: unknown[]): number => {
  const length = arr.length;
  let size = 2 + (length > 1 ? length - 1 : 0);
  for (let i = 0; i < length; i++) size += jsonSize(arr[i]);
  return size;
};

/**
 * Size of a serialized object: two braces, a quoted key, a colon and the exact
 * value size for every own enumerable key, plus a comma between members.
 */
const objectSize = (obj: Record<string, unknown>): number => {
  let size = 2;
  let members = 0;
  for (const key in obj)
    if (hasOwn.call(obj, key)) {
      members++;
      size += stringSize(key) + 1 + jsonSize(obj[key]);
    }
  return size + (members > 1 ? members - 1 : 0);
};

/**
 * Computes the exact size in bytes of the UTF-8 encoded JSON text that
 * `JSON.stringify()` would output for a JSON value, without building that text.
 *
 * @param value JSON value (null, boolean, number, string, array or plain
 *     object) to measure.
 * @returns Size in bytes of the serialized JSON value.
 */
export const jsonSize = (value: unknown): number => {
  if (value === null) return 4;
  switch (typeof value) {
    case 'number':
      return numberSize(value);
    case 'string':
      return stringSize(value);
    case 'boolean':
      return booleanSize(value);
  }
  if (Array.isArray(value)) return arraySize(value);
  return objectSize(value as Record<string, unknown>);
};

/**
 * Same as `jsonSize`, except that a top-level string is approximated by its
 * `.length` property (no quotes, escapes or multi-byte characters counted).
 * Arrays and objects are delegated to the same helpers `jsonSize` uses, so
 * strings nested inside them are still measured exactly.
 *
 * @param value JSON value to approximate the size of.
 * @returns Approximate size in bytes of the serialized JSON value.
 */
export const jsonSizeApprox = (value: unknown): number => {
  if (value === null) return 4;
  switch (typeof value) {
    case 'number':
      return numberSize(value);
    case 'string':
      return value.length;
    case 'boolean':
      return booleanSize(value);
  }
  if (Array.isArray(value)) return arraySize(value);
  return objectSize(value as Record<string, unknown>);
};

// -------------------------------------------------------- jsonSizeFast.ts ----

/**
 * Fastest, purely nominal estimate of the size of a JSON value in bytes. No
 * value is serialized or inspected beyond its type and length.
 *
 * Heuristics:
 *
 * - Boolean: 1 byte.
 * - Null: 1 byte.
 * - Number: 9 bytes (1 byte for the type, 8 bytes for the number).
 * - String: 4 bytes + `.length`. The length is counted in UTF-16 code units,
 *   so strings with non-ASCII characters can need more bytes than counted.
 * - Array: 2 bytes + sum of sizes of elements.
 * - Object: 2 bytes + 2 bytes for each key + length of each key + sum of
 *   sizes of values.
 *
 * Rationale: the constants are modelled on a MessagePack-style binary
 * encoding. Booleans and `null` fit in one byte, a number needs at most a type
 * byte plus 8 payload bytes, the 4 bytes per string leave room for a length
 * header, and the 2 bytes per array, object and key assume lengths that fit
 * in 16 bits (fewer than 65,536 entries or characters).
 *
 * @param value JSON value to calculate approximate size of.
 * @returns Approximate number of bytes required to store the JSON value.
 */
export const jsonSizeFast = (value: unknown): number => {
  if (value === null) return 1;
  switch (typeof value) {
    case 'number':
      return 9;
    case 'string':
      return 4 + value.length;
    case 'boolean':
      return 1;
  }
  let size = 2; // Container header, shared by arrays and objects.
  if (Array.isArray(value)) {
    for (let i = value.length - 1; i >= 0; i--) size += jsonSizeFast(value[i]);
    return size;
  }
  const obj = value as Record<string, unknown>;
  for (const key in obj) if (hasOwn.call(obj, key)) size += 2 + key.length + jsonSizeFast(obj[key]);
  return size;
};

// ------------------------------------------------- maxEncodingCapacity.ts ----

/**
 * Per-type constants used by `maxEncodingCapacity` to compute an upper bound
 * of the encoded size of a value.
 */
export const enum MaxEncodingOverhead {
  Null = 4, // Literal "null"
  Boolean = 5, // Literal "false"
  Number = 22, // Literal "1.1111111111111111e+21" = JSON.stringify(1111111111111111111112)
  String = 1 + 4, // As per TLV: 1 byte for type, 4 bytes for length.
  StringLengthMultiplier = 5, // 4x UTF-8 overhead + 1.3x Base64 overhead, plus, 1 byte for each non-ASCII character.
  Binary = 2 + 37 + 2, // 2 for two quotes, 37 for "data:application/octet-stream;base64," literal, 2 bytes for Base64 padding.
  BinaryLengthMultiplier = 2, // 1.3x Base64 overhead.
  Array = 1 + 4, // As per TLV: 1 byte for type, 4 bytes for length.
  ArrayElement = 1, // Separator "," literal.
  Object = 1 + 4, // As per TLV: 1 byte for type, 4 bytes for length.
  ObjectElement = 1 + 1, // 1 byte for Key-value separator ":" literal, and 1 byte for separator "," literal.
  Undefined = Binary + BinaryLengthMultiplier * 2,
}

/**
 * Computes an upper bound, in bytes, of the buffer capacity needed to encode
 * a value, by summing the `MaxEncodingOverhead` constants for each node.
 *
 * Only values whose `constructor` is exactly `Array`, `Uint8Array` or `Object`
 * are traversed; any other object, and any type without its own case, is
 * charged `MaxEncodingOverhead.Undefined`. A `bigint` is charged the same as a
 * number.
 *
 * @param value Value to compute the maximum encoding capacity for.
 * @returns Upper bound of the encoded size in bytes.
 */
export const maxEncodingCapacity = (value: unknown): number => {
  switch (typeof value) {
    case 'number':
      return MaxEncodingOverhead.Number;
    case 'string':
      return MaxEncodingOverhead.String + value.length * MaxEncodingOverhead.StringLengthMultiplier;
    case 'boolean':
      return MaxEncodingOverhead.Boolean;
    case 'object': {
      if (!value) return MaxEncodingOverhead.Null;
      const constructor = value.constructor;
      switch (constructor) {
        case Array: {
          const arr = value as unknown[];
          const length = arr.length;
          let size = MaxEncodingOverhead.Array + length * MaxEncodingOverhead.ArrayElement;
          for (let i = length - 1; i >= 0; i--) size += maxEncodingCapacity(arr[i]);
          return size;
        }
        case Uint8Array: {
          return MaxEncodingOverhead.Binary + (value as Uint8Array).length * MaxEncodingOverhead.BinaryLengthMultiplier;
        }
        case Object: {
          let size: number = MaxEncodingOverhead.Object;
          const obj = value as Record<string, unknown>;
          for (const key in obj)
            if (hasOwn.call(obj, key))
              size += MaxEncodingOverhead.ObjectElement + maxEncodingCapacity(key) + maxEncodingCapacity(obj[key]);
          return size;
        }
        default:
          return MaxEncodingOverhead.Undefined;
      }
    }
    case 'bigint':
      return MaxEncodingOverhead.Number;
    default:
      return MaxEncodingOverhead.Undefined;
  }
};