From 5bf989bf7dde0a6124fb1605f3caf899273c6d4e Mon Sep 17 00:00:00 2001
From: Rod Vagg <rod@vagg.org>
Date: Mon, 11 Mar 2024 12:13:47 +1100
Subject: [PATCH] docs: add byte strings example using tags to retain
 typedarray types

Ref: https://github.com/rvagg/cborg/issues/69
---
 README.md              |   5 ++
 example-bytestrings.js | 180 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 example-bytestrings.js

diff --git a/README.md b/README.md
index c1d1d82..7218d99 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@
   * [Round-trip consistency](#round-trip-consistency)
 * [JSON mode](#json-mode)
   * [Example](#example-1)
+* [Advanced types and tags](#advanced-types-and-tags)
 * [License and Copyright](#license-and-copyright)
 
 ## Example
@@ -501,6 +502,10 @@ encoded: Uint8Array(34) [
 encoded (string): {"this":{"is":"JSON!","yay":true}}
 ```
 
+## Advanced types and tags
+
+As demonstrated above, the ability to provide custom `typeEncoders` to `encode()`, and `tags` and even a custom `tokenizer` to `decode()`, allows for quite a bit of flexibility in manipulating both the encode and decode processes. An advanced example that uses all of these features can be found in [example-bytestrings.js](./example-bytestrings.js), which demonstrates how one might implement [RFC 8746](https://www.rfc-editor.org/rfc/rfc8746.html) to allow typed arrays to round-trip through CBOR and retain their original types. Since cborg is designed to speak purely in terms of `Uint8Array`s, its default behaviour will squash all typed arrays down to their byte array forms and materialise them as plain `Uint8Array`s. Where round-trip fidelity is important and CBOR tags are acceptable, this form of usage is an option.
+
 ## License and Copyright
 
 Copyright 2020 Rod Vagg
diff --git a/example-bytestrings.js b/example-bytestrings.js
new file mode 100644
index 0000000..1b12527
--- /dev/null
+++ b/example-bytestrings.js
@@ -0,0 +1,180 @@
+/*
+RFC 8746 defines a set of tags to use for typed arrays. Out of the box, cborg doesn't care about
+tags and just squashes all concerns around byte arrays to Uint8Array with major type 2. This is
+fine for most use cases, but it is lossy: you can't round-trip and retain your original type.
+
+This example shows how to use cborg to round-trip a typed array with tags.
+
+https://www.rfc-editor.org/rfc/rfc8746.html
+*/
+
+import { encode, decode, Token, Tokenizer, Type } from 'cborg.js'
+
+const tagUint8Array = 64
+const tagUint64Array = 71
+// etc... see https://www.rfc-editor.org/rfc/rfc8746.html#name-iana-considerations
+
+/* ENCODERS */
+
+/**
+ * @param {any} obj value to encode; an Error is thrown unless it is a Uint8Array
+ * @returns {Token[]} a tag token (tagUint8Array) followed by a bytes token holding `obj`
+ */
+function uint8ArrayEncoder (obj) {
+  if (!(obj instanceof Uint8Array)) {
+    throw new Error('expected Uint8Array')
+  }
+  return [
+    new Token(Type.tag, tagUint8Array),
+    new Token(Type.bytes, obj)
+  ]
+}
+
+/**
+ * @param {any} obj value to encode; an Error is thrown unless it is a BigUint64Array
+ * @returns {Token[]} a tag token (tagUint64Array) followed by a bytes token of obj's raw bytes
+ */
+function uint64ArrayEncoder (obj) {
+  if (!(obj instanceof BigUint64Array)) {
+    throw new Error('expected BigUint64Array')
+  }
+  return [
+    new Token(Type.tag, tagUint64Array),
+    // View the BigUint64Array's bytes as a Uint8Array (no copy). byteOffset and byteLength are
+    // passed explicitly because obj may be a view onto only part of a larger ArrayBuffer.
+    new Token(Type.bytes, new Uint8Array(obj.buffer, obj.byteOffset, obj.byteLength))
+  ]
+}
+
+// etc... (encoders for the other typed array types would follow the same pattern)
+
+const typeEncoders = { // passed to encode() as its `typeEncoders` option, see ROUND-TRIP below
+  Uint8Array: uint8ArrayEncoder,
+  BigUint64Array: uint64ArrayEncoder
+}
+
+/* DECODERS */
+
+/**
+ * @param {ArrayBuffer} bytes tagged content; an Error is thrown unless it is an ArrayBuffer
+ * @returns {any} a Uint8Array view of `bytes`
+ */
+function uint8ArrayDecoder (bytes) {
+  if (!(bytes instanceof ArrayBuffer)) {
+    throw new Error('expected ArrayBuffer')
+  }
+  return new Uint8Array(bytes)
+}
+
+/**
+ * @param {ArrayBuffer} bytes tagged content; an Error is thrown unless it is an ArrayBuffer
+ * @returns {any} a BigUint64Array view of `bytes` (construction throws if byteLength % 8 !== 0)
+ */
+function uint64ArrayDecoder (bytes) {
+  if (!(bytes instanceof ArrayBuffer)) {
+    throw new Error('expected ArrayBuffer')
+  }
+  return new BigUint64Array(bytes)
+}
+
+// etc... (decoders for the other typed array types would follow the same pattern)
+
+const tags = [] // sparse array indexed by tag number; passed to decode() as the `tags` option
+tags[tagUint8Array] = uint8ArrayDecoder
+tags[tagUint64Array] = uint64ArrayDecoder
+
+/* TOKENIZER */
+
+// We have to deal with the fact that cborg talks in Uint8Arrays but we now want it to treat major 2
+// as ArrayBuffers, so we have to transform the token stream to replace the Uint8Array with an
+// ArrayBuffer.
+
+class ArrayBufferTransformingTokeniser extends Tokenizer {
+  next () {
+    const nextToken = super.next()
+    if (nextToken.type === Type.bytes) {
+      // Transform the (assumed) Uint8Array value to an ArrayBuffer of the same bytes, note though
+      // that all tags we care about are going to be <tag><bytes>, so we're also transforming those
+      // into ArrayBuffers, so our tag decoders need to also assume they are getting ArrayBuffers
+      // now. An alternative would be to watch the token stream for <tag> and not transform the next
+      // token if it's <bytes>, but that's a bit more complicated for demo purposes.
+      nextToken.value = nextToken.value.buffer
+    }
+    return nextToken
+  }
+}
+
+// Optional: a new decode() wrapper, mainly so we don't have to deal with the complications of
+// instantiating a Tokenizer which needs both data and the options.
+function byteStringDecoder (data, options) {
+  options = Object.assign({}, options, {
+    tags, // replaces any caller-supplied `tags`
+    tokenizer: new ArrayBufferTransformingTokeniser(data, options) // built from the caller's options
+  })
+  return decode(data, options)
+}
+
+/* ROUND-TRIP */
+
+const original = {
+  u8: new Uint8Array([1, 2, 3, 4, 5]),
+  u64: new BigUint64Array([10000000000000000n, 20000000000000000n, 30000000000000000n, 40000000000000000n, 50000000000000000n]),
+  ab: new Uint8Array([6, 7, 8, 9, 10]).buffer // a plain ArrayBuffer; no entry in typeEncoders for it
+}
+
+const encoded = encode(original, { typeEncoders })
+
+const decoded = byteStringDecoder(encoded) // no options: tags and tokenizer are supplied internally
+
+console.log('Original:', original)
+console.log('Encoded:', Buffer.from(encoded).toString('hex')) // excuse the Buffer, sorry browser peeps
+console.log('Decoded:', decoded)
+
+/* Output:
+
+Original: {
+  u8: Uint8Array(5) [ 1, 2, 3, 4, 5 ],
+  u64: BigUint64Array(5) [
+    10000000000000000n,
+    20000000000000000n,
+    30000000000000000n,
+    40000000000000000n,
+    50000000000000000n
+  ],
+  ab: ArrayBuffer { [Uint8Contents]: <06 07 08 09 0a>, byteLength: 5 }
+}
+Encoded: a362616245060708090a627538d84045010203040563753634d84758280000c16ff2862300000082dfe40d47000000434fd7946a00000004bfc91b8e000000c52ebca2b100
+Decoded: {
+  ab: ArrayBuffer { [Uint8Contents]: <06 07 08 09 0a>, byteLength: 5 },
+  u8: Uint8Array(5) [ 1, 2, 3, 4, 5 ],
+  u64: BigUint64Array(5) [
+    10000000000000000n,
+    20000000000000000n,
+    30000000000000000n,
+    40000000000000000n,
+    50000000000000000n
+  ]
+}
+
+*/
+
+/* Diagnostic:
+
+$ cborg hex2diag a362616245060708090a627538d84045010203040563753634d84758280000c16ff2862300000082dfe40d47000000434fd7946a00000004bfc91b8e000000c52ebca2b100
+a3                                                # map(3)
+  62                                              #   string(2)
+    6162                                          #     "ab"
+  45                                              #   bytes(5)
+    060708090a                                    #     "\x06\x07\x08\x09\x0a"
+  62                                              #   string(2)
+    7538                                          #     "u8"
+  d8 40                                           #   tag(64)
+    45                                            #     bytes(5)
+      0102030405                                  #       "\x01\x02\x03\x04\x05"
+  63                                              #   string(3)
+    753634                                        #     "u64"
+  d8 47                                           #   tag(71)
+    58 28                                         #     bytes(40)
+      0000c16ff2862300000082dfe40d47000000434fd7  #       "\x00\x00Áoò\x86#\x00\x00\x00\x82ßä\x0dG\x00\x00\x00CO×"
+      946a00000004bfc91b8e000000c52ebca2b100      #       "\x94j\x00\x00\x00\x04¿É\x1b\x8e\x00\x00\x00Å.¼¢±\x00"
+*/