From 067e68291db7a5ce0aaf03050017f11b955ccf3f Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sat, 15 May 2021 10:50:15 +0000
Subject: [PATCH 1/8] jv_string_implode: avoid producing unprintable string
 from reserved code points

---
 src/jv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/jv.c b/src/jv.c
index 498a14149d..c1d48bce07 100644
--- a/src/jv.c
+++ b/src/jv.c
@@ -1362,7 +1362,7 @@ jv jv_string_implode(jv j) {
     assert(JVP_HAS_KIND(n, JV_KIND_NUMBER));
     int nv = jv_number_value(n);
     jv_free(n);
-    if (nv > 0x10FFFF)
+    if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF)
       nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
     s = jv_string_append_codepoint(s, nv);
   }

From 6aff473b771cd022ab7b43b6b5db3e61678c0aa0 Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Fri, 21 Jul 2023 19:34:12 +1200
Subject: [PATCH 2/8] Binary strings: preserve UTF-8 and UTF-16 errors

The internal string representation is changed from UTF-8 with replacement
characters to a modified form of "WTF-8" that is able to distinctly encode
UTF-8 errors and UTF-16 errors.

This handles UTF-8 errors in raw string inputs and handles UTF-8 and UTF-16
errors in JSON input. UTF-16 errors (using "\uXXXX") and UTF-8 errors (using
the original raw bytes) are maintained when emitting JSON. When emitting raw
strings, UTF-8 errors are maintained and UTF-16 errors are converted into
replacement characters.
---
 scripts/gen_utf8_tables.py |   3 +-
 src/jv.c                   |  28 ++++----
 src/jv.h                   |   1 +
 src/jv_parse.c             |  77 +++++++++++++-------
 src/jv_print.c             |  26 ++++++-
 src/jv_unicode.c           | 143 ++++++++++++++++++++++++++++++++++---
 src/jv_unicode.h           |  12 ++++
 src/jv_utf8_tables.h       |   4 +-
 src/main.c                 |   9 ++-
 tests/jq.test              |   5 ++
 tests/shtest               |   9 +++
 11 files changed, 264 insertions(+), 53 deletions(-)

diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py
index 6fe0a5312b..7706462351 100644
--- a/scripts/gen_utf8_tables.py
+++ b/scripts/gen_utf8_tables.py
@@ -16,8 +16,7 @@ def print_table(type, name, t):
 def utf8info(c):
     if c < 0x80: return 1, mask(7)
     if 0x80 <= c <= 0xBF: return 255, mask(6)
-    if 0xC0 <= c <= 0xC1: return 0, 0
-    if 0xC2 <= c <= 0xDF: return 2, mask(5)
+    if 0xC0 <= c <= 0xDF: return 2, mask(5)
     if 0xE0 <= c <= 0xEF: return 3, mask(4)
     if 0xF0 <= c <= 0xF4: return 4, mask(3)
     if 0xF4 <= c <= 0xFF: return 0, 0
diff --git a/src/jv.c b/src/jv.c
index c1d48bce07..92d8336faf 100644
--- a/src/jv.c
+++ b/src/jv.c
@@ -1085,20 +1085,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
   return s;
 }
 
-/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
+/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */
 static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
   const char* end = data + length;
   const char* i = data;
   const char* cstart;
 
-  uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
+  uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX
   jvp_string* s = jvp_string_alloc(maxlength);
   char* out = s->data;
   int c = 0;
 
-  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+  while ((i = jvp_utf8_wtf_next((cstart = i), end, 0, &c))) {
     if (c == -1) {
-      c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+      int error = (unsigned char)*cstart;
+      assert(error >= 0x80 && error <= 0xFF);
+      c = -error;
+      /* Ensure each UTF-8 error byte is consumed separately */
+      i = cstart + 1;
     }
     out += jvp_utf8_encode(c, out);
     assert(out < s->data + maxlength);
@@ -1110,8 +1114,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
   return r;
 }
 
-/* Assumes valid UTF8 */
-static jv jvp_string_new(const char* data, uint32_t length) {
+/* Assumes valid WTF-8b */
+jv jv_string_wtf_sized(const char* data, int length) {
   jvp_string* s = jvp_string_alloc(length);
   s->length_hashed = length << 1;
   if (data != NULL)
@@ -1252,7 +1256,7 @@ static int jvp_string_equal(jv a, jv b) {
 jv jv_string_sized(const char* str, int len) {
   return
     jvp_utf8_is_valid(str, str+len) ?
-    jvp_string_new(str, len) :
+    jv_string_wtf_sized(str, len) :
     jvp_string_copy_replace_bad(str, len);
 }
 
@@ -1318,14 +1322,14 @@ jv jv_string_split(jv j, jv sep) {
 
   if (seplen == 0) {
     int c;
-    while ((jstr = jvp_utf8_next(jstr, jend, &c)))
+    while ((jstr = jvp_utf8_wtf_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c)))
       a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c));
   } else {
     for (p = jstr; p < jend; p = s + seplen) {
       s = _jq_memmem(p, jend - p, sepstr, seplen);
       if (s == NULL)
         s = jend;
-      a = jv_array_append(a, jv_string_sized(p, s - p));
+      a = jv_array_append(a, jv_string_wtf_sized(p, s - p));
       // Add an empty string to denote that j ends on a sep
       if (s + seplen == jend && seplen != 0)
         a = jv_array_append(a, jv_string(""));
@@ -1397,7 +1401,7 @@ jv jv_string_slice(jv j, int start, int end) {
 
   /* Look for byte offset corresponding to start codepoints */
   for (p = s, i = 0; i < start; i++) {
-    p = jvp_utf8_next(p, s + len, &c);
+    p = jvp_utf8_wtf_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c);
     if (p == NULL) {
       jv_free(j);
       return jv_string_empty(16);
@@ -1409,7 +1413,7 @@ jv jv_string_slice(jv j, int start, int end) {
   }
   /* Look for byte offset corresponding to end codepoints */
   for (e = p; e != NULL && i < end; i++) {
-    e = jvp_utf8_next(e, s + len, &c);
+    e = jvp_utf8_wtf_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c);
     if (e == NULL) {
       e = s + len;
       break;
@@ -1427,7 +1431,7 @@ jv jv_string_slice(jv j, int start, int end) {
    * memory like a drunken navy programmer.  There's probably nothing we
    * can do about it.
    */
-  res = jv_string_sized(p, e - p);
+  res = jv_string_wtf_sized(p, e - p);
   jv_free(j);
   return res;
 }
diff --git a/src/jv.h b/src/jv.h
index 8c96f822f0..8a328ec91b 100644
--- a/src/jv.h
+++ b/src/jv.h
@@ -107,6 +107,7 @@ jv jv_array_indexes(jv, jv);
 
 jv jv_string(const char*);
 jv jv_string_sized(const char*, int);
+jv jv_string_wtf_sized(const char*, int);
 jv jv_string_empty(int len);
 int jv_string_length_bytes(jv);
 int jv_string_length_codepoints(jv);
diff --git a/src/jv_parse.c b/src/jv_parse.c
index 3a8718ae82..a7e1d2463b 100644
--- a/src/jv_parse.c
+++ b/src/jv_parse.c
@@ -428,7 +428,7 @@ static void tokenadd(struct jv_parser* p, char c) {
   p->tokenbuf[p->tokenpos++] = c;
 }
 
-static int unhex4(char* hex) {
+static int unhex4(const char* hex) {
   int r = 0;
   for (int i=0; i<4; i++) {
     char c = *hex++;
@@ -444,15 +444,19 @@ static int unhex4(char* hex) {
 }
 
 static pfunc found_string(struct jv_parser* p) {
-  char* in = p->tokenbuf;
-  char* out = p->tokenbuf;
-  char* end = p->tokenbuf + p->tokenpos;
-
-  while (in < end) {
-    char c = *in++;
+  const char* in = p->tokenbuf;
+  // Decode in place into tokenbuf; switch to a heap buffer only if the output would outgrow the input (possible only when the input contains UTF-8 errors)
+  char* newbuf = NULL;
+  char* buf = p->tokenbuf;
+  char* out = buf;
+  const char* end = p->tokenbuf + p->tokenpos;
+  const char* cstart;
+  int c;
+
+  while ((in = jvp_utf8_wtf_next((cstart = in), end, 0, &c))) {
     if (c == '\\') {
       if (in >= end)
-        return "Expected escape character at end of string";
+        return jv_mem_free(newbuf), "Expected escape character at end of string";
       c = *in++;
       switch (c) {
       case '\\':
@@ -467,38 +471,61 @@ static pfunc found_string(struct jv_parser* p) {
       case 'u':
         /* ahh, the complicated case */
         if (in + 4 > end)
-          return "Invalid \\uXXXX escape";
+          return jv_mem_free(newbuf), "Invalid \\uXXXX escape";
         int hexvalue = unhex4(in);
         if (hexvalue < 0)
-          return "Invalid characters in \\uXXXX escape";
+          return jv_mem_free(newbuf), "Invalid characters in \\uXXXX escape";
         unsigned long codepoint = (unsigned long)hexvalue;
         in += 4;
+        // leading surrogate
         if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
-          /* who thought UTF-16 surrogate pairs were a good idea? */
-          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
-            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
-          unsigned long surrogate = unhex4(in+2);
-          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
-            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
-          in += 6;
-          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
-                                 |(surrogate - 0xDC00));
+          // look ahead for trailing surrogate and decode as UTF-16, otherwise encode this lone surrogate as WTF-8
+          if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') {
+            unsigned long surrogate = unhex4(in+2);
+            if (0xDC00 <= surrogate && surrogate <= 0xDFFF) {
+              in += 6;
+              codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
+                                     |(surrogate - 0xDC00));
+            }
+          }
         }
-        if (codepoint > 0x10FFFF)
-          codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+        // neither a single \uXXXX escape nor a UTF-16 surrogate pair can encode a code point above U+10FFFF
+        assert(codepoint <= 0x10FFFF);
+        // NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8
         out += jvp_utf8_encode(codepoint, out);
         break;
 
       default:
-        return "Invalid escape";
+        return jv_mem_free(newbuf), "Invalid escape";
       }
     } else {
       if (c > 0 && c < 0x001f)
-        return "Invalid string: control characters from U+0000 through U+001F must be escaped";
-      *out++ = c;
+        return jv_mem_free(newbuf), "Invalid string: control characters from U+0000 through U+001F must be escaped";
+      if (c == -1) {
+        int error = (unsigned char)*cstart;
+        assert(error >= 0x80 && error <= 0xFF);
+        c = -error;
+        /* Ensure each UTF-8 error byte is consumed separately */
+        const int wtf8_length = 2;
+        assert(jvp_utf8_encode_length(c) == wtf8_length);
+        in = cstart + 1;
+        if (newbuf == NULL && out + wtf8_length > in) {
+          /* Output is about to overflow input, move output to temporary buffer */
+          int current_size = out - p->tokenbuf;
+          int remaining = end - cstart;
+          newbuf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX
+          memcpy(newbuf, buf, current_size);
+          buf = newbuf;
+          out = buf + current_size;
+        }
+      } else
+        assert(jvp_utf8_encode_length(c) == in - cstart);
+      out += jvp_utf8_encode(c, out);
     }
   }
-  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
+  jv v = jv_string_wtf_sized(buf, out - buf);
+  jv_mem_free(newbuf);
+  TRY(value(p, v));
   p->tokenpos = 0;
   return 0;
 }
diff --git a/src/jv_print.c b/src/jv_print.c
index d1db88aa89..7c4258ee3b 100644
--- a/src/jv_print.c
+++ b/src/jv_print.c
@@ -98,6 +98,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) {
   put_buf(&c, 1, fout, strout, T);
 }
 
+static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) {
+  assert(c >= 0x80 && c <= 0xFF);
+  if (strout) {
+    // encode as an invalid UTF-8 byte in output
+    *strout = jv_string_append_codepoint(*strout, -c);
+  } else {
+    put_char(c, fout, strout, T);
+  }
+}
+
 static void put_str(const char* s, FILE* fout, jv* strout, int T) {
   put_buf(s, strlen(s), fout, strout, T);
 }
@@ -121,7 +131,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
   int c = 0;
   char buf[32];
   put_char('"', F, S, T);
-  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+  while ((i = jvp_utf8_wtf_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
     assert(c != -1);
     int unicode_escape = 0;
     if (0x20 <= c && c <= 0x7E) {
@@ -130,6 +140,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
         put_char('\\', F, S, T);
       }
       put_char(c, F, S, T);
+    } else if (c >= -0xFF && c <= -0x80) {
+      // Invalid UTF-8 byte
+      if (ascii_only) {
+        // refusing to emit invalid UTF-8
+        // TODO: convince the world to adopt a "\xXX" notation for JSON?
+        c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+        unicode_escape = 1;
+      } else {
+        // pass through
+        put_invalid_utf8_byte(-c, F, S, T);
+      }
     } else if (c < 0x20 || c == 0x7F) {
       // ASCII control character
       switch (c) {
@@ -160,6 +181,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) {
     } else {
       if (ascii_only) {
         unicode_escape = 1;
+      } else if (c >= 0xD800 && c <= 0xDFFF) {
+        // lone surrogate; can't be encoded to UTF-8
+        unicode_escape = 1;
       } else {
         put_buf(cstart, i - cstart, F, S, T);
       }
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index d197349f48..cbd812b454 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -27,6 +27,112 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_
 }
 
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
+  return jvp_utf8_wtf_next(in, end, JVP_UTF8_REPLACE, codepoint_ret);
+}
+
+// jvp_utf8_wtf_next_bytes iterates through chunks of UTF-8 bytes represented
+// by a WTF-8b string. *bytes_out is set to the start of the current chunk and
+// *bytes_len is set to the size of the current chunk. Valid sequences of UTF-8
+// bytes are emitted as maximally sized chunks (pointing into the `in` string).
+// Ill-formed UTF-8 bytes are emitted individually (pointing into a static
+// array containing the byte). Ill-formed UTF-16 code units are emitted as
+// UTF-8 replacement characters (pointing into a static array containing the
+// bytes for U+FFFD).
+const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len) {
+  // U+FFFD REPLACEMENT CHARACTER
+  static const unsigned char UTF8_REPLACEMENT[] = {0xEF,0xBF,0xBD};
+  // array of bytes from 0x80 to 0xFF (inclusive)
+  static const unsigned char UTF8_ILL_FORMED[] = {
+    #define ROW(x) \
+      x + 0, x + 1, x + 2, x + 3, \
+      x + 4, x + 5, x + 6, x + 7, \
+      x + 8, x + 9, x + 10, x + 11, \
+      x + 12, x + 13, x + 14, x + 15
+    ROW(0x80), ROW(0x90), ROW(0xA0), ROW(0xB0),
+    ROW(0xC0), ROW(0xD0), ROW(0xE0), ROW(0xF0)
+    #undef ROW
+  };
+
+  const char* i = in;
+  const char* cstart;
+  int c;
+
+  while ((i = jvp_utf8_wtf_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) {
+    if (c >= -0xFF && c <= -0x80) {
+      // invalid UTF-8 byte; pass through
+      if (cstart > in) {
+        // can't emit single byte yet; emit previous chunk first
+        break;
+      }
+      *bytes_len = 1;
+      *bytes_out = (const char*)&UTF8_ILL_FORMED[-c - 0x80];
+      return i;
+    }
+    if (c >= 0xD800 && c <= 0xDFFF) {
+      // lone surrogate; can't be encoded to UTF-8
+      if (cstart > in) {
+        // can't emit replacement bytes yet; emit previous chunk first
+        break;
+      }
+      *bytes_len = sizeof UTF8_REPLACEMENT;
+      *bytes_out = (const char*)UTF8_REPLACEMENT;
+      return i;
+    }
+  }
+
+  uint32_t len = cstart - in;
+  *bytes_len = len;
+  *bytes_out = in;
+  return len == 0? NULL : cstart;
+}
+
+/*
+  The internal representation of jv strings uses an encoding that is hereby
+  referred to as "WTF-8b" (until someone demonstrates use of another term to
+  refer to the same encoding).
+
+  WTF-8b is an extension of WTF-8, which is an extension of UTF-8. Any sequence
+  of Unicode scalar values is represented by the same bytes in UTF-8, WTF-8 and
+  WTF-8b, therefore any well-formed UTF-8 string is interpreted as the same
+  sequence of Unicode scalar values (roughly, code points) in WTF-8b.
+
+  Like WTF-8, WTF-8b is able to encode UTF-16 errors (lone surrogates) using
+  the "generalized UTF-8" representation of code points between U+D800 and
+  U+DFFF. These errors occur in JSON terms such as:
+    "_\uD8AB_\uDBCD_"
+
+  Unlike WTF-8, WTF-8b is also able to encode UTF-8 errors (bytes 0x80 to 0xFF
+  that are not part of a valid UTF-8 sequence) using the first 128 "overlong"
+  codings (unused 2-byte representations of U+00 to U+7F). These errors can
+  occur in any byte stream that is interpreted as UTF-8, for example:
+    "\xED\xA2\xAB"
+  The above example is in fact the WTF-8b (and WTF-8) encoding for the lone
+  UTF-16 surrogate "\uD8AB", which demonstrates the need for a distinct
+  encoding of UTF-8 errors. If a distinction were not made, then "\xED\xA2\xAB"
+  and "\uD8AB" would be interpreted as the same string, so at least one of the
+  forms would not be preserved when printed as JSON output.
+
+  It should also be noted that the process of converting from invalid UTF-8 to
+  WTF-8b is not (and can not be) idempotent, since the "generalized UTF-8"
+  representation of UTF-16 surrogates is intentionally not able to be
+  generated from invalid UTF-8, only through some other means (usually "\uXXXX"
+  notation).
+
+  Each UTF-16 error is encoded as 3 WTF-8b (or WTF-8) bytes.
+  Each UTF-8 error is encoded as 2 WTF-8b bytes.
+
+  When iterating over code points using `JVP_UTF8_ERRORS_UTF16`, encoded UTF-16
+  errors are emitted in the form of code points in the range U+D800 to U+DFFF.
+  These code points can be reencoded as usual using `jvp_utf8_encode`.
+
+  When iterating over code points using `JVP_UTF8_ERRORS_UTF8`, encoded UTF-8
+  errors are emitted in the form of code points in the negative range -0x80 to
+  -0xFF. These negative code points can be negated to determine the original
+  error bytes. These code points can be reencoded as usual using
+  `jvp_utf8_encode`.
+*/
+
+const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint_ret) {
   assert(in <= end);
   if (in == end) {
     return 0;
@@ -40,9 +146,11 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
     length = 1;
   } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
     /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
+    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
     length = 1;
   } else if (in + length > end) {
     /* String ends before UTF8 sequence ends */
+    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
     length = end - in;
   } else {
     codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
@@ -50,6 +158,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
       unsigned ch = (unsigned char)in[i];
       if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
         /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
+        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
         codepoint = -1;
         length = i;
         break;
@@ -58,17 +167,29 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
     }
     if (codepoint < utf8_first_codepoint[length]) {
       /* Overlong UTF8 sequence */
-      codepoint = -1;
+      if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
+        /* UTF-8 error is emitted as a negative codepoint */
+        codepoint = -(codepoint + 0x80);
+      } else {
+        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
+        codepoint = -1;
+      }
     }
     if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
-      /* Surrogate codepoints can't be encoded in UTF8 */
-      codepoint = -1;
+      /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
+      if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
+        /* Surrogate codepoints can't be encoded in UTF8 */
+        codepoint = -1;
+      }
     }
     if (codepoint > 0x10FFFF) {
       /* Outside Unicode range */
+      if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
       codepoint = -1;
     }
   }
+  if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
+    codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
   assert(length > 0);
   *codepoint_ret = codepoint;
   return in + length;
@@ -76,7 +197,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
 
 int jvp_utf8_is_valid(const char* in, const char* end) {
   int codepoint;
-  while ((in = jvp_utf8_next(in, end, &codepoint))) {
+  while ((in = jvp_utf8_wtf_next(in, end, 0, &codepoint))) {
     if (codepoint == -1) return 0;
   }
   return 1;
@@ -91,20 +212,24 @@ int jvp_utf8_decode_length(char startchar) {
 }
 
 int jvp_utf8_encode_length(int codepoint) {
-  if (codepoint <= 0x7F) return 1;
+  if (codepoint >= 0 && codepoint <= 0x7F) return 1;
   else if (codepoint <= 0x7FF) return 2;
   else if (codepoint <= 0xFFFF) return 3;
   else return 4;
 }
 
 int jvp_utf8_encode(int codepoint, char* out) {
-  assert(codepoint >= 0 && codepoint <= 0x10FFFF);
+  assert((codepoint >= 0 && codepoint <= 0x10FFFF) || (codepoint >= -0xFF && codepoint <= -0x80));
   char* start = out;
-  if (codepoint <= 0x7F) {
+  if (codepoint >= 0 && codepoint <= 0x7F) {
     *out++ = codepoint;
   } else if (codepoint <= 0x7FF) {
-    *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6);
-    *out++ = 0x80 + ((codepoint & 0x03F));
+    // encode UTF-8 errors as overlong representations of U+00 to U+7F
+    int cp = codepoint >= -0xFF && codepoint <= -0x80?
+      -codepoint - 0x80 :
+      codepoint;
+    *out++ = 0xC0 + ((cp & 0x7C0) >> 6);
+    *out++ = 0x80 + ((cp & 0x03F));
   } else if(codepoint <= 0xFFFF) {
     *out++ = 0xE0 + ((codepoint & 0xF000) >> 12);
     *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6);
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 558721a8fd..cb0a481a5e 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -1,8 +1,20 @@
 #ifndef JV_UNICODE_H
 #define JV_UNICODE_H
 
+enum jvp_utf8_flags {
+  /* Emit replacement character instead of -1 for errors */
+  JVP_UTF8_REPLACE = 1,
+  /* Treat input as WTF-8b, emit 0xD800 to 0xDFFF to denote encoded UTF-16 errors */
+  JVP_UTF8_ERRORS_UTF16 = 2,
+  /* Treat input as WTF-8b, emit -0x80 to -0xFF to denote encoded UTF-8 errors */
+  JVP_UTF8_ERRORS_UTF8 = 4,
+  JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8
+};
+
 const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
+const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
+const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len);
 int jvp_utf8_is_valid(const char* in, const char* end);
 
 int jvp_utf8_decode_length(char startchar);
diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h
index f1a4252fce..7c68749e97 100644
--- a/src/jv_utf8_tables.h
+++ b/src/jv_utf8_tables.h
@@ -12,7 +12,7 @@ static const unsigned char utf8_coding_length[] =
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-  0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
   0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
   0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
   0x04, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
@@ -29,7 +29,7 @@ static const unsigned char utf8_coding_bits[] =
   0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
   0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
   0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
-  0x00, 0x00, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
   0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
   0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
   0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
diff --git a/src/main.c b/src/main.c
index 48af5a31c1..83faeb3197 100644
--- a/src/main.c
+++ b/src/main.c
@@ -31,6 +31,7 @@ extern void jv_tsd_dtoa_ctx_init();
 #include "jv.h"
 #include "jq.h"
 #include "jv_alloc.h"
+#include "jv_unicode.h"
 #include "util.h"
 #include "src/version.h"
 
@@ -182,8 +183,12 @@ static int process(jq_state *jq, jv value, int flags, int dumpopts, int options)
       if (options & ASCII_OUTPUT) {
         jv_dumpf(jv_copy(result), stdout, JV_PRINT_ASCII);
       } else {
-        priv_fwrite(jv_string_value(result), jv_string_length_bytes(jv_copy(result)),
-            stdout, dumpopts & JV_PRINT_ISATTY);
+        const char *start = jv_string_value(result);
+        const char *end = start + jv_string_length_bytes(jv_copy(result));
+        const char *bytes;
+        uint32_t bytes_len;
+        while ((start = jvp_utf8_wtf_next_bytes(start, end, &bytes, &bytes_len)))
+          priv_fwrite(bytes, bytes_len, stdout, dumpopts & JV_PRINT_ISATTY);
       }
       ret = JQ_OK;
       jv_free(result);
diff --git a/tests/jq.test b/tests/jq.test
index da35e9a84c..eb8674c976 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -57,6 +57,11 @@ null
 "Aa\r\n\t\b\f\u03bc"
 "Aa\u000d\u000a\u0009\u0008\u000c\u03bc"
 
+# Check that unpaired surrogates are preserved in output
+"\u2200\ud800\u2203\udc00\u2205\udfff"
+null
+"∀\ud800∃\udc00∅\udfff"
+
 "inter\("pol" + "ation")"
 null
 "interpolation"
diff --git a/tests/shtest b/tests/shtest
index d681ab45ad..35443cfb45 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -122,6 +122,15 @@ fi
 cmp $d/out $d/expected
 
 
+clean=false
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
+dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
+$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
+$VALGRIND $Q $JQ -j . $d/out.json >$d/out
+cmp $d/out $d/rand
+clean=true
+
+
 ## Test --exit-status
 data='{"i": 1}\n{"i": 2}\n{"i": 3}\n'
 printf "$data" | $JQ --exit-status 'select(.i==1)' > /dev/null 2>&1

From 79f0479e3164884e0e6dd6a1fd9893f971c1d062 Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sun, 16 May 2021 09:18:51 +0000
Subject: [PATCH 3/8] Update `@base64`, `utf8bytelength` and `fromjson` to
 handle binary strings

---
 docs/content/manual/manual.yml |  1 -
 jq.1.prebuilt                  |  2 +-
 src/builtin.c                  | 49 ++++++++++++++++------
 src/jv.h                       |  1 +
 src/jv_parse.c                 | 76 +++++++++++++++++++++++++---------
 tests/base64.test              | 10 +++++
 tests/shtest                   | 19 ++++++---
 7 files changed, 118 insertions(+), 40 deletions(-)

diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index ff68482a3d..c44ad3818a 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -2040,7 +2040,6 @@ sections:
           * `@base64d`:
 
             The inverse of `@base64`, input is decoded as specified by RFC 4648.
-            Note\: If the decoded string is not UTF-8, the results are undefined.
 
           This syntax can be combined with string interpolation in a
           useful way. You can follow a `@foo` token with a string
diff --git a/jq.1.prebuilt b/jq.1.prebuilt
index 80933f748e..490791b831 100644
--- a/jq.1.prebuilt
+++ b/jq.1.prebuilt
@@ -2226,7 +2226,7 @@ The input is converted to base64 as specified by RFC 4648\.
 \fB@base64d\fR:
 .
 .IP
-The inverse of \fB@base64\fR, input is decoded as specified by RFC 4648\. Note\e: If the decoded string is not UTF\-8, the results are undefined\.
+The inverse of \fB@base64\fR, input is decoded as specified by RFC 4648\.
 .
 .P
 This syntax can be combined with string interpolation in a useful way\. You can follow a \fB@foo\fR token with a string literal\. The contents of the string literal will \fInot\fR be escaped\. However, all interpolations made inside that string literal will be escaped\. For instance,
diff --git a/src/builtin.c b/src/builtin.c
index b38d4c2f4f..ea419db1e7 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -470,7 +470,7 @@ static jv f_dump(jq_state *jq, jv input) {
 static jv f_json_parse(jq_state *jq, jv input) {
   if (jv_get_kind(input) != JV_KIND_STRING)
     return type_error(input, "only strings can be parsed");
-  jv res = jv_parse_sized(jv_string_value(input),
+  jv res = jv_parse_wtf_sized(jv_string_value(input),
                           jv_string_length_bytes(jv_copy(input)));
   jv_free(input);
   return res;
@@ -520,7 +520,15 @@ static jv f_tostring(jq_state *jq, jv input) {
 static jv f_utf8bytelength(jq_state *jq, jv input) {
   if (jv_get_kind(input) != JV_KIND_STRING)
     return type_error(input, "only strings have UTF-8 byte length");
-  return jv_number(jv_string_length_bytes(input));
+  const char* i = jv_string_value(input);
+  const char* end = i + jv_string_length_bytes(jv_copy(input));
+  uint32_t len = 0;
+  const char *bytes;
+  uint32_t bytes_len;
+  while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len)))
+    len += bytes_len;
+  jv_free(input);
+  return jv_number(len);
 }
 
 #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
@@ -695,21 +703,36 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
     jv_free(fmt);
     input = f_tostring(jq, input);
     jv line = jv_string("");
-    const unsigned char* data = (const unsigned char*)jv_string_value(input);
-    int len = jv_string_length_bytes(jv_copy(input));
-    for (int i=0; i<len; i+=3) {
-      uint32_t code = 0;
-      int n = len - i >= 3 ? 3 : len-i;
-      for (int j=0; j<3; j++) {
+    const char* i = jv_string_value(input);
+    const char* end = i + jv_string_length_bytes(jv_copy(input));
+    uint32_t code = 0;
+    int n = 0;
+    const char *bytes;
+    uint32_t bytes_len;
+    while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len))) {
+      unsigned char *ubuf = (unsigned char *)bytes;
+      for (uint32_t x = 0; x < bytes_len; x++) {
         code <<= 8;
-        code |= j < n ? (unsigned)data[i+j] : 0;
+        code |= ubuf[x];
+        if (++n == 3) {
+          char buf[4];
+          for (int j = 0; j < 4; j++)
+            buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
+          line = jv_string_append_buf(line, buf, sizeof(buf));
+          n = 0;
+          code = 0;
+        }
       }
+    }
+    if (n > 0) {
+      assert(n < 3);
+      code <<= 8*(3 - n);
       char buf[4];
-      for (int j=0; j<4; j++) {
+      for (int j = 0; j < 4; j++)
         buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
-      }
-      if (n < 3) buf[3] = '=';
-      if (n < 2) buf[2] = '=';
+      buf[3] = '=';
+      if (n < 2)
+        buf[2] = '=';
       line = jv_string_append_buf(line, buf, sizeof(buf));
     }
     jv_free(input);
diff --git a/src/jv.h b/src/jv.h
index 8a328ec91b..3f0dedbf2f 100644
--- a/src/jv.h
+++ b/src/jv.h
@@ -228,6 +228,7 @@ enum {
 
 jv jv_parse(const char* string);
 jv jv_parse_sized(const char* string, int length);
+jv jv_parse_wtf_sized(const char* string, int length);
 
 typedef void (*jv_nomem_handler_f)(void *);
 void jv_nomem_handler(jv_nomem_handler_f, void *);
diff --git a/src/jv_parse.c b/src/jv_parse.c
index a7e1d2463b..1573328a99 100644
--- a/src/jv_parse.c
+++ b/src/jv_parse.c
@@ -885,35 +885,63 @@ jv jv_parser_next(struct jv_parser* p) {
   }
 }
 
-jv jv_parse_sized(const char* string, int length) {
+static jv jvp_parse_sized(const char* string, int length, int extended) {
   struct jv_parser parser;
   parser_init(&parser, 0);
-  jv_parser_set_buf(&parser, string, length, 0);
-  jv value = jv_parser_next(&parser);
-  if (jv_is_valid(value)) {
-    jv next = jv_parser_next(&parser);
-    if (jv_is_valid(next)) {
-      // multiple JSON values, we only wanted one
-      jv_free(value);
-      jv_free(next);
-      value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
-    } else if (jv_invalid_has_msg(jv_copy(next))) {
-      // parser error after the first JSON value
-      jv_free(value);
-      value = next;
+  const char *i = string;
+  const char *end = string + length;
+  jv value = jv_invalid();
+  int count = 0;
+  while (i != NULL) {
+    const char *bytes;
+    uint32_t bytes_len;
+    if (extended) {
+      // TODO: consider handling string values containing UTF-16 errors; this
+      // won't normally occur when using the output of eg, `tojson`, but could
+      // occur when constructing JSON manually, eg:
+      // > "\"\uD800\"" | fromjson
+      // NOTE: a simple but crude way to do this might be to replace UTF-16
+      // errors in the input with \uXXXX sequences, since UTF-16 errors should
+      // only be allowed within string literals, where escape sequences can be
+      // equivalently used
+      i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len);
     } else {
-      // a single valid JSON value
-      jv_free(next);
+      bytes = string;
+      bytes_len = length;
+      i = NULL;
     }
-  } else if (jv_invalid_has_msg(jv_copy(value))) {
-    // parse error, we'll return it
-  } else {
+    jv_parser_set_buf(&parser, bytes, bytes_len, i != NULL);
+    for (;;) {
+      jv next = jv_parser_next(&parser);
+      if (!jv_is_valid(next)) {
+        if (jv_invalid_has_msg(jv_copy(next))) {
+          // parse error, we'll return it
+          count++;
+          jv_free(value);
+          value = next;
+          i = NULL;
+        }
+        break;
+      }
+      jv_free(value);
+      if (count++ == 0) {
+        // a single valid JSON value
+        value = next;
+      } else {
+        // multiple JSON values, we only wanted one
+        jv_free(next);
+        value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
+        i = NULL;
+        break;
+      }
+    }
+  }
+  if (count == 0) {
     // no value at all
     jv_free(value);
     value = jv_invalid_with_msg(jv_string("Expected JSON value"));
   }
   parser_free(&parser);
-
   if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
     jv msg = jv_invalid_get_msg(value);
     value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
@@ -924,6 +952,14 @@ jv jv_parse_sized(const char* string, int length) {
   return value;
 }
 
+jv jv_parse_sized(const char* string, int length) {
+  return jvp_parse_sized(string, length, 0);
+}
+
+jv jv_parse_wtf_sized(const char* string, int length) {
+  return jvp_parse_sized(string, length, 1);
+}
+
 jv jv_parse(const char* string) {
   return jv_parse_sized(string, strlen(string));
 }
diff --git a/tests/base64.test b/tests/base64.test
index 0f82b0b71d..6507bb83b7 100644
--- a/tests/base64.test
+++ b/tests/base64.test
@@ -33,3 +33,13 @@
 . | try @base64d catch .
 "QUJDa"
 "string (\"QUJDa\") trailing base64 byte found"
+
+# random binary data
+(. | @base64d | @base64) == .
+"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q=="
+true
+
+# replace lone surrogates
+@base64
+"foo\udca9\ud83dbar"
+"Zm9v77+977+9YmFy"
diff --git a/tests/shtest b/tests/shtest
index 35443cfb45..36b7205bf2 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -123,11 +123,20 @@ cmp $d/out $d/expected
 
 
 clean=false
-# Invalid UTF-8 bytes are preserved when encoding/decoding JSON
-dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null
-$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
-$VALGRIND $Q $JQ -j . $d/out.json >$d/out
-cmp $d/out $d/rand
+# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings
+if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
+    $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json
+    $VALGRIND $Q $JQ -j . $d/out.json >$d/out
+    cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out
+    cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
+    cmp $d/out $d/rand
+    base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
+    cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
+    cmp $d/out $d/rand
+fi
 clean=true
 
 

From 7fde46e5440e7f1b83a3bc2e93f6409d806f33a9 Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Tue, 25 May 2021 22:59:59 +1200
Subject: [PATCH 4/8] Correct UTF-8 and UTF-16 errors during concatenation

UTF-8 errors and UTF-16 errors that were previously encoded into the ends of
strings will now potentially be used to form correct code points.

This is mostly a matter of making string equality behave as expected, since
without this normalisation, it is possible to produce `jv` strings that are
converted to UTF-8 or UTF-16 the same way but are not equal due to well-formed
code units that may or may not be encoded as errors.
---
 src/jv.c         |  13 ++-
 src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++---------
 src/jv_unicode.h |   3 +
 tests/jq.test    |  15 +++
 4 files changed, 230 insertions(+), 49 deletions(-)

diff --git a/src/jv.c b/src/jv.c
index 92d8336faf..d44f04b5b7 100644
--- a/src/jv.c
+++ b/src/jv.c
@@ -1155,20 +1155,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
   jvp_string* s = jvp_string_ptr(string);
   uint32_t currlen = jvp_string_length(s);
 
+  char join_buf[4];
+  int join_len = jvp_utf8_wtf_join(s->data, &currlen, &data, &len, join_buf);
+
   if (jvp_refcnt_unshared(string.u.ptr) &&
-      jvp_string_remaining_space(s) >= len) {
+      jvp_string_remaining_space(s) >= join_len + len) {
     // the next string fits at the end of a
+    memcpy(s->data + currlen, join_buf, join_len);
+    currlen += join_len;
     memcpy(s->data + currlen, data, len);
     s->data[currlen + len] = 0;
     s->length_hashed = (currlen + len) << 1;
     return string;
   } else {
     // allocate a bigger buffer and copy
-    uint32_t allocsz = (currlen + len) * 2;
+    uint32_t allocsz = (currlen + join_len + len) * 2;
     if (allocsz < 32) allocsz = 32;
     jvp_string* news = jvp_string_alloc(allocsz);
-    news->length_hashed = (currlen + len) << 1;
+    news->length_hashed = (currlen + join_len + len) << 1;
     memcpy(news->data, s->data, currlen);
+    memcpy(news->data + currlen, join_buf, join_len);
+    currlen += join_len;
     memcpy(news->data + currlen, data, len);
     news->data[currlen + len] = 0;
     jvp_string_free(string);
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index cbd812b454..a8858a2cf4 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -1,8 +1,72 @@
 #include <stdio.h>
+#include <string.h>
 #include <assert.h>
 #include "jv_unicode.h"
 #include "jv_utf8_tables.h"
 
+// length of encoding of erroneous UTF-8 byte
+#define UTF8_ERR_LEN 2
+// length of encoding of erroneous UTF-16 surrogate
+#define UTF16_ERR_LEN 3
+
+#define U32(a, b, c, d) ( \
+  (uint32_t) (a) << 0 | \
+  (uint32_t) (b) << 8 | \
+  (uint32_t) (c) << 16 | \
+  (uint32_t) (d) << 24 \
+)
+
+#define BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF))
+
+#define B0 0x00 // 00000000
+#define B1 0x80 // 10000000
+#define B2 0xC0 // 11000000
+#define B3 0xE0 // 11100000
+#define B4 0xF0 // 11110000
+#define B5 0xF8 // 11111000
+
+// NOTE: these flags are likely to be optimised out as `decode` gets inlined
+enum decode_flags {
+  DECODE_1 = 1,
+  DECODE_2 = 2,
+  DECODE_3 = 8,
+  DECODE_4 = 16
+};
+
+// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong
+// codings or out-of-range code points, works by testing all fixed bits in each
+// of the 4 coding patterns, then shifting the value bits according to the
+// pattern
+static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
+  if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){
+    *codepoint_ret = BYTE(data, 0);
+    return 1;
+  }
+  if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){
+    *codepoint_ret =
+      (BYTE(data, 0) & ~B3) << 6 |
+      (BYTE(data, 1) & ~B2) << 0;
+    return 2;
+  }
+  if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){
+    *codepoint_ret =
+      (BYTE(data, 0) & ~B4) << 12 |
+      (BYTE(data, 1) & ~B2) << 6 |
+      (BYTE(data, 2) & ~B2) << 0;
+    return 3;
+  }
+  if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){
+    *codepoint_ret =
+      (BYTE(data, 0) & ~B5) << 18 |
+      (BYTE(data, 1) & ~B2) << 12 |
+      (BYTE(data, 2) & ~B2) << 6 |
+      (BYTE(data, 3) & ~B2) << 0;
+    return 4;
+  }
+  *codepoint_ret = -1;
+  return 1;
+}
+
 // jvp_utf8_backtrack returns the beginning of the last codepoint in the
 // string, assuming that start is the last byte in the string.
 // If the last codepoint is incomplete, returns the number of missing bytes via
@@ -137,56 +201,42 @@ const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_fla
   if (in == end) {
     return 0;
   }
-  int codepoint = -1;
-  unsigned char first = (unsigned char)in[0];
-  int length = utf8_coding_length[first];
-  if ((first & 0x80) == 0) {
+  uint32_t data = in[0] & 0xFF;
+  if ((data & B1) == 0) {
     /* Fast-path for ASCII */
-    codepoint = first;
-    length = 1;
-  } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) {
-    /* Bad single byte - either an invalid byte or an out-of-place continuation byte */
-    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte");
-    length = 1;
-  } else if (in + length > end) {
-    /* String ends before UTF8 sequence ends */
-    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun");
-    length = end - in;
-  } else {
-    codepoint = ((unsigned)in[0]) & utf8_coding_bits[first];
-    for (int i=1; i<length; i++) {
-      unsigned ch = (unsigned char)in[i];
-      if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){
-        /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */
-        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes");
-        codepoint = -1;
-        length = i;
-        break;
-      }
-      codepoint = (codepoint << 6) | (ch & 0x3f);
-    }
-    if (codepoint < utf8_first_codepoint[length]) {
-      /* Overlong UTF8 sequence */
-      if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) {
-        /* UTF-8 error is emitted as a negative codepoint */
-        codepoint = -(codepoint + 0x80);
-      } else {
-        if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
-        codepoint = -1;
-      }
-    }
-    if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
-      /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
-      if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
-        /* Surrogate codepoints can't be encoded in UTF8 */
-        codepoint = -1;
-      }
+    *codepoint_ret = data;
+    return in + 1;
+  }
+  switch (end - in) {
+    default: // fall through
+    case 4: data |= (uint32_t)(in[3] & 0xFF) << 24; // fall through
+    case 3: data |= (uint32_t)(in[2] & 0xFF) << 16; // fall through
+    case 2: data |= (uint32_t)(in[1] & 0xFF) << 8; // fall through
+    case 1: break;
+  }
+  int codepoint;
+  int length = decode(DECODE_2 | DECODE_3 | DECODE_4, data, &codepoint);
+  if (codepoint == -1) {
+    if (flags & JVP_UTF8_ERRORS_UTF8) assert(0 && "Invalid WTF-8b sequence: no match");
+  } else if (codepoint < utf8_first_codepoint[length]) {
+    /* Overlong UTF-8 sequence */
+    if ((flags & JVP_UTF8_ERRORS_UTF8) && length == UTF8_ERR_LEN && 0x00 <= codepoint && codepoint <= 0x7F) {
+      /* UTF-8 error is emitted as a negative codepoint */
+      codepoint = -(codepoint + 0x80);
+    } else {
+      if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong");
+      codepoint = -1;
     }
-    if (codepoint > 0x10FFFF) {
-      /* Outside Unicode range */
-      if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
+  } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
+    /* Surrogate codepoints are allowed in WTF-8/WTF-8b */
+    if (!(flags & JVP_UTF8_ERRORS_UTF16)) {
+      /* Surrogate codepoints can't be encoded in UTF8 */
       codepoint = -1;
     }
+  } else if (codepoint > 0x10FFFF) {
+    /* Outside Unicode range */
+    if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range");
+    codepoint = -1;
   }
   if (codepoint == -1 && (flags & JVP_UTF8_REPLACE))
     codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
@@ -195,6 +245,112 @@ const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_fla
   return in + length;
 }
 
+// assumes two bytes are readable from `in`
+static int decode_utf8_error(const char* in) {
+  uint32_t data = U32(in[0]  & 0xFF, in[1] & 0xFF, 0, 0);
+  int codepoint;
+  if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && codepoint < 0x80)
+    return codepoint + 0x80;
+  return -1;
+}
+
+// assumes three bytes are readable from `in`; returns the surrogate code point (0xD800..0xDFFF inclusive) encoded there, or -1 if the bytes are not an encoded surrogate
+static int decode_utf16_error(const char* in) {
+  uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0);
+  int codepoint;
+  if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint <= 0xDFFF)
+    return codepoint;
+  return -1;
+}
+
+// jvp_utf8_wtf_join attempts to turn errors at the end of `a` and the
+// beginning of `b` into a valid code point. if a correction is possible,
+// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing
+// errors, and the UTF-8 encoding of the code point to insert is stored in
+// `out`. the number of bytes that should be inserted from `out` into the
+// middle of the strings is returned (up to 4). this will be 0 if there are no
+// bytes to insert.
+int jvp_utf8_wtf_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) {
+  const char* aend = astart + *alen_io;
+  const char* bstart = *bstart_io;
+  const char* bend = bstart + *blen_io;
+  int bcp;
+  bstart = jvp_utf8_wtf_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp);
+  if (!bstart) {
+    // end of string
+    return 0;
+  }
+  if (bcp >= 0xDC00 && bcp <= 0xDFFF) {
+    // UTF-16 tail surrogate, look for lead surrogate at the end of `a`
+    assert(bstart == *bstart_io + UTF16_ERR_LEN);
+    if (aend - astart < UTF16_ERR_LEN)
+      return 0;
+    int acp = decode_utf16_error(aend - UTF16_ERR_LEN);
+    if (acp >= 0xD800 && acp <= 0xDBFF) {
+      // UTF-16 lead surrogate, decode matching UTF-16 pair
+      *alen_io -= UTF16_ERR_LEN;
+      *blen_io -= UTF16_ERR_LEN;
+      *bstart_io += UTF16_ERR_LEN;
+      int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00));
+      return jvp_utf8_encode(codepoint, out);
+    }
+    return 0;
+  }
+  if (bcp >= -0xFF && bcp <= -0x80) {
+    // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte
+    bcp = -bcp;
+    assert(bstart == *bstart_io + UTF8_ERR_LEN);
+    if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
+      return 0;
+    // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b`
+    unsigned char buf[6];
+    unsigned char* bufstart = buf + 3;
+    unsigned char* bufend = bufstart;
+    *bufend++ = bcp;
+    int length;
+    // search backwards in `a` for a leading byte
+    for (;;) {
+      if (aend - astart < UTF8_ERR_LEN)
+        return 0; // `a` is too short
+      int acp = decode_utf8_error(aend - UTF8_ERR_LEN);
+      if (acp == -1)
+        return 0; // not a UTF-8 error
+      aend -= UTF8_ERR_LEN;
+      length = utf8_coding_length[acp];
+      if (length == 0)
+        return 0; // not a possible UTF-8 byte
+      *--bufstart = acp;
+      if (length != UTF8_CONTINUATION_BYTE)
+        break; // found leading byte
+      if (bufstart == buf)
+        return 0; // too many continuation bytes
+    }
+    if (bufend - bufstart > length)
+      return 0; // too many continuation bytes
+    // search forwards in `b` for any more needed continuation bytes
+    while (bufend - bufstart < length) {
+      if (bend - bstart < UTF8_ERR_LEN)
+        return 0; // `b` is too short
+      bcp = decode_utf8_error(bstart);
+      if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE)
+        return 0; // not a UTF-8 error, didn't find enough continuation bytes
+      bstart += UTF8_ERR_LEN;
+      *bufend++ = bcp;
+    }
+    int codepoint;
+    // check that the bytes are strict UTF-8
+    jvp_utf8_wtf_next((char*)bufstart, (char*)bufend, 0, &codepoint);
+    if (codepoint != -1) {
+      memcpy(out, bufstart, 4);
+      *alen_io = aend - astart;
+      *blen_io = bend - bstart;
+      *bstart_io = bstart;
+      return bufend - bufstart;
+    }
+  }
+  return 0;
+}
+
 int jvp_utf8_is_valid(const char* in, const char* end) {
   int codepoint;
   while ((in = jvp_utf8_wtf_next(in, end, 0, &codepoint))) {
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index cb0a481a5e..33005e6f68 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -1,6 +1,8 @@
 #ifndef JV_UNICODE_H
 #define JV_UNICODE_H
 
+#include <stdint.h>
+
 enum jvp_utf8_flags {
   /* Emit replacement character instead of -1 for errors */
   JVP_UTF8_REPLACE = 1,
@@ -15,6 +17,7 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_
 const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
 const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len);
+int jvp_utf8_wtf_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out);
 int jvp_utf8_is_valid(const char* in, const char* end);
 
 int jvp_utf8_decode_length(char startchar);
diff --git a/tests/jq.test b/tests/jq.test
index eb8674c976..068bf31494 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -62,6 +62,11 @@ null
 null
 "∀\ud800∃\udc00∅\udfff"
 
+# Check that unpaired surrogates are paired when concatenated
+add
+["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"]
+"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼"
+
 "inter\("pol" + "ation")"
 null
 "interpolation"
@@ -87,6 +92,16 @@ null
 "Zm/Ds2Jhcgo="
 "foóbar\n"
 
+# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points)
+. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
+"򍨼衍򙮬񪜁򻴠󖂡󔁰񗏷󛊭񢠃򍧝𭌞󹰞󙴋𿋓󧜹򳔎񦰓򅆹򽐟󂑛򶃯㾱ꕽ񂊛򉙲򅤎􃖣󻣸󁸦򴏜򽃿􄑏󠦱񄛲񄕵񡿚򮩒񡏂򨆯򶚒󎮆󉨗򡮟򆿴񬏪򻀅㫑񉒗󴍶󬪸񝶑񂾑򇔣򉩉􂞇𲡀𨫆򤵇𲺝\u001c񖂟񳐉󲔹𳨬􀮔𸒙񜶻㊬񓐊񽒬󑀧󗧚󞌶󦥥𗌽𘀍󴼹􌇺򫗛񂷶󏷕񜁍񥬟󼁁󓺉𗟒򷝊𩕃񞝏񧄀󁲩򐀄򳂸񲊷򃀋񃫫𝷏򏖝򷂍󢭣􋛨𞪒򁁅勸󯩥󵪭񚮚򻡍騎񾊯򪓚񗡈񎕫򡯬񋫠ᕴ𞨹󾄇񩠶𙯾񢥱𚯴񬥷󢶖񾹌񡈟򧓑񒾘𚸯񳗺񭟡𫸬񷤖񷆐𖋌񦰃椀𫎾󗚋𿋆󈝰񺥲򝕊𵯮򙧚󬱃󍗞󱆃󂟙󟆺񻢬󸮤󗗉񉛮𺵡𰣒􁋙񻍛􇡘ᮍ񕥸񨵂盕嗪𻸮򶆍򊈤񽓎󙴐𗬜󾱒󷹰􇡈񨦎􏥩񴲡𨑮򱏝𭢊󕁶򣙥󶡮󮰌󿙾氕񼻘􆔪񢕀񊿃󮨝񑛖󣴊󎎏򳞓㊁󒭀󇜳𯄌𻙩"
+true
+
+# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors)
+@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text
+"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK"
+true
+
 @uri
 "\u03bc"
 "%CE%BC"

From 2e1b5d22473addd975e0ab5cde169a997436b212 Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sat, 22 Jul 2023 09:55:08 +1200
Subject: [PATCH 5/8] Update `@uri` to handle binary strings

---
 src/builtin.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/builtin.c b/src/builtin.c
index ea419db1e7..bbea10212a 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -657,16 +657,19 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
     while (*p) unreserved[(int)*p++] = 1;
 
     jv line = jv_string("");
-    const char* s = jv_string_value(input);
-    for (int i=0; i<jv_string_length_bytes(jv_copy(input)); i++) {
-      unsigned ch = (unsigned)(unsigned char)*s;
-      if (ch < 128 && unreserved[ch]) {
-        line = jv_string_append_buf(line, s, 1);
-      } else {
-        line = jv_string_concat(line, jv_string_fmt("%%%02X", ch));
+    const char *start = jv_string_value(input);
+    const char *end = start + jv_string_length_bytes(jv_copy(input));
+    const char *bytes;
+    uint32_t bytes_len;
+    while ((start = jvp_utf8_wtf_next_bytes(start, end, &bytes, &bytes_len)))
+      for (uint32_t i = 0; i < bytes_len; i++) {
+        unsigned ch = (unsigned)(unsigned char)bytes[i];
+        if (ch < 128 && unreserved[ch]) {
+          line = jv_string_append_buf(line, &bytes[i], 1);
+        } else {
+          line = jv_string_concat(line, jv_string_fmt("%%%02X", ch));
+        }
       }
-      s++;
-    }
     jv_free(input);
     return line;
   } else if (!strcmp(fmt_s, "sh")) {

From f68f25b9d2c0dbf8fb351f745b569065ba571174 Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sat, 22 Jul 2023 16:46:52 +1200
Subject: [PATCH 6/8] Preserve UTF-8 and UTF-16 errors in `explode`

Errors are emitted as negative code points instead of being transformed into
replacement characters. `implode` is also updated accordingly so the original
string can be reconstructed without data loss.
---
 docs/content/manual/manual.yml |  3 ++-
 jq.1.prebuilt                  |  2 +-
 src/jv.c                       | 15 +++++++++++++--
 tests/jq.test                  |  4 ++--
 tests/shtest                   |  2 ++
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index c44ad3818a..4d1050b183 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -1699,7 +1699,8 @@ sections:
         body: |
 
           Converts an input string into an array of the string's
-          codepoint numbers.
+          codepoint numbers. Ill-formed Unicode is represented using
+          negative numbers.
 
         examples:
           - program: 'explode'
diff --git a/jq.1.prebuilt b/jq.1.prebuilt
index 490791b831..1f3f49d795 100644
--- a/jq.1.prebuilt
+++ b/jq.1.prebuilt
@@ -1849,7 +1849,7 @@ jq \'[\.[]|rtrimstr("foo")]\'
 .IP "" 0
 .
 .SS "explode"
-Converts an input string into an array of the string\'s codepoint numbers\.
+Converts an input string into an array of the string\'s codepoint numbers\. Ill\-formed Unicode is represented using negative numbers\.
 .
 .IP "" 4
 .
diff --git a/src/jv.c b/src/jv.c
index d44f04b5b7..f81c29fc4f 100644
--- a/src/jv.c
+++ b/src/jv.c
@@ -1354,8 +1354,13 @@ jv jv_string_explode(jv j) {
   const char* end = i + len;
   jv a = jv_array_sized(len);
   int c;
-  while ((i = jvp_utf8_next(i, end, &c)))
+  while ((i = jvp_utf8_wtf_next(i, end, JVP_UTF8_ERRORS_ALL, &c))) {
+    // UTF-16 errors are emitted as negative integers to clearly distinguish them from valid Unicode text
+    // UTF-8 errors are already negated when using `JVP_UTF8_ERRORS_ALL`
+    if (c >= 0xD800 && c <= 0xDFFF)
+      c = -c;
     a = jv_array_append(a, jv_number(c));
+  }
   jv_free(j);
   return a;
 }
@@ -1373,7 +1378,13 @@ jv jv_string_implode(jv j) {
     assert(JVP_HAS_KIND(n, JV_KIND_NUMBER));
     int nv = jv_number_value(n);
     jv_free(n);
-    if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF)
+    // UTF-16 errors are represented as negative integers to clearly distinguish them from valid Unicode text
+    if (nv >= -0xDFFF && nv <= -0xD800) {
+      // convert negative UTF-16 errors into positive errors as expected by `jv_string_append_codepoint`
+      nv = -nv;
+    } else if (nv >= -0xFF && nv <= -0x80) {
+      // negative UTF-8 errors are already in the representation expected by `jv_string_append_codepoint`
+    } else if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF)
       nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
     s = jv_string_append_codepoint(s, nv);
   }
diff --git a/tests/jq.test b/tests/jq.test
index 068bf31494..1debbe4a71 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -58,9 +58,9 @@ null
 "Aa\u000d\u000a\u0009\u0008\u000c\u03bc"
 
 # Check that unpaired surrogates are preserved in output
-"\u2200\ud800\u2203\udc00\u2205\udfff"
+"\u2200\ud800\u2203\udc00\u2205\udfff" | "\(.)\(explode | implode)"
 null
-"∀\ud800∃\udc00∅\udfff"
+"∀\ud800∃\udc00∅\udfff∀\ud800∃\udc00∅\udfff"
 
 # Check that unpaired surrogates are paired when concatenated
 add
diff --git a/tests/shtest b/tests/shtest
index 36b7205bf2..cf2bc6dd6b 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -132,6 +132,8 @@ if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
     cmp $d/out $d/rand
     $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out
     cmp $d/out $d/rand
+    $VALGRIND $Q $JQ -j 'explode | implode' $d/out.json >$d/out
+    cmp $d/out $d/rand
     base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
     cmp $d/out $d/rand
     $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out

From 5c2fe323ffe1a47f6ff03ccac29fb6379bf6d93d Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sun, 23 Jul 2023 01:15:00 +1200
Subject: [PATCH 7/8] Remove UTF-8 backtracking workaround

This is no longer needed as strings are capable of storing partial UTF-8
sequences.
---
 src/jv_file.c    | 12 ++----------
 src/jv_unicode.c | 23 -----------------------
 src/jv_unicode.h |  1 -
 3 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/src/jv_file.c b/src/jv_file.c
index b10bcc0b5c..a4514e220b 100644
--- a/src/jv_file.c
+++ b/src/jv_file.c
@@ -39,21 +39,13 @@ jv jv_load_file(const char* filename, int raw) {
     parser = jv_parser_new(0);
   }
 
-  // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
-  // buffer, we need to be able to read the remainder of a sequence and add that
-  // before appending.
-  const int max_utf8_len = 4;
-  char buf[4096+max_utf8_len];
+  char buf[4096];
   while (!feof(file) && !ferror(file)) {
-    size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
+    size_t n = fread(buf, 1, sizeof(buf), file);
     int len = 0;
 
     if (n == 0)
       continue;
-    if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0 &&
-        !feof(file) && !ferror(file)) {
-      n += fread(buf+n, 1, len, file);
-    }
 
     if (raw) {
       data = jv_string_append_buf(data, buf, n);
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index a8858a2cf4..cb03cf833e 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -67,29 +67,6 @@ static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) {
   return 1;
 }
 
-// jvp_utf8_backtrack returns the beginning of the last codepoint in the
-// string, assuming that start is the last byte in the string.
-// If the last codepoint is incomplete, returns the number of missing bytes via
-// *missing_bytes.  If there are no leading bytes or an invalid byte is
-// encountered, NULL is returned and *missing_bytes is not altered.
-const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) {
-  assert(min <= start);
-  if (min == start) {
-    return min;
-  }
-  int length = 0;
-  int seen = 1;
-  while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) {
-    start--;
-    seen++;
-  }
-  if (length == 0 || length == UTF8_CONTINUATION_BYTE || length - seen < 0) {
-    return NULL;
-  }
-  if (missing_bytes) *missing_bytes = length - seen;
-  return start;
-}
-
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
   return jvp_utf8_wtf_next(in, end, JVP_UTF8_REPLACE, codepoint_ret);
 }
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 33005e6f68..4c287cc125 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -13,7 +13,6 @@ enum jvp_utf8_flags {
   JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8
 };
 
-const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes);
 const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint);
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint);
 const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len);

From 911d01aaa5bd33137fadf028b9c3b4f86171b542 Mon Sep 17 00:00:00 2001
From: Max Zerzouri <maxdamantus@gmail.com>
Date: Sun, 23 Jul 2023 02:06:11 +1200
Subject: [PATCH 8/8] tests/shtest: fix use of base64 command for macOS
 compatibility

---
 tests/shtest | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/shtest b/tests/shtest
index cf2bc6dd6b..23b9837347 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -134,7 +134,7 @@ if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then
     cmp $d/out $d/rand
     $VALGRIND $Q $JQ -j 'explode | implode' $d/out.json >$d/out
     cmp $d/out $d/rand
-    base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
+    base64 <$d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out
     cmp $d/out $d/rand
     $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out
     cmp $d/out $d/rand