jqlang · Maxdamantus · May 15, 2021 · Jul 21, 2023 · May 16, 2021 · May 25, 2021
diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
@@ -1699,7 +1699,8 @@ sections:
         body: |
 
           Converts an input string into an array of the string's
-          codepoint numbers.
+          codepoint numbers. Ill-formed Unicode (invalid UTF-8 bytes and
+          unpaired surrogates) is represented using negative numbers.
 
         examples:
           - program: 'explode'
@@ -2040,7 +2041,6 @@ sections:
           * `@base64d`:
 
             The inverse of `@base64`, input is decoded as specified by RFC 4648.
-            Note\: If the decoded string is not UTF-8, the results are undefined.
 
           This syntax can be combined with string interpolation in a
           useful way. You can follow a `@foo` token with a string

diff --git a/jq.1.prebuilt b/jq.1.prebuilt
diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py
@@ -16,8 +16,7 @@ def print_table(type, name, t):
 def utf8info(c):
     if c < 0x80: return 1, mask(7)
     if 0x80 <= c <= 0xBF: return 255, mask(6)
-    if 0xC0 <= c <= 0xC1: return 0, 0
-    if 0xC2 <= c <= 0xDF: return 2, mask(5)
+    if 0xC0 <= c <= 0xDF: return 2, mask(5)
     if 0xE0 <= c <= 0xEF: return 3, mask(4)
     if 0xF0 <= c <= 0xF4: return 4, mask(3)
     if 0xF4 <= c <= 0xFF: return 0, 0

diff --git a/src/builtin.c b/src/builtin.c
@@ -470,7 +470,7 @@ static jv f_dump(jq_state *jq, jv input) {
 static jv f_json_parse(jq_state *jq, jv input) {
   if (jv_get_kind(input) != JV_KIND_STRING)
     return type_error(input, "only strings can be parsed");
-  jv res = jv_parse_sized(jv_string_value(input),
+  jv res = jv_parse_wtf_sized(jv_string_value(input),
                           jv_string_length_bytes(jv_copy(input)));
   jv_free(input);
   return res;
@@ -520,7 +520,15 @@ static jv f_tostring(jq_state *jq, jv input) {
 static jv f_utf8bytelength(jq_state *jq, jv input) {
   if (jv_get_kind(input) != JV_KIND_STRING)
     return type_error(input, "only strings have UTF-8 byte length");
-  return jv_number(jv_string_length_bytes(input));
+  const char* i = jv_string_value(input);
+  const char* end = i + jv_string_length_bytes(jv_copy(input));
+  uint32_t len = 0;
+  const char *bytes;
+  uint32_t bytes_len;
+  while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len)))
+    len += bytes_len;
+  jv_free(input);
+  return jv_number(len);
 }
 
 #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
@@ -649,16 +657,19 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
     while (*p) unreserved[(int)*p++] = 1;
 
     jv line = jv_string("");
-    const char* s = jv_string_value(input);
-    for (int i=0; i<jv_string_length_bytes(jv_copy(input)); i++) {
-      unsigned ch = (unsigned)(unsigned char)*s;
-      if (ch < 128 && unreserved[ch]) {
-        line = jv_string_append_buf(line, s, 1);
-      } else {
-        line = jv_string_concat(line, jv_string_fmt("%%%02X", ch));
+    const char *start = jv_string_value(input);
+    const char *end = start + jv_string_length_bytes(jv_copy(input));
+    const char *bytes;
+    uint32_t bytes_len;
+    while ((start = jvp_utf8_wtf_next_bytes(start, end, &bytes, &bytes_len)))
+      for (uint32_t i = 0; i < bytes_len; i++) {
+        unsigned ch = (unsigned)(unsigned char)bytes[i];
+        if (ch < 128 && unreserved[ch]) {
+          line = jv_string_append_buf(line, &bytes[i], 1);
+        } else {
+          line = jv_string_concat(line, jv_string_fmt("%%%02X", ch));
+        }
       }
-      s++;
-    }
     jv_free(input);
     return line;
   } else if (!strcmp(fmt_s, "sh")) {
@@ -695,21 +706,36 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
     jv_free(fmt);
     input = f_tostring(jq, input);
     jv line = jv_string("");
-    const unsigned char* data = (const unsigned char*)jv_string_value(input);
-    int len = jv_string_length_bytes(jv_copy(input));
-    for (int i=0; i<len; i+=3) {
-      uint32_t code = 0;
-      int n = len - i >= 3 ? 3 : len-i;
-      for (int j=0; j<3; j++) {
+    const char* i = jv_string_value(input);
+    const char* end = i + jv_string_length_bytes(jv_copy(input));
+    uint32_t code = 0;
+    int n = 0;
+    const char *bytes;
+    uint32_t bytes_len;
+    while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len))) {
+      unsigned char *ubuf = (unsigned char *)bytes;
+      for (uint32_t x = 0; x < bytes_len; x++) {
         code <<= 8;
-        code |= j < n ? (unsigned)data[i+j] : 0;
+        code |= ubuf[x];
+        if (++n == 3) {
+          char buf[4];
+          for (int j = 0; j < 4; j++)
+            buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
+          line = jv_string_append_buf(line, buf, sizeof(buf));
+          n = 0;
+          code = 0;
+        }
       }
+    }
+    if (n > 0) {
+      assert(n < 3);
+      code <<= 8*(3 - n);
       char buf[4];
-      for (int j=0; j<4; j++) {
+      for (int j = 0; j < 4; j++)
         buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
-      }
-      if (n < 3) buf[3] = '=';
-      if (n < 2) buf[2] = '=';
+      buf[3] = '=';
+      if (n < 2)
+        buf[2] = '=';
       line = jv_string_append_buf(line, buf, sizeof(buf));
     }
     jv_free(input);

diff --git a/src/jv.c b/src/jv.c
@@ -1085,20 +1085,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
   return s;
 }
 
-/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
+/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */
 static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
   const char* end = data + length;
   const char* i = data;
   const char* cstart;
 
-  uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
+  uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX
   jvp_string* s = jvp_string_alloc(maxlength);
   char* out = s->data;
   int c = 0;
 
-  while ((i = jvp_utf8_next((cstart = i), end, &c))) {
+  while ((i = jvp_utf8_wtf_next((cstart = i), end, 0, &c))) {
     if (c == -1) {
-      c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+      int error = (unsigned char)*cstart;
+      assert(error >= 0x80 && error <= 0xFF);
+      c = -error;
+      /* Ensure each UTF-8 error byte is consumed separately */
+      i = cstart + 1;
     }
     out += jvp_utf8_encode(c, out);
     assert(out < s->data + maxlength);
@@ -1110,8 +1114,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
   return r;
 }
 
-/* Assumes valid UTF8 */
-static jv jvp_string_new(const char* data, uint32_t length) {
+/* Assumes valid WTF-8b */
+jv jv_string_wtf_sized(const char* data, int length) {
   jvp_string* s = jvp_string_alloc(length);
   s->length_hashed = length << 1;
   if (data != NULL)
@@ -1151,20 +1155,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
   jvp_string* s = jvp_string_ptr(string);
   uint32_t currlen = jvp_string_length(s);
 
+  char join_buf[4];
+  int join_len = jvp_utf8_wtf_join(s->data, &currlen, &data, &len, join_buf);
+
   if (jvp_refcnt_unshared(string.u.ptr) &&
-      jvp_string_remaining_space(s) >= len) {
+      jvp_string_remaining_space(s) >= join_len + len) {
     // the next string fits at the end of a
+    memcpy(s->data + currlen, join_buf, join_len);
+    currlen += join_len;
     memcpy(s->data + currlen, data, len);
     s->data[currlen + len] = 0;
     s->length_hashed = (currlen + len) << 1;
     return string;
   } else {
     // allocate a bigger buffer and copy
-    uint32_t allocsz = (currlen + len) * 2;
+    uint32_t allocsz = (currlen + join_len + len) * 2;
     if (allocsz < 32) allocsz = 32;
     jvp_string* news = jvp_string_alloc(allocsz);
-    news->length_hashed = (currlen + len) << 1;
+    news->length_hashed = (currlen + join_len + len) << 1;
     memcpy(news->data, s->data, currlen);
+    memcpy(news->data + currlen, join_buf, join_len);
+    currlen += join_len;
     memcpy(news->data + currlen, data, len);
     news->data[currlen + len] = 0;
     jvp_string_free(string);
@@ -1252,7 +1263,7 @@ static int jvp_string_equal(jv a, jv b) {
 jv jv_string_sized(const char* str, int len) {
   return
     jvp_utf8_is_valid(str, str+len) ?
-    jvp_string_new(str, len) :
+    jv_string_wtf_sized(str, len) :
     jvp_string_copy_replace_bad(str, len);
 }
 
@@ -1318,14 +1329,14 @@ jv jv_string_split(jv j, jv sep) {
 
   if (seplen == 0) {
     int c;
-    while ((jstr = jvp_utf8_next(jstr, jend, &c)))
+    while ((jstr = jvp_utf8_wtf_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c)))
       a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c));
   } else {
     for (p = jstr; p < jend; p = s + seplen) {
       s = _jq_memmem(p, jend - p, sepstr, seplen);
       if (s == NULL)
         s = jend;
-      a = jv_array_append(a, jv_string_sized(p, s - p));
+      a = jv_array_append(a, jv_string_wtf_sized(p, s - p));
       // Add an empty string to denote that j ends on a sep
       if (s + seplen == jend && seplen != 0)
         a = jv_array_append(a, jv_string(""));
@@ -1343,8 +1354,13 @@ jv jv_string_explode(jv j) {
   const char* end = i + len;
   jv a = jv_array_sized(len);
   int c;
-  while ((i = jvp_utf8_next(i, end, &c)))
+  while ((i = jvp_utf8_wtf_next(i, end, JVP_UTF8_ERRORS_ALL, &c))) {
+    // UTF-16 errors are emitted as negative integers to clearly distinguish them from valid Unicode text
+    // UTF-8 errors are already negated when using `JVP_UTF8_ERRORS_ALL`
+    if (c >= 0xD800 && c <= 0xDFFF)
+      c = -c;
     a = jv_array_append(a, jv_number(c));
+  }
   jv_free(j);
   return a;
 }
@@ -1362,7 +1378,13 @@ jv jv_string_implode(jv j) {
     assert(JVP_HAS_KIND(n, JV_KIND_NUMBER));
     int nv = jv_number_value(n);
     jv_free(n);
-    if (nv > 0x10FFFF)
+    // UTF-16 errors are represented as negative integers to clearly distinguish them from valid Unicode text
+    if (nv >= -0xDFFF && nv <= -0xD800) {
+      // convert negative UTF-16 errors into positive errors as expected by `jv_string_append_codepoint`
+      nv = -nv;
+    } else if (nv >= -0xFF && nv <= -0x80) {
+      // negative UTF-8 errors are already in the representation expected by `jv_string_append_codepoint`
+    } else if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF)
       nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
     s = jv_string_append_codepoint(s, nv);
   }
@@ -1397,7 +1419,7 @@ jv jv_string_slice(jv j, int start, int end) {
 
   /* Look for byte offset corresponding to start codepoints */
   for (p = s, i = 0; i < start; i++) {
-    p = jvp_utf8_next(p, s + len, &c);
+    p = jvp_utf8_wtf_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c);
     if (p == NULL) {
       jv_free(j);
       return jv_string_empty(16);
@@ -1409,7 +1431,7 @@ jv jv_string_slice(jv j, int start, int end) {
   }
   /* Look for byte offset corresponding to end codepoints */
   for (e = p; e != NULL && i < end; i++) {
-    e = jvp_utf8_next(e, s + len, &c);
+    e = jvp_utf8_wtf_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c);
     if (e == NULL) {
       e = s + len;
       break;
@@ -1427,7 +1449,7 @@ jv jv_string_slice(jv j, int start, int end) {
    * memory like a drunken navy programmer.  There's probably nothing we
    * can do about it.
    */
-  res = jv_string_sized(p, e - p);
+  res = jv_string_wtf_sized(p, e - p);
   jv_free(j);
   return res;
 }

diff --git a/src/jv.h b/src/jv.h
@@ -107,6 +107,7 @@ jv jv_array_indexes(jv, jv);
 
 jv jv_string(const char*);
 jv jv_string_sized(const char*, int);
+jv jv_string_wtf_sized(const char*, int);
 jv jv_string_empty(int len);
 int jv_string_length_bytes(jv);
 int jv_string_length_codepoints(jv);
@@ -227,6 +228,7 @@ enum {
 
 jv jv_parse(const char* string);
 jv jv_parse_sized(const char* string, int length);
+jv jv_parse_wtf_sized(const char* string, int length);
 
 typedef void (*jv_nomem_handler_f)(void *);
 void jv_nomem_handler(jv_nomem_handler_f, void *);

diff --git a/src/jv_file.c b/src/jv_file.c
@@ -39,21 +39,13 @@ jv jv_load_file(const char* filename, int raw) {
     parser = jv_parser_new(0);
   }
 
-  // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
-  // buffer, we need to be able to read the remainder of a sequence and add that
-  // before appending.
-  const int max_utf8_len = 4;
-  char buf[4096+max_utf8_len];
+  char buf[4096];
   while (!feof(file) && !ferror(file)) {
-    size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
+    size_t n = fread(buf, 1, sizeof(buf), file);
     int len = 0;
 
     if (n == 0)
       continue;
-    if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0 &&
-        !feof(file) && !ferror(file)) {
-      n += fread(buf+n, 1, len, file);
-    }
 
     if (raw) {
       data = jv_string_append_buf(data, buf, n);