From 067e68291db7a5ce0aaf03050017f11b955ccf3f Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Sat, 15 May 2021 10:50:15 +0000 Subject: [PATCH 1/8] jv_string_implode: avoid producing unprintable string from reserved code points --- src/jv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jv.c b/src/jv.c index 498a14149d..c1d48bce07 100644 --- a/src/jv.c +++ b/src/jv.c @@ -1362,7 +1362,7 @@ jv jv_string_implode(jv j) { assert(JVP_HAS_KIND(n, JV_KIND_NUMBER)); int nv = jv_number_value(n); jv_free(n); - if (nv > 0x10FFFF) + if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF) nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER s = jv_string_append_codepoint(s, nv); } From 6aff473b771cd022ab7b43b6b5db3e61678c0aa0 Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Fri, 21 Jul 2023 19:34:12 +1200 Subject: [PATCH 2/8] Binary strings: preserve UTF-8 and UTF-16 errors The internal string representation is changed from UTF-8 with replacement characters to a modified form of "WTF-8" that is able to distinctly encode UTF-8 errors and UTF-16 errors. This handles UTF-8 errors in raw string inputs and handles UTF-8 and UTF-16 errors in JSON input. UTF-16 errors (using "\uXXXX") and UTF-8 errors (using the original raw bytes) are maintained when emitting JSON. When emitting raw strings, UTF-8 errors are maintained and UTF-16 errors are converted into replacement characters. 
--- scripts/gen_utf8_tables.py | 3 +- src/jv.c | 28 ++++---- src/jv.h | 1 + src/jv_parse.c | 77 +++++++++++++------- src/jv_print.c | 26 ++++++- src/jv_unicode.c | 143 ++++++++++++++++++++++++++++++++++--- src/jv_unicode.h | 12 ++++ src/jv_utf8_tables.h | 4 +- src/main.c | 9 ++- tests/jq.test | 5 ++ tests/shtest | 9 +++ 11 files changed, 264 insertions(+), 53 deletions(-) diff --git a/scripts/gen_utf8_tables.py b/scripts/gen_utf8_tables.py index 6fe0a5312b..7706462351 100644 --- a/scripts/gen_utf8_tables.py +++ b/scripts/gen_utf8_tables.py @@ -16,8 +16,7 @@ def print_table(type, name, t): def utf8info(c): if c < 0x80: return 1, mask(7) if 0x80 <= c <= 0xBF: return 255, mask(6) - if 0xC0 <= c <= 0xC1: return 0, 0 - if 0xC2 <= c <= 0xDF: return 2, mask(5) + if 0xC0 <= c <= 0xDF: return 2, mask(5) if 0xE0 <= c <= 0xEF: return 3, mask(4) if 0xF0 <= c <= 0xF4: return 4, mask(3) if 0xF4 <= c <= 0xFF: return 0, 0 diff --git a/src/jv.c b/src/jv.c index c1d48bce07..92d8336faf 100644 --- a/src/jv.c +++ b/src/jv.c @@ -1085,20 +1085,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) { return s; } -/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */ +/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { const char* end = data + length; const char* i = data; const char* cstart; - uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD + uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX jvp_string* s = jvp_string_alloc(maxlength); char* out = s->data; int c = 0; - while ((i = jvp_utf8_next((cstart = i), end, &c))) { + while ((i = jvp_utf8_wtf_next((cstart = i), end, 0, &c))) { if (c == -1) { - c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER + int error = (unsigned char)*cstart; + assert(error >= 0x80 && error <= 0xFF); + c = -error; + /* Ensure each UTF-8 error byte is consumed 
separately */ + i = cstart + 1; } out += jvp_utf8_encode(c, out); assert(out < s->data + maxlength); @@ -1110,8 +1114,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) { return r; } -/* Assumes valid UTF8 */ -static jv jvp_string_new(const char* data, uint32_t length) { +/* Assumes valid WTF-8b */ +jv jv_string_wtf_sized(const char* data, int length) { jvp_string* s = jvp_string_alloc(length); s->length_hashed = length << 1; if (data != NULL) @@ -1252,7 +1256,7 @@ static int jvp_string_equal(jv a, jv b) { jv jv_string_sized(const char* str, int len) { return jvp_utf8_is_valid(str, str+len) ? - jvp_string_new(str, len) : + jv_string_wtf_sized(str, len) : jvp_string_copy_replace_bad(str, len); } @@ -1318,14 +1322,14 @@ jv jv_string_split(jv j, jv sep) { if (seplen == 0) { int c; - while ((jstr = jvp_utf8_next(jstr, jend, &c))) + while ((jstr = jvp_utf8_wtf_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c))) a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c)); } else { for (p = jstr; p < jend; p = s + seplen) { s = _jq_memmem(p, jend - p, sepstr, seplen); if (s == NULL) s = jend; - a = jv_array_append(a, jv_string_sized(p, s - p)); + a = jv_array_append(a, jv_string_wtf_sized(p, s - p)); // Add an empty string to denote that j ends on a sep if (s + seplen == jend && seplen != 0) a = jv_array_append(a, jv_string("")); @@ -1397,7 +1401,7 @@ jv jv_string_slice(jv j, int start, int end) { /* Look for byte offset corresponding to start codepoints */ for (p = s, i = 0; i < start; i++) { - p = jvp_utf8_next(p, s + len, &c); + p = jvp_utf8_wtf_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c); if (p == NULL) { jv_free(j); return jv_string_empty(16); @@ -1409,7 +1413,7 @@ jv jv_string_slice(jv j, int start, int end) { } /* Look for byte offset corresponding to end codepoints */ for (e = p; e != NULL && i < end; i++) { - e = jvp_utf8_next(e, s + len, &c); + e = jvp_utf8_wtf_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c); if (e == NULL) { e = s + len; 
break; @@ -1427,7 +1431,7 @@ jv jv_string_slice(jv j, int start, int end) { * memory like a drunken navy programmer. There's probably nothing we * can do about it. */ - res = jv_string_sized(p, e - p); + res = jv_string_wtf_sized(p, e - p); jv_free(j); return res; } diff --git a/src/jv.h b/src/jv.h index 8c96f822f0..8a328ec91b 100644 --- a/src/jv.h +++ b/src/jv.h @@ -107,6 +107,7 @@ jv jv_array_indexes(jv, jv); jv jv_string(const char*); jv jv_string_sized(const char*, int); +jv jv_string_wtf_sized(const char*, int); jv jv_string_empty(int len); int jv_string_length_bytes(jv); int jv_string_length_codepoints(jv); diff --git a/src/jv_parse.c b/src/jv_parse.c index 3a8718ae82..a7e1d2463b 100644 --- a/src/jv_parse.c +++ b/src/jv_parse.c @@ -428,7 +428,7 @@ static void tokenadd(struct jv_parser* p, char c) { p->tokenbuf[p->tokenpos++] = c; } -static int unhex4(char* hex) { +static int unhex4(const char* hex) { int r = 0; for (int i=0; i<4; i++) { char c = *hex++; @@ -444,15 +444,19 @@ static int unhex4(char* hex) { } static pfunc found_string(struct jv_parser* p) { - char* in = p->tokenbuf; - char* out = p->tokenbuf; - char* end = p->tokenbuf + p->tokenpos; - - while (in < end) { - char c = *in++; + const char* in = p->tokenbuf; + // start by writing to tokenbuf, only allocate in case that output size is greater than input size (possible only when input has UTF-8 errors) + char* newbuf = NULL; + char* buf = p->tokenbuf; + char* out = buf; + const char* end = p->tokenbuf + p->tokenpos; + const char* cstart; + int c; + + while ((in = jvp_utf8_wtf_next((cstart = in), end, 0, &c))) { if (c == '\\') { if (in >= end) - return "Expected escape character at end of string"; + return jv_mem_free(newbuf), "Expected escape character at end of string"; c = *in++; switch (c) { case '\\': @@ -467,38 +471,61 @@ static pfunc found_string(struct jv_parser* p) { case 'u': /* ahh, the complicated case */ if (in + 4 > end) - return "Invalid \\uXXXX escape"; + return jv_mem_free(newbuf), 
"Invalid \\uXXXX escape"; int hexvalue = unhex4(in); if (hexvalue < 0) - return "Invalid characters in \\uXXXX escape"; + return jv_mem_free(newbuf), "Invalid characters in \\uXXXX escape"; unsigned long codepoint = (unsigned long)hexvalue; in += 4; + // leading surrogate if (0xD800 <= codepoint && codepoint <= 0xDBFF) { - /* who thought UTF-16 surrogate pairs were a good idea? */ - if (in + 6 > end || in[0] != '\\' || in[1] != 'u') - return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; - unsigned long surrogate = unhex4(in+2); - if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF)) - return "Invalid \\uXXXX\\uXXXX surrogate pair escape"; - in += 6; - codepoint = 0x10000 + (((codepoint - 0xD800) << 10) - |(surrogate - 0xDC00)); + // look ahead for trailing surrogate and decode as UTF-16, otherwise encode this lone surrogate as WTF-8 + if (in + 6 <= end && in[0] == '\\' && in[1] == 'u') { + unsigned long surrogate = unhex4(in+2); + if (0xDC00 <= surrogate && surrogate <= 0xDFFF) { + in += 6; + codepoint = 0x10000 + (((codepoint - 0xD800) << 10) + |(surrogate - 0xDC00)); + } + } } - if (codepoint > 0x10FFFF) - codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER + // UTF-16 surrogates can not encode a greater codepoint + assert(codepoint <= 0x10FFFF); + // NOTE: a leading or trailing surrogate here (0xD800 <= codepoint && codepoint <= 0xDFFF) is encoded as WTF-8 out += jvp_utf8_encode(codepoint, out); break; default: - return "Invalid escape"; + return jv_mem_free(newbuf), "Invalid escape"; } } else { if (c > 0 && c < 0x001f) - return "Invalid string: control characters from U+0000 through U+001F must be escaped"; - *out++ = c; + return jv_mem_free(newbuf), "Invalid string: control characters from U+0000 through U+001F must be escaped"; + if (c == -1) { + int error = (unsigned char)*cstart; + assert(error >= 0x80 && error <= 0xFF); + c = -error; + /* Ensure each UTF-8 error byte is consumed separately */ + const int wtf8_length = 2; + assert(jvp_utf8_encode_length(c) 
== wtf8_length); + in = cstart + 1; + if (newbuf == NULL && out + wtf8_length > in) { + /* Output is about to overflow input, move output to temporary buffer */ + int current_size = out - p->tokenbuf; + int remaining = end - cstart; + newbuf = jv_mem_alloc(current_size + remaining * wtf8_length); // worst case: all remaining bad bytes, each becomes a 2-byte overlong U+XX + memcpy(newbuf, buf, current_size); + buf = newbuf; + out = buf + current_size; + } + } else + assert(jvp_utf8_encode_length(c) == in - cstart); + out += jvp_utf8_encode(c, out); } } - TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf))); + jv v = jv_string_wtf_sized(buf, out - buf); + jv_mem_free(newbuf); + TRY(value(p, v)); p->tokenpos = 0; return 0; } diff --git a/src/jv_print.c b/src/jv_print.c index d1db88aa89..7c4258ee3b 100644 --- a/src/jv_print.c +++ b/src/jv_print.c @@ -98,6 +98,16 @@ static void put_char(char c, FILE* fout, jv* strout, int T) { put_buf(&c, 1, fout, strout, T); } +static void put_invalid_utf8_byte(int c, FILE* fout, jv* strout, int T) { + assert(c >= 0x80 && c <= 0xFF); + if (strout) { + // encode as an invalid UTF-8 byte in output + *strout = jv_string_append_codepoint(*strout, -c); + } else { + put_char(c, fout, strout, T); + } +} + static void put_str(const char* s, FILE* fout, jv* strout, int T) { put_buf(s, strlen(s), fout, strout, T); } @@ -121,7 +131,7 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { int c = 0; char buf[32]; put_char('"', F, S, T); - while ((i = jvp_utf8_next((cstart = i), end, &c))) { + while ((i = jvp_utf8_wtf_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) { assert(c != -1); int unicode_escape = 0; if (0x20 <= c && c <= 0x7E) { @@ -130,6 +140,17 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { put_char('\\', F, S, T); } put_char(c, F, S, T); + } else if (c >= -0xFF && c <= -0x80) { + // Invalid UTF-8 byte + if (ascii_only) { + // refusing to emit invalid UTF-8 + // TODO: 
convince the world to adopt a "\xXX" notation for JSON? + c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER + unicode_escape = 1; + } else { + // pass through + put_invalid_utf8_byte(-c, F, S, T); + } } else if (c < 0x20 || c == 0x7F) { // ASCII control character switch (c) { @@ -160,6 +181,9 @@ static void jvp_dump_string(jv str, int ascii_only, FILE* F, jv* S, int T) { } else { if (ascii_only) { unicode_escape = 1; + } else if (c >= 0xD800 && c <= 0xDFFF) { + // lone surrogate; can't be encoded to UTF-8 + unicode_escape = 1; } else { put_buf(cstart, i - cstart, F, S, T); } diff --git a/src/jv_unicode.c b/src/jv_unicode.c index d197349f48..cbd812b454 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -27,6 +27,112 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_ } const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + return jvp_utf8_wtf_next(in, end, JVP_UTF8_REPLACE, codepoint_ret); +} + +// jvp_utf8_wtf_next_bytes iterates through chunks of UTF-8 bytes represented +// by a WTF-8b string. *bytes_out is set to the start of the current chunk and +// *bytes_len is set to the size of the current chunk. Valid sequences of UTF-8 +// bytes are emitted as maximally sized chunks (pointing into the `in` string). +// Ill-formed UTF-8 bytes are emitted individually (pointing into a static +// array containing the byte). Ill-formed UTF-16 code units are emitted as +// UTF-8 replacement characters (pointing into a static array containing the +// bytes for U+FFFD). 
+const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len) { + // U+FFFD REPLACEMENT CHARACTER + static const unsigned char UTF8_REPLACEMENT[] = {0xEF,0xBF,0xBD}; + // array of bytes from 0x80 to 0xFF (inclusive) + static const unsigned char UTF8_ILL_FORMED[] = { + #define ROW(x) \ + x + 0, x + 1, x + 2, x + 3, \ + x + 4, x + 5, x + 6, x + 7, \ + x + 8, x + 9, x + 10, x + 11, \ + x + 12, x + 13, x + 14, x + 15 + ROW(0x80), ROW(0x90), ROW(0xA0), ROW(0xB0), + ROW(0xC0), ROW(0xD0), ROW(0xE0), ROW(0xF0) + #undef ROW + }; + + const char* i = in; + const char* cstart; + int c; + + while ((i = jvp_utf8_wtf_next((cstart = i), end, JVP_UTF8_ERRORS_ALL, &c))) { + if (c >= -0xFF && c <= -0x80) { + // invalid UTF-8 byte; pass through + if (cstart > in) { + // can't emit single byte yet; emit previous chunk first + break; + } + *bytes_len = 1; + *bytes_out = (const char*)&UTF8_ILL_FORMED[-c - 0x80]; + return i; + } + if (c >= 0xD800 && c <= 0xDFFF) { + // lone surrogate; can't be encoded to UTF-8 + if (cstart > in) { + // can't emit replacement bytes yet; emit previous chunk first + break; + } + *bytes_len = sizeof UTF8_REPLACEMENT; + *bytes_out = (const char*)UTF8_REPLACEMENT; + return i; + } + } + + uint32_t len = cstart - in; + *bytes_len = len; + *bytes_out = in; + return len == 0? NULL : cstart; +} + +/* + The internal representation of jv strings uses an encoding that is hereby + referred to as "WTF-8b" (until someone demonstrates use of another term to + refer to the same encoding). + + WTF-8b is an extension of WTF-8, which is an extension of UTF-8. Any sequence + of Unicode scalar values is represented by the same bytes in UTF-8, WTF-8 and + WTF-8b, therefore any well-formed UTF-8 string is interpreted as the same + sequence of Unicode scalar values (roughly, code points) in WTF-8b. 
+ + Like WTF-8, WTF-8b is able to encode UTF-16 errors (lone surrogates) using + the "generalized UTF-8" representation of code points between U+D800 and + U+DFFF. These errors occur in JSON terms such as: + "_\uD8AB_\uDBCD_" + + Unlike WTF-8, WTF-8b is also able to encode UTF-8 errors (bytes 0x80 to 0xFF + that are not part of a valid UTF-8 sequence) using the first 128 "overlong" + codings (unused 2-byte representations of U+00 to U+7F). These errors can + occur in any byte stream that is interpreted as UTF-8, for example: + "\xED\xA2\xAB" + The above example is in fact the WTF-8b (and WTF-8) encoding for the lone + UTF-16 surrogate "\uD8AB", which demonstrates the need for a distinct + encoding of UTF-8 errors. If a distinction were not made, then "\xED\xA2\xAB" + and "\uD8AB" would be interpreted as the same string, so at least one of the + forms would not be preserved when printed as JSON output. + + It should also be noted that the process of converting from invalid UTF-8 to + WTF-8b is not (and can not be) idempotent, since the "generalized UTF-8" + representation of UTF-16 surrogates is intentionally not able to be + generated from invalid UTF-8, only through some other means (usually "\uXXXX" + notation). + + Each UTF-16 error is encoded as 3 WTF-8b (or WTF-8) bytes. + Each UTF-8 error is encoded as 2 WTF-8b bytes. + + When iterating over code points using `JVP_UTF8_ERRORS_UTF16`, encoded UTF-16 + errors are emitted in the form of code points in the range U+D800 to U+DFFF. + These code points can be reencoded as usual using `jvp_utf8_encode`. + + When iterating over code points using `JVP_UTF8_ERRORS_UTF8`, encoded UTF-8 + errors are emitted in the form of code points in the negative range -0x80 to + -0xFF. These negative code points can be negated to determine the original + error bytes. These code points can be reencoded as usual using + `jvp_utf8_encode`. 
+*/ + +const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint_ret) { assert(in <= end); if (in == end) { return 0; @@ -40,9 +146,11 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { length = 1; } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); length = 1; } else if (in + length > end) { /* String ends before UTF8 sequence ends */ + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); length = end - in; } else { codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; @@ -50,6 +158,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { unsigned ch = (unsigned char)in[i]; if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){ /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */ + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: wrong bytes"); codepoint = -1; length = i; break; @@ -58,17 +167,29 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { } if (codepoint < utf8_first_codepoint[length]) { /* Overlong UTF8 sequence */ - codepoint = -1; + if ((flags & JVP_UTF8_ERRORS_UTF8) && 0x00 <= codepoint && codepoint <= 0x7F) { + /* UTF-8 error is emitted as a negative codepoint */ + codepoint = -(codepoint + 0x80); + } else { + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: overlong"); + codepoint = -1; + } } if (0xD800 <= codepoint && codepoint <= 0xDFFF) { - /* Surrogate codepoints can't be encoded in UTF8 */ - codepoint = -1; + /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ + if (!(flags & JVP_UTF8_ERRORS_UTF16)) { + /* Surrogate codepoints can't be encoded in UTF8 */ + codepoint = -1; + } } if (codepoint > 0x10FFFF) { /* 
Outside Unicode range */ + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); codepoint = -1; } } + if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) + codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER assert(length > 0); *codepoint_ret = codepoint; return in + length; @@ -76,7 +197,7 @@ const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { int jvp_utf8_is_valid(const char* in, const char* end) { int codepoint; - while ((in = jvp_utf8_next(in, end, &codepoint))) { + while ((in = jvp_utf8_wtf_next(in, end, 0, &codepoint))) { if (codepoint == -1) return 0; } return 1; @@ -91,20 +212,24 @@ int jvp_utf8_decode_length(char startchar) { } int jvp_utf8_encode_length(int codepoint) { - if (codepoint <= 0x7F) return 1; + if (codepoint >= 0 && codepoint <= 0x7F) return 1; else if (codepoint <= 0x7FF) return 2; else if (codepoint <= 0xFFFF) return 3; else return 4; } int jvp_utf8_encode(int codepoint, char* out) { - assert(codepoint >= 0 && codepoint <= 0x10FFFF); + assert((codepoint >= 0 && codepoint <= 0x10FFFF) || (codepoint >= -0xFF && codepoint <= -0x80)); char* start = out; - if (codepoint <= 0x7F) { + if (codepoint >= 0 && codepoint <= 0x7F) { *out++ = codepoint; } else if (codepoint <= 0x7FF) { - *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); - *out++ = 0x80 + ((codepoint & 0x03F)); + // encode UTF-8 errors as overlong representations of U+00 to U+7F + int cp = codepoint >= -0xFF && codepoint <= -0x80? 
+ -codepoint - 0x80 : + codepoint; + *out++ = 0xC0 + ((cp & 0x7C0) >> 6); + *out++ = 0x80 + ((cp & 0x03F)); } else if(codepoint <= 0xFFFF) { *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); diff --git a/src/jv_unicode.h b/src/jv_unicode.h index 558721a8fd..cb0a481a5e 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -1,8 +1,20 @@ #ifndef JV_UNICODE_H #define JV_UNICODE_H +enum jvp_utf8_flags { + /* Emit replacement character instead of -1 for errors */ + JVP_UTF8_REPLACE = 1, + /* Treat input as WTF-8b, emit 0xD800 to 0xDFFF to denote encoded UTF-16 errors */ + JVP_UTF8_ERRORS_UTF16 = 2, + /* Treat input as WTF-8b, emit -0x80 to -0xFF to denote encoded UTF-8 errors */ + JVP_UTF8_ERRORS_UTF8 = 4, + JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8 +}; + const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); +const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); +const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len); int jvp_utf8_is_valid(const char* in, const char* end); int jvp_utf8_decode_length(char startchar); diff --git a/src/jv_utf8_tables.h b/src/jv_utf8_tables.h index f1a4252fce..7c68749e97 100644 --- a/src/jv_utf8_tables.h +++ b/src/jv_utf8_tables.h @@ -12,7 +12,7 @@ static const unsigned char utf8_coding_length[] = 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; @@ -29,7 +29,7 @@ static const unsigned char utf8_coding_bits[] = 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, - 0x00, 0x00, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, + 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; diff --git a/src/main.c b/src/main.c index 48af5a31c1..83faeb3197 100644 --- a/src/main.c +++ b/src/main.c @@ -31,6 +31,7 @@ extern void jv_tsd_dtoa_ctx_init(); #include "jv.h" #include "jq.h" #include "jv_alloc.h" +#include "jv_unicode.h" #include "util.h" #include "src/version.h" @@ -182,8 +183,12 @@ static int process(jq_state *jq, jv value, int flags, int dumpopts, int options) if (options & ASCII_OUTPUT) { jv_dumpf(jv_copy(result), stdout, JV_PRINT_ASCII); } else { - priv_fwrite(jv_string_value(result), jv_string_length_bytes(jv_copy(result)), - stdout, dumpopts & JV_PRINT_ISATTY); + const char *start = jv_string_value(result); + const char *end = start + jv_string_length_bytes(jv_copy(result)); + const char *bytes; + uint32_t bytes_len; + while ((start = jvp_utf8_wtf_next_bytes(start, end, &bytes, &bytes_len))) + priv_fwrite(bytes, 
bytes_len, stdout, dumpopts & JV_PRINT_ISATTY); } ret = JQ_OK; jv_free(result); diff --git a/tests/jq.test b/tests/jq.test index da35e9a84c..eb8674c976 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -57,6 +57,11 @@ null "Aa\r\n\t\b\f\u03bc" "Aa\u000d\u000a\u0009\u0008\u000c\u03bc" +# Check that unpaired surrogates are preserved in output +"\u2200\ud800\u2203\udc00\u2205\udfff" +null +"∀\ud800∃\udc00∅\udfff" + "inter\("pol" + "ation")" null "interpolation" diff --git a/tests/shtest b/tests/shtest index d681ab45ad..35443cfb45 100755 --- a/tests/shtest +++ b/tests/shtest @@ -122,6 +122,15 @@ fi cmp $d/out $d/expected +clean=false +# Invalid UTF-8 bytes are preserved when encoding/decoding JSON +dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null +$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json +$VALGRIND $Q $JQ -j . $d/out.json >$d/out +cmp $d/out $d/rand +clean=true + + ## Test --exit-status data='{"i": 1}\n{"i": 2}\n{"i": 3}\n' printf "$data" | $JQ --exit-status 'select(.i==1)' > /dev/null 2>&1 From 79f0479e3164884e0e6dd6a1fd9893f971c1d062 Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Sun, 16 May 2021 09:18:51 +0000 Subject: [PATCH 3/8] Update `@base64`, `utf8bytelength` and `fromjson` to handle binary strings --- docs/content/manual/manual.yml | 1 - jq.1.prebuilt | 2 +- src/builtin.c | 49 ++++++++++++++++------ src/jv.h | 1 + src/jv_parse.c | 76 +++++++++++++++++++++++++--------- tests/base64.test | 10 +++++ tests/shtest | 19 ++++++--- 7 files changed, 118 insertions(+), 40 deletions(-) diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index ff68482a3d..c44ad3818a 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -2040,7 +2040,6 @@ sections: * `@base64d`: The inverse of `@base64`, input is decoded as specified by RFC 4648. - Note\: If the decoded string is not UTF-8, the results are undefined. This syntax can be combined with string interpolation in a useful way. 
You can follow a `@foo` token with a string diff --git a/jq.1.prebuilt b/jq.1.prebuilt index 80933f748e..490791b831 100644 --- a/jq.1.prebuilt +++ b/jq.1.prebuilt @@ -2226,7 +2226,7 @@ The input is converted to base64 as specified by RFC 4648\. \fB@base64d\fR: . .IP -The inverse of \fB@base64\fR, input is decoded as specified by RFC 4648\. Note\e: If the decoded string is not UTF\-8, the results are undefined\. +The inverse of \fB@base64\fR, input is decoded as specified by RFC 4648\. . .P This syntax can be combined with string interpolation in a useful way\. You can follow a \fB@foo\fR token with a string literal\. The contents of the string literal will \fInot\fR be escaped\. However, all interpolations made inside that string literal will be escaped\. For instance, diff --git a/src/builtin.c b/src/builtin.c index b38d4c2f4f..ea419db1e7 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -470,7 +470,7 @@ static jv f_dump(jq_state *jq, jv input) { static jv f_json_parse(jq_state *jq, jv input) { if (jv_get_kind(input) != JV_KIND_STRING) return type_error(input, "only strings can be parsed"); - jv res = jv_parse_sized(jv_string_value(input), + jv res = jv_parse_wtf_sized(jv_string_value(input), jv_string_length_bytes(jv_copy(input))); jv_free(input); return res; @@ -520,7 +520,15 @@ static jv f_tostring(jq_state *jq, jv input) { static jv f_utf8bytelength(jq_state *jq, jv input) { if (jv_get_kind(input) != JV_KIND_STRING) return type_error(input, "only strings have UTF-8 byte length"); - return jv_number(jv_string_length_bytes(input)); + const char* i = jv_string_value(input); + const char* end = i + jv_string_length_bytes(jv_copy(input)); + uint32_t len = 0; + const char *bytes; + uint32_t bytes_len; + while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len))) + len += bytes_len; + jv_free(input); + return jv_number(len); } #define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" @@ -695,21 +703,36 @@ static jv 
f_format(jq_state *jq, jv input, jv fmt) { jv_free(fmt); input = f_tostring(jq, input); jv line = jv_string(""); - const unsigned char* data = (const unsigned char*)jv_string_value(input); - int len = jv_string_length_bytes(jv_copy(input)); - for (int i=0; i<len; i+=3) { - uint32_t code = 0; - int n = len - i >= 3 ? 3 : len-i; - for (int j=0; j<3; j++) { + const char* i = jv_string_value(input); + const char* end = i + jv_string_length_bytes(jv_copy(input)); + uint32_t code = 0; + int n = 0; + const char *bytes; + uint32_t bytes_len; + while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len))) { + unsigned char *ubuf = (unsigned char *)bytes; + for (uint32_t x = 0; x < bytes_len; x++) { code <<= 8; - code |= j < n ? (unsigned)data[i+j] : 0; + code |= ubuf[x]; + if (++n == 3) { + char buf[4]; + for (int j = 0; j < 4; j++) + buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; + line = jv_string_append_buf(line, buf, sizeof(buf)); + n = 0; + code = 0; + } } + } + if (n > 0) { + assert(n < 3); + code <<= 8*(3 - n); char buf[4]; - for (int j=0; j<4; j++) { + for (int j = 0; j < 4; j++) buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f]; - } - if (n < 3) buf[3] = '='; - if (n < 2) buf[2] = '='; + buf[3] = '='; + if (n < 2) + buf[2] = '='; line = jv_string_append_buf(line, buf, sizeof(buf)); } jv_free(input); diff --git a/src/jv.h b/src/jv.h index 8a328ec91b..3f0dedbf2f 100644 --- a/src/jv.h +++ b/src/jv.h @@ -228,6 +228,7 @@ enum { jv jv_parse(const char* string); jv jv_parse_sized(const char* string, int length); +jv jv_parse_wtf_sized(const char* string, int length); typedef void (*jv_nomem_handler_f)(void *); void jv_nomem_handler(jv_nomem_handler_f, void *); diff --git a/src/jv_parse.c b/src/jv_parse.c index a7e1d2463b..1573328a99 100644 --- a/src/jv_parse.c +++ b/src/jv_parse.c @@ -885,35 +885,63 @@ jv jv_parser_next(struct jv_parser* p) { } } -jv jv_parse_sized(const char* string, int length) { +static jv jvp_parse_sized(const char* string, int length, int extended) { struct jv_parser parser;
parser_init(&parser, 0); - jv_parser_set_buf(&parser, string, length, 0); - jv value = jv_parser_next(&parser); - if (jv_is_valid(value)) { - jv next = jv_parser_next(&parser); - if (jv_is_valid(next)) { - // multiple JSON values, we only wanted one - jv_free(value); - jv_free(next); - value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); - } else if (jv_invalid_has_msg(jv_copy(next))) { - // parser error after the first JSON value - jv_free(value); - value = next; + const char *i = string; + const char *end = string + length; + jv value = jv_invalid(); + int count = 0; + while (i != NULL) { + const char *bytes; + uint32_t bytes_len; + if (extended) { + // TOOD: consider handling string values containing UTF-16 errors; this + // won't normally occur when using the output of eg, `tojson`, but could + // occur when constructing JSON manually, eg: + // > "\"\uD800\"" | fromjson + // NOTE: a simple but crude way to do this might be to replace UTF-16 + // errors in the input with \uXXXX sequences, since UTF-16 errors should + // only be allowed within string literals, where escape sequences can be + // equivalently used + i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len); } else { - // a single valid JSON value - jv_free(next); + bytes = string; + bytes_len = length; + i = NULL; } - } else if (jv_invalid_has_msg(jv_copy(value))) { - // parse error, we'll return it - } else { + jv_parser_set_buf(&parser, bytes, bytes_len, i != NULL); + for (;;) { + jv next = jv_parser_next(&parser); + if (!jv_is_valid(next)) { + if (jv_invalid_has_msg(jv_copy(next))) { + // parse error, we'll return it + count++; + jv_free(value); + value = next; + i = NULL; + } + break; + } + jv_free(value); + if (count++ == 0) { + // a single valid JSON value + value = next; + } else { + // multiple JSON values, we only wanted one + jv_free(next); + value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values")); + i = NULL; + break; + } + } + } + if (count == 0) { // no 
value at all jv_free(value); value = jv_invalid_with_msg(jv_string("Expected JSON value")); } parser_free(&parser); - if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) { jv msg = jv_invalid_get_msg(value); value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')", @@ -924,6 +952,14 @@ jv jv_parse_sized(const char* string, int length) { return value; } +jv jv_parse_sized(const char* string, int length) { + return jvp_parse_sized(string, length, 0); +} + +jv jv_parse_wtf_sized(const char* string, int length) { + return jvp_parse_sized(string, length, 1); +} + jv jv_parse(const char* string) { return jv_parse_sized(string, strlen(string)); } diff --git a/tests/base64.test b/tests/base64.test index 0f82b0b71d..6507bb83b7 100644 --- a/tests/base64.test +++ b/tests/base64.test @@ -33,3 +33,13 @@ . | try @base64d catch . "QUJDa" "string (\"QUJDa\") trailing base64 byte found" + +# random binary data +(. | @base64d | @base64) == . +"zns0Su1i4JjDfGiR95WOcU8iiPMOrfJTUBm9P1ot2qIMiyk04b0WSIFNTMD7w9ziMV8nSbwpPqNl3JKF1eWZrRRg24rbvh66O1e7Z1xIGPNqTqm+jdzRCkWSryR+67wXRVgD6Q==" +true + +# replace lone surrogates +@base64 +"foo\udca9\ud83dbar" +"Zm9v77+977+9YmFy" diff --git a/tests/shtest b/tests/shtest index 35443cfb45..36b7205bf2 100755 --- a/tests/shtest +++ b/tests/shtest @@ -123,11 +123,20 @@ cmp $d/out $d/expected clean=false -# Invalid UTF-8 bytes are preserved when encoding/decoding JSON -dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null -$VALGRIND $Q $JQ -sR . $d/rand >$d/out.json -$VALGRIND $Q $JQ -j . $d/out.json >$d/out -cmp $d/out $d/rand +# Invalid UTF-8 bytes are preserved when encoding/decoding JSON and base64 and concatenating binary strings +if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then + $VALGRIND $Q $JQ -sR . $d/rand >$d/out.json + $VALGRIND $Q $JQ -j . 
$d/out.json >$d/out + cmp $d/out $d/rand + $VALGRIND $Q $JQ -jR fromjson $d/out.json >$d/out + cmp $d/out $d/rand + $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out + cmp $d/out $d/rand + base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out + cmp $d/out $d/rand + $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out + cmp $d/out $d/rand +fi clean=true From 7fde46e5440e7f1b83a3bc2e93f6409d806f33a9 Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Tue, 25 May 2021 22:59:59 +1200 Subject: [PATCH 4/8] Correct UTF-8 and UTF-16 errors during concatenation UTF-8 errors and UTF-16 errors that were previously encoded into the ends of strings will now potentially be used to form correct code points. This is mostly a matter of making string equality behave as expected, since without this normalisation, it is possible to produce `jv` strings that are converted to UTF-8 or UTF-16 the same way but are not equal due to well-formed code units that may or may not be encoded as errors. 
--- src/jv.c | 13 ++- src/jv_unicode.c | 248 ++++++++++++++++++++++++++++++++++++++--------- src/jv_unicode.h | 3 + tests/jq.test | 15 +++ 4 files changed, 230 insertions(+), 49 deletions(-) diff --git a/src/jv.c b/src/jv.c index 92d8336faf..d44f04b5b7 100644 --- a/src/jv.c +++ b/src/jv.c @@ -1155,20 +1155,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) { jvp_string* s = jvp_string_ptr(string); uint32_t currlen = jvp_string_length(s); + char join_buf[4]; + int join_len = jvp_utf8_wtf_join(s->data, &currlen, &data, &len, join_buf); + if (jvp_refcnt_unshared(string.u.ptr) && - jvp_string_remaining_space(s) >= len) { + jvp_string_remaining_space(s) >= join_len + len) { // the next string fits at the end of a + memcpy(s->data + currlen, join_buf, join_len); + currlen += join_len; memcpy(s->data + currlen, data, len); s->data[currlen + len] = 0; s->length_hashed = (currlen + len) << 1; return string; } else { // allocate a bigger buffer and copy - uint32_t allocsz = (currlen + len) * 2; + uint32_t allocsz = (currlen + join_len + len) * 2; if (allocsz < 32) allocsz = 32; jvp_string* news = jvp_string_alloc(allocsz); - news->length_hashed = (currlen + len) << 1; + news->length_hashed = (currlen + join_len + len) << 1; memcpy(news->data, s->data, currlen); + memcpy(news->data + currlen, join_buf, join_len); + currlen += join_len; memcpy(news->data + currlen, data, len); news->data[currlen + len] = 0; jvp_string_free(string); diff --git a/src/jv_unicode.c b/src/jv_unicode.c index cbd812b454..a8858a2cf4 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -1,8 +1,72 @@ #include +#include #include #include "jv_unicode.h" #include "jv_utf8_tables.h" +// length of encoding of erroneous UTF-8 byte +#define UTF8_ERR_LEN 2 +// length of encoding of erroneous UTF-16 surrogate +#define UTF16_ERR_LEN 3 + +#define U32(a, b, c, d) ( \ + (uint32_t) (a) << 0 | \ + (uint32_t) (b) << 8 | \ + (uint32_t) (c) << 16 | \ + (uint32_t) (d) << 24 \ +) + +#define 
BYTE(u32, n) ((uint32_t) (((u32) >> (n)*8) & 0xFF)) + +#define B0 0x00 // 00000000 +#define B1 0x80 // 10000000 +#define B2 0xC0 // 11000000 +#define B3 0xE0 // 11100000 +#define B4 0xF0 // 11110000 +#define B5 0xF8 // 11111000 + +// NOTE: these flags are likely to be optimised out as `decode` gets inlined +enum decode_flags { + DECODE_1 = 1, + DECODE_2 = 2, + DECODE_3 = 8, + DECODE_4 = 16 +}; + +// decode up to 4 bytes of "generalised UTF-8"; no checking for overlong +// codings or out-of-range code points, works by testing all fixed bits in each +// of the 4 coding patterns, then shifting the value bits according to the +// pattern +static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) { + if((flags & DECODE_1) && (data & U32(B1, B0, B0, B0)) == 0){ + *codepoint_ret = BYTE(data, 0); + return 1; + } + if((flags & DECODE_2) && (data & U32(B3, B2, B0, B0)) == U32(B2, B1, B0, B0)){ + *codepoint_ret = + (BYTE(data, 0) & ~B3) << 6 | + (BYTE(data, 1) & ~B2) << 0; + return 2; + } + if((flags & DECODE_3) && (data & U32(B4, B2, B2, B0)) == U32(B3, B1, B1, B0)){ + *codepoint_ret = + (BYTE(data, 0) & ~B4) << 12 | + (BYTE(data, 1) & ~B2) << 6 | + (BYTE(data, 2) & ~B2) << 0; + return 3; + } + if((flags & DECODE_4) && (data & U32(B5, B2, B2, B2)) == U32(B4, B1, B1, B1)){ + *codepoint_ret = + (BYTE(data, 0) & ~B5) << 18 | + (BYTE(data, 1) & ~B2) << 12 | + (BYTE(data, 2) & ~B2) << 6 | + (BYTE(data, 3) & ~B2) << 0; + return 4; + } + *codepoint_ret = -1; + return 1; +} + // jvp_utf8_backtrack returns the beginning of the last codepoint in the // string, assuming that start is the last byte in the string. 
// If the last codepoint is incomplete, returns the number of missing bytes via @@ -137,56 +201,42 @@ const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_fla if (in == end) { return 0; } - int codepoint = -1; - unsigned char first = (unsigned char)in[0]; - int length = utf8_coding_length[first]; - if ((first & 0x80) == 0) { + uint32_t data = in[0] & 0xFF; + if ((data & B1) == 0) { /* Fast-path for ASCII */ - codepoint = first; - length = 1; - } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { - /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ - if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: bad single byte"); - length = 1; - } else if (in + length > end) { - /* String ends before UTF8 sequence ends */ - if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: underrun"); - length = end - in; - } else { - codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; - for (int i=1; i 0x10FFFF) { - /* Outside Unicode range */ - if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); + } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) { + /* Surrogate codepoints are allowed in WTF-8/WTF-8b */ + if (!(flags & JVP_UTF8_ERRORS_UTF16)) { + /* Surrogate codepoints can't be encoded in UTF8 */ codepoint = -1; } + } else if (codepoint > 0x10FFFF) { + /* Outside Unicode range */ + if (flags & JVP_UTF8_ERRORS_ALL) assert(0 && "Invalid WTF-8b sequence: out of range"); + codepoint = -1; } if (codepoint == -1 && (flags & JVP_UTF8_REPLACE)) codepoint = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER @@ -195,6 +245,112 @@ const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_fla return in + length; } +// assumes two bytes are readable from `in` +static int decode_utf8_error(const char* in) { + uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, 0, 0); + int codepoint; + if (decode(DECODE_2, data, &codepoint) == UTF8_ERR_LEN && 
codepoint < 0x80) + return codepoint + 0x80; + return -1; +} + +// assumes three bytes are readable from `in` +static int decode_utf16_error(const char* in) { + uint32_t data = U32(in[0] & 0xFF, in[1] & 0xFF, in[2] & 0xFF, 0); + int codepoint; + if (decode(DECODE_3, data, &codepoint) == UTF16_ERR_LEN && codepoint >= 0xD800 && codepoint < 0xDFFF) + return codepoint; + return -1; +} + +// jvp_utf8_wtf_join attempts to turn errors at the end of `a` and the +// beginning of `b` into a valid code point. if a correction is possible, +// `*alen_io`, `*bstart_io` and `*blen_io` are updated to exclude the existing +// errors, and the UTF-8 encoding of the code point to insert is stored in +// `out`. the number of bytes that should be inserted from `out` into the +// middle of the strings is returned (up to 4). this will be 0 if there are no +// bytes to insert. +int jvp_utf8_wtf_join(const char* astart, uint32_t* alen_io, const char** bstart_io, uint32_t* blen_io, char* out) { + const char* aend = astart + *alen_io; + const char* bstart = *bstart_io; + const char* bend = bstart + *blen_io; + int bcp; + bstart = jvp_utf8_wtf_next(bstart, bend, JVP_UTF8_ERRORS_ALL, &bcp); + if (!bstart) { + // end of string + return 0; + } + if (bcp >= 0xDC00 && bcp <= 0xDFFF) { + // UTF-16 tail surrogate, look for lead surrogate at the end of `a` + assert(bstart == *bstart_io + UTF16_ERR_LEN); + if (aend - astart < UTF16_ERR_LEN) + return 0; + int acp = decode_utf16_error(aend - UTF16_ERR_LEN); + if (acp >= 0xD800 && acp <= 0xDBFF) { + // UTF-16 lead surrogate, decode matching UTF-16 pair + *alen_io -= UTF16_ERR_LEN; + *blen_io -= UTF16_ERR_LEN; + *bstart_io += UTF16_ERR_LEN; + int codepoint = 0x10000 + (((acp - 0xD800) << 10) | (bcp - 0xDC00)); + return jvp_utf8_encode(codepoint, out); + } + return 0; + } + if (bcp >= -0xFF && bcp <= -0x80) { + // UTF-8 error, if it's a continuation byte, search backwards in `a` for the leading byte + bcp = -bcp; + assert(bstart == *bstart_io + 
UTF8_ERR_LEN); + if (utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) + return 0; + // if there's a correctable error, we will consume up to 4 encoded error bytes total, with up to 3 bytes from each of `a` and `b` + unsigned char buf[6]; + unsigned char* bufstart = buf + 3; + unsigned char* bufend = bufstart; + *bufend++ = bcp; + int length; + // search backwards in `a` for a leading byte + for (;;) { + if (aend - astart < UTF8_ERR_LEN) + return 0; // `a` is too short + int acp = decode_utf8_error(aend - UTF8_ERR_LEN); + if (acp == -1) + return 0; // not a UTF-8 error + aend -= UTF8_ERR_LEN; + length = utf8_coding_length[acp]; + if (length == 0) + return 0; // not a possible UTF-8 byte + *--bufstart = acp; + if (length != UTF8_CONTINUATION_BYTE) + break; // found leading byte + if (bufstart == buf) + return 0; // too many continuation bytes + } + if (bufend - bufstart > length) + return 0; // too many continuation bytes + // search forwards in `b` for any more needed continuation bytes + while (bufend - bufstart < length) { + if (bend - bstart < UTF8_ERR_LEN) + return 0; // `b` is too short + bcp = decode_utf8_error(bstart); + if (bcp == -1 || utf8_coding_length[bcp] != UTF8_CONTINUATION_BYTE) + return 0; // not a UTF-8 error, didn't find enough continuation bytes + bstart += UTF8_ERR_LEN; + *bufend++ = bcp; + } + int codepoint; + // check that the bytes are strict UTF-8 + jvp_utf8_wtf_next((char*)bufstart, (char*)bufend, 0, &codepoint); + if (codepoint != -1) { + memcpy(out, bufstart, 4); + *alen_io = aend - astart; + *blen_io = bend - bstart; + *bstart_io = bstart; + return bufend - bufstart; + } + } + return 0; +} + int jvp_utf8_is_valid(const char* in, const char* end) { int codepoint; while ((in = jvp_utf8_wtf_next(in, end, 0, &codepoint))) { diff --git a/src/jv_unicode.h b/src/jv_unicode.h index cb0a481a5e..33005e6f68 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -1,6 +1,8 @@ #ifndef JV_UNICODE_H #define JV_UNICODE_H +#include + enum 
jvp_utf8_flags { /* Emit replacement character instead of -1 for errors */ JVP_UTF8_REPLACE = 1, @@ -15,6 +17,7 @@ const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_ const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len); +int jvp_utf8_wtf_join(const char* astart, uint32_t* alen, const char** bstart, uint32_t* blen, char* out); int jvp_utf8_is_valid(const char* in, const char* end); int jvp_utf8_decode_length(char startchar); diff --git a/tests/jq.test b/tests/jq.test index eb8674c976..068bf31494 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -62,6 +62,11 @@ null null "∀\ud800∃\udc00∅\udfff" +# Check that unpaired surrogates are paired when concatenated +add +["\ud83d","\ude43","\ud83e","\udd11","\ud83e","\udd17","\ud83e","\udd14","\ud83e","\udd10","\ud83d","\ude44","\ud83e","\udd12","\ud83e","\udd15","\ud83e","\udd13","\ud83e","\udd16","\ud83e","\udd18","\ud83c","\udffb","\ud83c","\udffc"] +"🙃🤑🤗🤔🤐🙄🤒🤕🤓🤖🤘🏻🏼" + "inter\("pol" + "ation")" null "interpolation" @@ -87,6 +92,16 @@ null "Zm/Ds2Jhcgo=" "foóbar\n" +# test correction of UTF-8 errors when concatenating as binary data (input is a random sequence of code points) +. as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. + 1)*4)] | @base64d) | add | . == $text +"򍨼衍򙮬񪜁򻴠󖂡󔁰񗏷󛊭񢠃򍧝𭌞󹰞󙴋𿋓󧜹򳔎񦰓򅆹򽐟󂑛򶃯㾱ꕽ񂊛򉙲򅤎􃖣󻣸󁸦򴏜򽃿􄑏󠦱񄛲񄕵񡿚򮩒񡏂򨆯򶚒󎮆󉨗򡮟򆿴񬏪򻀅㫑񉒗󴍶󬪸񝶑񂾑򇔣򉩉􂞇𲡀𨫆򤵇𲺝\u001c񖂟񳐉󲔹𳨬􀮔𸒙񜶻㊬񓐊񽒬󑀧󗧚󞌶󦥥𗌽𘀍󴼹􌇺򫗛񂷶󏷕񜁍񥬟󼁁󓺉𗟒򷝊𩕃񞝏񧄀󁲩򐀄򳂸񲊷򃀋񃫫𝷏򏖝򷂍󢭣􋛨𞪒򁁅勸󯩥󵪭񚮚򻡍騎񾊯򪓚񗡈񎕫򡯬񋫠ᕴ𞨹󾄇񩠶𙯾񢥱𚯴񬥷󢶖񾹌񡈟򧓑񒾘𚸯񳗺񭟡𫸬񷤖񷆐𖋌񦰃椀𫎾󗚋𿋆󈝰񺥲򝕊𵯮򙧚󬱃󍗞󱆃󂟙󟆺񻢬󸮤󗗉񉛮𺵡𰣒􁋙񻍛􇡘ᮍ񕥸񨵂盕嗪𻸮򶆍򊈤񽓎󙴐𗬜󾱒󷹰􇡈񨦎􏥩񴲡𨑮򱏝𭢊󕁶򣙥󶡮󮰌󿙾氕񼻘􆔪񢕀񊿃󮨝񑛖󣴊󎎏򳞓㊁󒭀󇜳𯄌𻙩" +true + +# test preservation of binary data when concatenating (input is a random sequence of UTF-16 surrogates encoded in WTF-8, should be treated as regular UTF-8 errors) +@base64d | . as $text | @base64 | . as $b64 | [range(0, 300)] | map($b64[(.*4):((. 
+ 1)*4)] | @base64d) | add | . == $text +"7bKv7aiz7auX7aG37aO77aOe7auy7bmm7bqk7aG87bSH7a6m7bmc7bum7bqj7au+7bqf7aap7buC7byq7aS37aCp7aSl7a+a7bur7aGV7bGl7b6M7biB7aOe7ayR7amW7aOX7b637a+P7bu+7ayP7bOw7ba/7ayp7b6G7aqd7bG37bK57b6O7bq27a+u7a2N7ayu7bKK" +true + @uri "\u03bc" "%CE%BC" From 2e1b5d22473addd975e0ab5cde169a997436b212 Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Sat, 22 Jul 2023 09:55:08 +1200 Subject: [PATCH 5/8] Update `@uri` to handle binary strings --- src/builtin.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/builtin.c b/src/builtin.c index ea419db1e7..bbea10212a 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -657,16 +657,19 @@ static jv f_format(jq_state *jq, jv input, jv fmt) { while (*p) unreserved[(int)*p++] = 1; jv line = jv_string(""); - const char* s = jv_string_value(input); - for (int i=0; i Date: Sat, 22 Jul 2023 16:46:52 +1200 Subject: [PATCH 6/8] Preserve UTF-8 and UTF-16 errors in `explode` Errors are emitted as negative code points instead of being transformed into replacement characters. `implode` is also updated accordingly so the original string can be reconstructed without data loss. --- docs/content/manual/manual.yml | 3 ++- jq.1.prebuilt | 2 +- src/jv.c | 15 +++++++++++++-- tests/jq.test | 4 ++-- tests/shtest | 2 ++ 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index c44ad3818a..4d1050b183 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -1699,7 +1699,8 @@ sections: body: | Converts an input string into an array of the string's - codepoint numbers. + codepoint numbers. Ill-formed Unicode is represented using + negative numbers. examples: - program: 'explode' diff --git a/jq.1.prebuilt b/jq.1.prebuilt index 490791b831..1f3f49d795 100644 --- a/jq.1.prebuilt +++ b/jq.1.prebuilt @@ -1849,7 +1849,7 @@ jq \'[\.[]|rtrimstr("foo")]\' .IP "" 0 . 
.SS "explode" -Converts an input string into an array of the string\'s codepoint numbers\. +Converts an input string into an array of the string\'s codepoint numbers\. Ill\-formed Unicode is represented using negative numbers\. . .IP "" 4 . diff --git a/src/jv.c b/src/jv.c index d44f04b5b7..f81c29fc4f 100644 --- a/src/jv.c +++ b/src/jv.c @@ -1354,8 +1354,13 @@ jv jv_string_explode(jv j) { const char* end = i + len; jv a = jv_array_sized(len); int c; - while ((i = jvp_utf8_next(i, end, &c))) + while ((i = jvp_utf8_wtf_next(i, end, JVP_UTF8_ERRORS_ALL, &c))) { + // UTF-16 errors are emitted as negative integers to clearly distinguish them from valid Unicode text + // UTF-8 errors are already negated when using `JVP_UTF8_ERRORS_ALL` + if (c >= 0xD800 && c <= 0xDFFF) + c = -c; a = jv_array_append(a, jv_number(c)); + } jv_free(j); return a; } @@ -1373,7 +1378,13 @@ jv jv_string_implode(jv j) { assert(JVP_HAS_KIND(n, JV_KIND_NUMBER)); int nv = jv_number_value(n); jv_free(n); - if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF) + // UTF-16 errors are represented as negative integers to clearly distinguish them from valid Unicode text + if (nv >= -0xDFFF && nv <= -0xD800) { + // convert negative UTF-16 errors into positive errors as expected by `jv_string_append_codepoint` + nv = -nv; + } else if (nv >= -0xFF && nv <= -0x80) { + // negative UTF-8 errors are already in the representation expected by `jv_string_append_codepoint` + } else if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF) nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER s = jv_string_append_codepoint(s, nv); } diff --git a/tests/jq.test b/tests/jq.test index 068bf31494..1debbe4a71 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -58,9 +58,9 @@ null "Aa\u000d\u000a\u0009\u0008\u000c\u03bc" # Check that unpaired surrogates are preserved in output -"\u2200\ud800\u2203\udc00\u2205\udfff" +"\u2200\ud800\u2203\udc00\u2205\udfff" | "\(.)\(explode | implode)" null -"∀\ud800∃\udc00∅\udfff" 
+"∀\ud800∃\udc00∅\udfff∀\ud800∃\udc00∅\udfff" # Check that unpaired surrogates are paired when concatenated add diff --git a/tests/shtest b/tests/shtest index 36b7205bf2..cf2bc6dd6b 100755 --- a/tests/shtest +++ b/tests/shtest @@ -132,6 +132,8 @@ if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then cmp $d/out $d/rand $VALGRIND $Q $JQ -j '@base64 | @base64d' $d/out.json >$d/out cmp $d/out $d/rand + $VALGRIND $Q $JQ -j 'explode | implode' $d/out.json >$d/out + cmp $d/out $d/rand base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out cmp $d/out $d/rand $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out From 5c2fe323ffe1a47f6ff03ccac29fb6379bf6d93d Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Sun, 23 Jul 2023 01:15:00 +1200 Subject: [PATCH 7/8] Remove UTF-8 backtracking workaround This is no longer needed as strings are capable of storing partial UTF-8 sequences. --- src/jv_file.c | 12 ++---------- src/jv_unicode.c | 23 ----------------------- src/jv_unicode.h | 1 - 3 files changed, 2 insertions(+), 34 deletions(-) diff --git a/src/jv_file.c b/src/jv_file.c index b10bcc0b5c..a4514e220b 100644 --- a/src/jv_file.c +++ b/src/jv_file.c @@ -39,21 +39,13 @@ jv jv_load_file(const char* filename, int raw) { parser = jv_parser_new(0); } - // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read - // buffer, we need to be able to read the remainder of a sequence and add that - // before appending. 
- const int max_utf8_len = 4; - char buf[4096+max_utf8_len]; + char buf[4096]; while (!feof(file) && !ferror(file)) { - size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file); + size_t n = fread(buf, 1, sizeof(buf), file); int len = 0; if (n == 0) continue; - if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0 && - !feof(file) && !ferror(file)) { - n += fread(buf+n, 1, len, file); - } if (raw) { data = jv_string_append_buf(data, buf, n); diff --git a/src/jv_unicode.c b/src/jv_unicode.c index a8858a2cf4..cb03cf833e 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -67,29 +67,6 @@ static int decode(enum decode_flags flags, uint32_t data, int* codepoint_ret) { return 1; } -// jvp_utf8_backtrack returns the beginning of the last codepoint in the -// string, assuming that start is the last byte in the string. -// If the last codepoint is incomplete, returns the number of missing bytes via -// *missing_bytes. If there are no leading bytes or an invalid byte is -// encountered, NULL is returned and *missing_bytes is not altered. 
-const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) { - assert(min <= start); - if (min == start) { - return min; - } - int length = 0; - int seen = 1; - while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) { - start--; - seen++; - } - if (length == 0 || length == UTF8_CONTINUATION_BYTE || length - seen < 0) { - return NULL; - } - if (missing_bytes) *missing_bytes = length - seen; - return start; -} - const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { return jvp_utf8_wtf_next(in, end, JVP_UTF8_REPLACE, codepoint_ret); } diff --git a/src/jv_unicode.h b/src/jv_unicode.h index 33005e6f68..4c287cc125 100644 --- a/src/jv_unicode.h +++ b/src/jv_unicode.h @@ -13,7 +13,6 @@ enum jvp_utf8_flags { JVP_UTF8_ERRORS_ALL = JVP_UTF8_ERRORS_UTF16 | JVP_UTF8_ERRORS_UTF8 }; -const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes); const char* jvp_utf8_wtf_next(const char* in, const char* end, enum jvp_utf8_flags flags, int* codepoint); const char* jvp_utf8_next(const char* in, const char* end, int* codepoint); const char* jvp_utf8_wtf_next_bytes(const char* in, const char* end, const char** bytes_out, uint32_t* bytes_len); From 911d01aaa5bd33137fadf028b9c3b4f86171b542 Mon Sep 17 00:00:00 2001 From: Max Zerzouri Date: Sun, 23 Jul 2023 02:06:11 +1200 Subject: [PATCH 8/8] tests/shtest: fix use of base64 command for macOS compatibility --- tests/shtest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/shtest b/tests/shtest index cf2bc6dd6b..23b9837347 100755 --- a/tests/shtest +++ b/tests/shtest @@ -134,7 +134,7 @@ if dd if=/dev/urandom bs=1024 count=1024 >$d/rand 2>/dev/null; then cmp $d/out $d/rand $VALGRIND $Q $JQ -j 'explode | implode' $d/out.json >$d/out cmp $d/out $d/rand - base64 $d/rand | $VALGRIND $Q $JQ -R '@base64d' | $VALGRIND $Q $JQ -sj 'add' >$d/out + base64 <$d/rand | $VALGRIND $Q $JQ -R '@base64d' | 
$VALGRIND $Q $JQ -sj 'add' >$d/out cmp $d/out $d/rand $VALGRIND $Q $JQ -nj '$a' --rawfile a $d/rand >$d/out cmp $d/out $d/rand