Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support binary strings, preserve UTF-8 and UTF-16 errors #2314

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
4 changes: 2 additions & 2 deletions docs/content/manual/manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,8 @@ sections:
body: |

Converts an input string into an array of the string's
codepoint numbers.
codepoint numbers. Ill-formed Unicode is represented using
negative numbers.

examples:
- program: 'explode'
Expand Down Expand Up @@ -2040,7 +2041,6 @@ sections:
* `@base64d`:

The inverse of `@base64`, input is decoded as specified by RFC 4648.
Note\: If the decoded string is not UTF-8, the results are undefined.

This syntax can be combined with string interpolation in a
useful way. You can follow a `@foo` token with a string
Expand Down
4 changes: 2 additions & 2 deletions jq.1.prebuilt

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions scripts/gen_utf8_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ def print_table(type, name, t):
# Classify a single byte value `c` and return a two-element tuple.
# The comparisons below partition the range 0x00-0xFF; the second element is
# either 0 or the result of mask(n) for some bit count n.
# NOTE(review): the first element looks like the length of the UTF-8 sequence
# that the byte introduces (1-4), with 255 used for continuation bytes and 0
# for bytes that cannot start a sequence -- confirm against the code that
# consumes the generated tables.
def utf8info(c):
if c < 0x80: return 1, mask(7)
if 0x80 <= c <= 0xBF: return 255, mask(6)
if 0xC0 <= c <= 0xC1: return 0, 0
if 0xC2 <= c <= 0xDF: return 2, mask(5)
if 0xC0 <= c <= 0xDF: return 2, mask(5)
if 0xE0 <= c <= 0xEF: return 3, mask(4)
if 0xF0 <= c <= 0xF4: return 4, mask(3)
# 0xF4 is already matched by the line above, so this effectively covers 0xF5-0xFF.
if 0xF4 <= c <= 0xFF: return 0, 0
Expand Down
70 changes: 48 additions & 22 deletions src/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ static jv f_dump(jq_state *jq, jv input) {
static jv f_json_parse(jq_state *jq, jv input) {
if (jv_get_kind(input) != JV_KIND_STRING)
return type_error(input, "only strings can be parsed");
jv res = jv_parse_sized(jv_string_value(input),
jv res = jv_parse_wtf_sized(jv_string_value(input),
jv_string_length_bytes(jv_copy(input)));
jv_free(input);
return res;
Expand Down Expand Up @@ -520,7 +520,15 @@ static jv f_tostring(jq_state *jq, jv input) {
/*
 * Builtin: byte length of a string value.
 *
 * jq    - interpreter state; not referenced in this function.
 * input - the value to measure.
 *
 * Returns a jv number for string input. Any other kind is handed to
 * type_error() with the message "only strings have UTF-8 byte length".
 */
static jv f_utf8bytelength(jq_state *jq, jv input) {
if (jv_get_kind(input) != JV_KIND_STRING)
return type_error(input, "only strings have UTF-8 byte length");
return jv_number(jv_string_length_bytes(input));
const char* i = jv_string_value(input);
/* NOTE(review): jv_copy() is presumably needed because
 * jv_string_length_bytes() consumes its argument while `input` is still
 * read below and released by jv_free() -- confirm against jv.h. */
const char* end = i + jv_string_length_bytes(jv_copy(input));
uint32_t len = 0;
const char *bytes;
uint32_t bytes_len;
/* Walk the string with jvp_utf8_wtf_next_bytes() until it returns NULL,
 * summing the byte count it reports for each step. */
while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len)))
len += bytes_len;
jv_free(input);
return jv_number(len);
}

#define CHARS_ALPHANUM "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
Expand Down Expand Up @@ -649,16 +657,19 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
while (*p) unreserved[(int)*p++] = 1;

jv line = jv_string("");
const char* s = jv_string_value(input);
for (int i=0; i<jv_string_length_bytes(jv_copy(input)); i++) {
unsigned ch = (unsigned)(unsigned char)*s;
if (ch < 128 && unreserved[ch]) {
line = jv_string_append_buf(line, s, 1);
} else {
line = jv_string_concat(line, jv_string_fmt("%%%02X", ch));
const char *start = jv_string_value(input);
const char *end = start + jv_string_length_bytes(jv_copy(input));
const char *bytes;
uint32_t bytes_len;
while ((start = jvp_utf8_wtf_next_bytes(start, end, &bytes, &bytes_len)))
for (uint32_t i = 0; i < bytes_len; i++) {
unsigned ch = (unsigned)(unsigned char)bytes[i];
if (ch < 128 && unreserved[ch]) {
line = jv_string_append_buf(line, &bytes[i], 1);
} else {
line = jv_string_concat(line, jv_string_fmt("%%%02X", ch));
}
}
s++;
}
jv_free(input);
return line;
} else if (!strcmp(fmt_s, "sh")) {
Expand Down Expand Up @@ -695,21 +706,36 @@ static jv f_format(jq_state *jq, jv input, jv fmt) {
jv_free(fmt);
input = f_tostring(jq, input);
jv line = jv_string("");
const unsigned char* data = (const unsigned char*)jv_string_value(input);
int len = jv_string_length_bytes(jv_copy(input));
for (int i=0; i<len; i+=3) {
uint32_t code = 0;
int n = len - i >= 3 ? 3 : len-i;
for (int j=0; j<3; j++) {
const char* i = jv_string_value(input);
const char* end = i + jv_string_length_bytes(jv_copy(input));
uint32_t code = 0;
int n = 0;
const char *bytes;
uint32_t bytes_len;
while ((i = jvp_utf8_wtf_next_bytes(i, end, &bytes, &bytes_len))) {
unsigned char *ubuf = (unsigned char *)bytes;
for (uint32_t x = 0; x < bytes_len; x++) {
code <<= 8;
code |= j < n ? (unsigned)data[i+j] : 0;
code |= ubuf[x];
if (++n == 3) {
char buf[4];
for (int j = 0; j < 4; j++)
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
line = jv_string_append_buf(line, buf, sizeof(buf));
n = 0;
code = 0;
}
}
}
if (n > 0) {
assert(n < 3);
code <<= 8*(3 - n);
char buf[4];
for (int j=0; j<4; j++) {
for (int j = 0; j < 4; j++)
buf[j] = BASE64_ENCODE_TABLE[(code >> (18 - j*6)) & 0x3f];
}
if (n < 3) buf[3] = '=';
if (n < 2) buf[2] = '=';
buf[3] = '=';
if (n < 2)
buf[2] = '=';
line = jv_string_append_buf(line, buf, sizeof(buf));
}
jv_free(input);
Expand Down
56 changes: 39 additions & 17 deletions src/jv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1085,20 +1085,24 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
return s;
}

/* Copy a UTF8 string, replacing all badly encoded points with U+FFFD */
/* Copy a UTF8 string, using WTF-8b to replace all UTF-8 errors */
static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
const char* end = data + length;
const char* i = data;
const char* cstart;

uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD
uint32_t maxlength = length * 2 + 1; // worst case: all bad bytes, each becomes a 2-byte overlong U+XX
jvp_string* s = jvp_string_alloc(maxlength);
char* out = s->data;
int c = 0;

while ((i = jvp_utf8_next((cstart = i), end, &c))) {
while ((i = jvp_utf8_wtf_next((cstart = i), end, 0, &c))) {
if (c == -1) {
c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
int error = (unsigned char)*cstart;
assert(error >= 0x80 && error <= 0xFF);
c = -error;
/* Ensure each UTF-8 error byte is consumed separately */
i = cstart + 1;
}
out += jvp_utf8_encode(c, out);
assert(out < s->data + maxlength);
Expand All @@ -1110,8 +1114,8 @@ static jv jvp_string_copy_replace_bad(const char* data, uint32_t length) {
return r;
}

/* Assumes valid UTF8 */
static jv jvp_string_new(const char* data, uint32_t length) {
/* Assumes valid WTF-8b */
jv jv_string_wtf_sized(const char* data, int length) {
jvp_string* s = jvp_string_alloc(length);
s->length_hashed = length << 1;
if (data != NULL)
Expand Down Expand Up @@ -1151,20 +1155,27 @@ static jv jvp_string_append(jv string, const char* data, uint32_t len) {
jvp_string* s = jvp_string_ptr(string);
uint32_t currlen = jvp_string_length(s);

char join_buf[4];
int join_len = jvp_utf8_wtf_join(s->data, &currlen, &data, &len, join_buf);

if (jvp_refcnt_unshared(string.u.ptr) &&
jvp_string_remaining_space(s) >= len) {
jvp_string_remaining_space(s) >= join_len + len) {
// the data being appended fits in the spare capacity at the end of the existing string buffer
memcpy(s->data + currlen, join_buf, join_len);
currlen += join_len;
memcpy(s->data + currlen, data, len);
s->data[currlen + len] = 0;
s->length_hashed = (currlen + len) << 1;
return string;
} else {
// allocate a bigger buffer and copy
uint32_t allocsz = (currlen + len) * 2;
uint32_t allocsz = (currlen + join_len + len) * 2;
if (allocsz < 32) allocsz = 32;
jvp_string* news = jvp_string_alloc(allocsz);
news->length_hashed = (currlen + len) << 1;
news->length_hashed = (currlen + join_len + len) << 1;
memcpy(news->data, s->data, currlen);
memcpy(news->data + currlen, join_buf, join_len);
currlen += join_len;
memcpy(news->data + currlen, data, len);
news->data[currlen + len] = 0;
jvp_string_free(string);
Expand Down Expand Up @@ -1252,7 +1263,7 @@ static int jvp_string_equal(jv a, jv b) {
/*
 * Create a jv string from the `len` bytes starting at `str`.
 *
 * The byte range is first checked with jvp_utf8_is_valid(). When the check
 * passes, the bytes are handed straight to the string constructor; when it
 * fails, the string is built by jvp_string_copy_replace_bad() instead.
 */
jv jv_string_sized(const char* str, int len) {
return
jvp_utf8_is_valid(str, str+len) ?
jvp_string_new(str, len) :
jv_string_wtf_sized(str, len) :
jvp_string_copy_replace_bad(str, len);
}

Expand Down Expand Up @@ -1318,14 +1329,14 @@ jv jv_string_split(jv j, jv sep) {

if (seplen == 0) {
int c;
while ((jstr = jvp_utf8_next(jstr, jend, &c)))
while ((jstr = jvp_utf8_wtf_next(jstr, jend, JVP_UTF8_ERRORS_ALL, &c)))
a = jv_array_append(a, jv_string_append_codepoint(jv_string(""), c));
} else {
for (p = jstr; p < jend; p = s + seplen) {
s = _jq_memmem(p, jend - p, sepstr, seplen);
if (s == NULL)
s = jend;
a = jv_array_append(a, jv_string_sized(p, s - p));
a = jv_array_append(a, jv_string_wtf_sized(p, s - p));
// Add an empty string to denote that j ends on a sep
if (s + seplen == jend && seplen != 0)
a = jv_array_append(a, jv_string(""));
Expand All @@ -1343,8 +1354,13 @@ jv jv_string_explode(jv j) {
const char* end = i + len;
jv a = jv_array_sized(len);
int c;
while ((i = jvp_utf8_next(i, end, &c)))
while ((i = jvp_utf8_wtf_next(i, end, JVP_UTF8_ERRORS_ALL, &c))) {
// UTF-16 errors are emitted as negative integers to clearly distinguish them from valid Unicode text
// UTF-8 errors are already negated when using `JVP_UTF8_ERRORS_ALL`
if (c >= 0xD800 && c <= 0xDFFF)
c = -c;
a = jv_array_append(a, jv_number(c));
}
jv_free(j);
return a;
}
Expand All @@ -1362,7 +1378,13 @@ jv jv_string_implode(jv j) {
assert(JVP_HAS_KIND(n, JV_KIND_NUMBER));
int nv = jv_number_value(n);
jv_free(n);
if (nv > 0x10FFFF)
// UTF-16 errors are represented as negative integers to clearly distinguish them from valid Unicode text
if (nv >= -0xDFFF && nv <= -0xD800) {
// convert negative UTF-16 errors into positive errors as expected by `jv_string_append_codepoint`
nv = -nv;
} else if (nv >= -0xFF && nv <= -0x80) {
// negative UTF-8 errors are already in the representation expected by `jv_string_append_codepoint`
} else if (nv < 0 || (nv >= 0xD800 && nv <= 0xDFFF) || nv > 0x10FFFF)
nv = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
s = jv_string_append_codepoint(s, nv);
}
Expand Down Expand Up @@ -1397,7 +1419,7 @@ jv jv_string_slice(jv j, int start, int end) {

/* Look for byte offset corresponding to start codepoints */
for (p = s, i = 0; i < start; i++) {
p = jvp_utf8_next(p, s + len, &c);
p = jvp_utf8_wtf_next(p, s + len, JVP_UTF8_ERRORS_ALL, &c);
if (p == NULL) {
jv_free(j);
return jv_string_empty(16);
Expand All @@ -1409,7 +1431,7 @@ jv jv_string_slice(jv j, int start, int end) {
}
/* Look for byte offset corresponding to end codepoints */
for (e = p; e != NULL && i < end; i++) {
e = jvp_utf8_next(e, s + len, &c);
e = jvp_utf8_wtf_next(e, s + len, JVP_UTF8_ERRORS_ALL, &c);
if (e == NULL) {
e = s + len;
break;
Expand All @@ -1427,7 +1449,7 @@ jv jv_string_slice(jv j, int start, int end) {
* memory like a drunken navy programmer. There's probably nothing we
* can do about it.
*/
res = jv_string_sized(p, e - p);
res = jv_string_wtf_sized(p, e - p);
jv_free(j);
return res;
}
Expand Down
2 changes: 2 additions & 0 deletions src/jv.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ jv jv_array_indexes(jv, jv);

jv jv_string(const char*);
jv jv_string_sized(const char*, int);
jv jv_string_wtf_sized(const char*, int);
jv jv_string_empty(int len);
int jv_string_length_bytes(jv);
int jv_string_length_codepoints(jv);
Expand Down Expand Up @@ -227,6 +228,7 @@ enum {

jv jv_parse(const char* string);
jv jv_parse_sized(const char* string, int length);
jv jv_parse_wtf_sized(const char* string, int length);

typedef void (*jv_nomem_handler_f)(void *);
void jv_nomem_handler(jv_nomem_handler_f, void *);
Expand Down
12 changes: 2 additions & 10 deletions src/jv_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,13 @@ jv jv_load_file(const char* filename, int raw) {
parser = jv_parser_new(0);
}

// To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
// buffer, we need to be able to read the remainder of a sequence and add that
// before appending.
const int max_utf8_len = 4;
char buf[4096+max_utf8_len];
char buf[4096];
while (!feof(file) && !ferror(file)) {
size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
size_t n = fread(buf, 1, sizeof(buf), file);
int len = 0;

if (n == 0)
continue;
if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0 &&
!feof(file) && !ferror(file)) {
n += fread(buf+n, 1, len, file);
}

if (raw) {
data = jv_string_append_buf(data, buf, n);
Expand Down
Loading