forked from bitcoin/bitcoin
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This adds full UTF-8 support both on input and output. Input: read and validate full UTF-8, both Basic Multilingual Plane and extended characters. Collate surrogate pairs as specified in RFC4627. This ensures that UTF-8 strings that reach the application are always valid, and that invalid UTF-8 fails parsing. Output: Assume UTF-8 strings provided for output are valid. The escaping was broken, fix this by not encoding UTF-8 characters with \u. Writing them to the output stream as-is is the right thing to do. See https://www.ietf.org/rfc/rfc4627.txt: "JSON text SHALL be encoded in Unicode. The default encoding is UTF-8." Also add tests for the new functionality. Fixes #16.
- Loading branch information
Showing
10 changed files
with
178 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
// Copyright 2016 Wladimir J. van der Laan | ||
// Distributed under the MIT software license, see the accompanying | ||
// file COPYING or http://www.opensource.org/licenses/mit-license.php. | ||
#ifndef UNIVALUE_UTFFILTER_H | ||
#define UNIVALUE_UTFFILTER_H | ||
|
||
#include <string> | ||
|
||
/**
 * Filter that generates and validates UTF-8, as well as collates UTF-16
 * surrogate pairs as specified in RFC4627.
 *
 * Input arrives through two channels:
 *  - push_back():   raw bytes of the JSON string, which must form
 *                   well-formed UTF-8 (shortest form only, no encoded
 *                   surrogates, nothing above U+10FFFF);
 *  - push_back_u(): code units taken from "\uXXXX" escapes, where a
 *                   high/low surrogate pair is collated into one
 *                   code point.
 *
 * Accepted characters are appended, UTF-8 encoded, to the string passed to
 * the constructor. Once an error has been detected the filter stays invalid;
 * the contents of the output string are then unspecified and callers should
 * rely on finalize() only.
 */
class JSONUTF8StringFilter
{
public:
    // Bind the filter to the output string `s`; `s` must outlive the filter.
    JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), codepoint(0), state(0), min_codepoint(0), surpair(0)
    {
    }
    // Write single 8-bit char (may be part of UTF-8 sequence)
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            // A "\uD8xx" escape must be followed directly by a "\uDCxx"
            // escape; any raw byte in between leaves the pair broken.
            if (surpair)
                is_valid = false;
            if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
                str.push_back(static_cast<char>(ch));
            else if (ch < 0xc0) // Mid-sequence character, invalid in this state
                is_valid = false;
            else if (ch < 0xe0) { // Start of 2-byte sequence
                codepoint = (ch & 0x1fu) << 6;
                state = 6;
                min_codepoint = 0x80;
            } else if (ch < 0xf0) { // Start of 3-byte sequence
                codepoint = (ch & 0x0fu) << 12;
                state = 12;
                min_codepoint = 0x800;
            } else if (ch < 0xf8) { // Start of 4-byte sequence
                codepoint = (ch & 0x07u) << 18;
                state = 18;
                min_codepoint = 0x10000;
            } else // Reserved, invalid
                is_valid = false;
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            state -= 6;
            codepoint |= (ch & 0x3fu) << state;
            if (state == 0) {
                // Sequence complete: enforce the well-formedness rules of
                // RFC 3629 before emitting anything.
                if (codepoint < min_codepoint) // Overlong (non-shortest) form
                    is_valid = false;
                else if (codepoint >= 0xD800 && codepoint < 0xE000) // Encoded surrogate
                    is_valid = false;
                else if (codepoint > 0x10FFFF) // Beyond the Unicode range
                    is_valid = false;
                else
                    append_codepoint(codepoint);
            }
        }
    }
    // Write codepoint directly, possibly collating surrogate pairs
    void push_back_u(unsigned int codepoint_)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = codepoint_;
        } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. This has to
                // be an addition: the high-surrogate term can itself reach
                // bit 16 (e.g. 0xD840 -> 0x10000), which a bitwise OR with
                // 0x10000 would silently swallow.
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) + (codepoint_ - 0xDC00));
                surpair = 0;
            } else // Second half doesn't follow a first half - fail
                is_valid = false;
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else if (codepoint_ > 0x10FFFF) // Not a Unicode code point - fail
                is_valid = false;
            else
                append_codepoint(codepoint_);
        }
    }
    // Check that we're in a state where the string can be ended
    // No open sequences, no open surrogate pairs, etc
    // Returns true only if everything pushed so far was valid.
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }
private:
    std::string &str;
    bool is_valid;
    // Current UTF-8 decoding state
    unsigned int codepoint;
    int state; // Top bit to be filled in for next UTF-8 byte, or 0
    unsigned int min_codepoint; // Smallest code point allowed for the open sequence's length

    // Keep track of the following state to handle the following section of
    // RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Append `codepoint_` to the output string in shortest-form UTF-8.
    // Callers only pass scalar values up to U+10FFFF; anything larger is
    // dropped.
    void append_codepoint(unsigned int codepoint_)
    {
        if (codepoint_ <= 0x7f)
            str.push_back(static_cast<char>(codepoint_));
        else if (codepoint_ <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (codepoint_ >> 6)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (codepoint_ >> 12)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        } else if (codepoint_ <= 0x10FFFF) {
            str.push_back(static_cast<char>(0xF0 | (codepoint_ >> 18)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((codepoint_ >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (codepoint_ & 0x3F)));
        }
    }
};
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
["\ud834"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
["\udd61"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
["���"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
["�"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
["a§■𐎒𝅘𝅥𝅯"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters