Skip to content

Commit

Permalink
Fix the UTF-8 decoder's handling of broken UTF-8 sequences.
Browse files: browse the repository at this point in the history.
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
  • Loading branch information
phprus committed Aug 18, 2022
1 parent fbb568b commit d1cbd5e
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
5 changes: 4 additions & 1 deletion include/fmt/format.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
*/
FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
-> const char* {
constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
constexpr const int shiftc[] = {0, 18, 12, 6, 0};
Expand All @@ -628,6 +629,8 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
*e |= uchar(s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
*e |= ((uchar(s[0]) & prefix_masks[len]) !=
uchar(prefix_masks[len] << 1)); // first byte correct?

return next;
}
Expand All @@ -643,7 +646,7 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
auto error = 0;
auto end = utf8_decode(buf_ptr, &cp, &error);
bool result = f(error ? invalid_code_point : cp,
string_view(ptr, to_unsigned(end - buf_ptr)));
string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr)));
return result ? end : nullptr;
};
auto p = s.data();
Expand Down
9 changes: 9 additions & 0 deletions test/ranges-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,14 @@ TEST(ranges_test, format_map) {
auto m = std::map<std::string, int>{{"one", 1}, {"two", 2}};
EXPECT_EQ(fmt::format("{}", m), "{\"one\": 1, \"two\": 2}");
EXPECT_EQ(fmt::format("{:n}", m), "\"one\": 1, \"two\": 2");

if (fmt::detail::is_utf8()) {
// Broken UTF-8: a multi-byte lead byte (0xf0 / 0xe1) followed by '(' (0x28)
// instead of a continuation byte.
auto m1 = std::map<std::string, std::string>{{"broken-utf8-1", "\xf0\x28"},
{"broken-utf8-2", "\xe1\x28"}};
EXPECT_EQ(fmt::format("{}", m1),
"{\"broken-utf8-1\": \"\\xf0(\", \"broken-utf8-2\": \"\\xe1(\"}");
}
}

TEST(ranges_test, format_set) {
Expand Down Expand Up @@ -380,6 +388,7 @@ TEST(ranges_test, escape_string) {
EXPECT_EQ(fmt::format("{}", vec{"\xcd\xb8"}), "[\"\\u0378\"]");
// Unassigned Unicode code points.
EXPECT_EQ(fmt::format("{}", vec{"\xf0\xaa\x9b\x9e"}), "[\"\\U0002a6de\"]");
// Broken UTF-8: the final byte (0xc0) is not a valid continuation byte.
EXPECT_EQ(fmt::format("{}", vec{"\xf4\x8f\xbf\xc0"}),
"[\"\\xf4\\x8f\\xbf\\xc0\"]");
EXPECT_EQ(fmt::format("{}", vec{"понедельник"}), "[\"понедельник\"]");
Expand Down

0 comments on commit d1cbd5e

Please sign in to comment.