Skip to content

Commit

Permalink
Fix decoder on broken utf8 sequences.
Browse files Browse the repository at this point in the history
Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
  • Loading branch information
phprus committed Aug 23, 2022
1 parent fbb568b commit 1d560a3
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
5 changes: 4 additions & 1 deletion include/fmt/format.h
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,7 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
*/
FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
-> const char* {
+ constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
constexpr const int shiftc[] = {0, 18, 12, 6, 0};
Expand All @@ -628,6 +629,8 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
*e |= uchar(s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
+ *e |= ((uchar(s[0]) & prefix_masks[len]) !=
+ uchar((prefix_masks[len] << 1) & 0xFF)); // first byte correct?

return next;
}
Expand All @@ -643,7 +646,7 @@ FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
auto error = 0;
auto end = utf8_decode(buf_ptr, &cp, &error);
bool result = f(error ? invalid_code_point : cp,
- string_view(ptr, to_unsigned(end - buf_ptr)));
+ string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr)));
return result ? end : nullptr;
};
auto p = s.data();
Expand Down
7 changes: 7 additions & 0 deletions test/ranges-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,15 @@ TEST(ranges_test, escape_string) {
EXPECT_EQ(fmt::format("{}", vec{"\xcd\xb8"}), "[\"\\u0378\"]");
// Unassigned Unicode code points.
EXPECT_EQ(fmt::format("{}", vec{"\xf0\xaa\x9b\x9e"}), "[\"\\U0002a6de\"]");
// Broken utf-8.
EXPECT_EQ(fmt::format("{}", vec{"\xf4\x8f\xbf\xc0"}),
"[\"\\xf4\\x8f\\xbf\\xc0\"]");
EXPECT_EQ(fmt::format("{}", vec{"\xf0\x28"}), "[\"\\xf0(\"]");
EXPECT_EQ(fmt::format("{}", vec{"\xe1\x28"}), "[\"\\xe1(\"]");
EXPECT_EQ(fmt::format("{}", vec{std::string("\xf0\x28\0\0anything", 12)}),
"[\"\\xf0(\\x00\\x00anything\"]");

// Correct utf-8.
EXPECT_EQ(fmt::format("{}", vec{"понедельник"}), "[\"понедельник\"]");
}
}
Expand Down

0 comments on commit 1d560a3

Please sign in to comment.