Skip to content

Commit

Permalink
Improve handling of Unicode in paths
Browse files Browse the repository at this point in the history
  • Loading branch information
vitaut committed Apr 30, 2023
1 parent 5316214 commit 02cae7e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 32 deletions.
23 changes: 17 additions & 6 deletions include/fmt/std.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,23 @@ void write_escaped_path(basic_memory_buffer<Char>& quoted,
}
# ifdef _WIN32
template <>
inline void write_escaped_path<char>(basic_memory_buffer<char>& quoted,
inline void write_escaped_path<char>(memory_buffer& quoted,
const std::filesystem::path& p) {
auto s = p.u8string();
write_escaped_string<char>(
std::back_inserter(quoted),
string_view(reinterpret_cast<const char*>(s.c_str()), s.size()));
auto buf = basic_memory_buffer<wchar_t>();
write_escaped_string<wchar_t>(std::back_inserter(buf), p.native());
for (unsigned c : buf) {
// Convert UTF-16 to UTF-8.

This comment has been minimized.

Copy link
@phprus

phprus Apr 30, 2023

Contributor

Maybe reuse

fmt/include/fmt/os.h

Lines 127 to 129 in 02cae7e

// A converter from UTF-16 to UTF-8.
// It is only provided for Windows since other systems support UTF-8 natively.
class utf16_to_utf8 {

?

This comment has been minimized.

Copy link
@vitaut

vitaut May 1, 2023

Author Contributor

We could probably merge this into utf16_to_utf8 and get rid of the dreaded windows API but it's not essential.

This comment has been minimized.

Copy link
@phprus

phprus May 1, 2023

Contributor

@vitaut

fmtlib contains a full implementation of converting utf16 (with surrogate pair support) and utf32 to utf8:

fmt/include/fmt/chrono.h

Lines 361 to 413 in 02cae7e

template <typename OutputIt>
auto write_encoded_tm_str(OutputIt out, string_view in, const std::locale& loc)
-> OutputIt {
if (detail::is_utf8() && loc != get_classic_locale()) {
// char16_t and char32_t codecvts are broken in MSVC (linkage errors) and
// gcc-4.
#if FMT_MSC_VERSION != 0 || \
(defined(__GLIBCXX__) && !defined(_GLIBCXX_USE_DUAL_ABI))
// The _GLIBCXX_USE_DUAL_ABI macro is always defined in libstdc++ from gcc-5
// and newer.
using code_unit = wchar_t;
#else
using code_unit = char32_t;
#endif
using unit_t = codecvt_result<code_unit>;
unit_t unit;
write_codecvt(unit, in, loc);
// In UTF-8 is used one to four one-byte code units.
auto&& buf = basic_memory_buffer<char, unit_t::max_size * 4>();
for (code_unit* p = unit.buf; p != unit.end; ++p) {
uint32_t c = static_cast<uint32_t>(*p);
if (sizeof(code_unit) == 2 && c >= 0xd800 && c <= 0xdfff) {
// surrogate pair
++p;
if (p == unit.end || (c & 0xfc00) != 0xd800 ||
(*p & 0xfc00) != 0xdc00) {
FMT_THROW(format_error("failed to format time"));
}
c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
}
if (c < 0x80) {
buf.push_back(static_cast<char>(c));
} else if (c < 0x800) {
buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else if (c >= 0x10000 && c <= 0x10ffff) {
buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));
buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else {
FMT_THROW(format_error("failed to format time"));
}
}
return copy_str<char>(buf.data(), buf.data() + buf.size(), out);
}
return copy_str<char>(in.data(), in.data() + in.size(), out);
}

I think we can reuse it in path transform and in utf16_to_utf8 class.

This comment has been minimized.

Copy link
@phprus

phprus May 1, 2023

Contributor

I can make PR on this week.

I think implementation can be moved into the format.h header.
What do your think?

This comment has been minimized.

Copy link
@vitaut

vitaut May 1, 2023

Author Contributor

Sounds like a great idea, thanks!

if (c < 0x80) {
quoted.push_back(static_cast<unsigned char>(c));
} else if (c < 0x800) {
quoted.push_back(0b1100'0000 | ((c >> 6) & 0b01'1111));
quoted.push_back(0b1000'0000 | (c & 0b11'1111));
} else {
quoted.push_back(0b1110'0000 | ((c >> 12) & 0b01'1111));
quoted.push_back(0b1000'0000 | ((c >> 6) & 0b11'1111));
quoted.push_back(0b1000'0000 | (c & 0b11'1111));
}
}
}
# endif
template <>
Expand All @@ -86,7 +97,7 @@ struct formatter<std::filesystem::path, Char>
template <typename FormatContext>
auto format(const std::filesystem::path& p, FormatContext& ctx) const ->
typename FormatContext::iterator {
basic_memory_buffer<Char> quoted;
auto quoted = basic_memory_buffer<Char>();
detail::write_escaped_path(quoted, p);
return formatter<basic_string_view<Char>>::format(
basic_string_view<Char>(quoted.data(), quoted.size()), ctx);
Expand Down
36 changes: 10 additions & 26 deletions test/std-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,41 +17,34 @@

using testing::StartsWith;

TEST(std_test, path) {
#ifdef __cpp_lib_filesystem
TEST(std_test, path) {
EXPECT_EQ(fmt::format("{:8}", std::filesystem::path("foo")), "\"foo\" ");
EXPECT_EQ(fmt::format("{}", std::filesystem::path("foo\"bar.txt")),
"\"foo\\\"bar.txt\"");
EXPECT_EQ(fmt::format("{:?}", std::filesystem::path("foo\"bar.txt")),
"\"foo\\\"bar.txt\"");

# ifdef _WIN32
// File.txt in Russian.
const wchar_t unicode_path[] = {0x424, 0x430, 0x439, 0x43b, 0x2e,
0x74, 0x78, 0x74, 0};
const char unicode_u8path[] = {'"', char(0xd0), char(0xa4), char(0xd0),
char(0xb0), char(0xd0), char(0xb9), char(0xd0),
char(0xbb), '.', 't', 'x',
't', '"', '\0'};
EXPECT_EQ(fmt::format("{}", std::filesystem::path(unicode_path)),
unicode_u8path);
EXPECT_EQ(fmt::format("{}", std::filesystem::path(
L"\x0428\x0447\x0443\x0447\x044B\x043D\x0448"
L"\x0447\x044B\x043D\x0430")),
"\"Шчучыншчына\"");
// EXPECT_EQ(fmt::format("{}", std::filesystem::path(L"\xd800")),
// "\\x{d800}");
# endif
#endif
}

TEST(ranges_std_test, format_vector_path) {
// Test ambiguity problem described in #2954.
#ifdef __cpp_lib_filesystem
TEST(ranges_std_test, format_vector_path) {
auto p = std::filesystem::path("foo/bar.txt");
auto c = std::vector<std::string>{"abc", "def"};
EXPECT_EQ(fmt::format("path={}, range={}", p, c),
"path=\"foo/bar.txt\", range=[\"abc\", \"def\"]");
#endif
}

// Test that path is not escaped twice in the debug mode.
TEST(ranges_std_test, format_quote_path) {
// Test that path is not escaped twice in the debug mode.
#ifdef __cpp_lib_filesystem
auto vec =
std::vector<std::filesystem::path>{"path1/file1.txt", "path2/file2.txt"};
EXPECT_EQ(fmt::format("{}", vec),
Expand All @@ -61,8 +54,8 @@ TEST(ranges_std_test, format_quote_path) {
EXPECT_EQ(fmt::format("{}", o), "optional(\"path/file.txt\")");
EXPECT_EQ(fmt::format("{:?}", o), "optional(\"path/file.txt\")");
# endif
#endif
}
#endif

TEST(std_test, thread_id) {
EXPECT_FALSE(fmt::format("{}", std::this_thread::get_id()).empty());
Expand Down Expand Up @@ -215,13 +208,4 @@ TEST(std_test, exception) {
} catch (const std::system_error& ex) {
EXPECT_THAT(fmt::format("{:t}", ex), StartsWith("std::system_error: "));
}

#ifdef __cpp_lib_filesystem
try {
throw std::filesystem::filesystem_error("message", std::error_code());
} catch (const std::filesystem::filesystem_error& ex) {
EXPECT_THAT(fmt::format("{:t}", ex),
StartsWith("std::filesystem::filesystem_error: "));
}
#endif

This comment has been minimized.

Copy link
@phprus

phprus Apr 30, 2023

Contributor

This is test case for:

// std::filesystem::__cxx11::* -> std::filesystem::*

conversion.

Why did you remove it?

This comment has been minimized.

Copy link
@vitaut

vitaut May 1, 2023

Author Contributor

Ah, thanks for catching this. I completely forgot why it is here and thought it is redundant. Will put it back with a comment.

}

0 comments on commit 02cae7e

Please sign in to comment.