fmtlib · codeinred · Mar 3, 2023 · Mar 3, 2023 · vitaut · Mar 4, 2023
diff --git a/include/fmt/core.h b/include/fmt/core.h
@@ -2204,20 +2204,32 @@ constexpr auto to_ascii(Char c) -> char {
   return c <= 0xff ? static_cast<char>(c) : '\0';
 }
 
+// Returns the length of a code point from its first byte; 0 if it is invalid.
 FMT_CONSTEXPR inline auto code_point_length_impl(char c) -> int {
   return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
       [static_cast<unsigned char>(c) >> 3];
 }
+// Variant of code_point_length_impl that never returns 0: an invalid first
+// byte is reported as length 1. The result is the same as
+//
+//   int len = code_point_length_impl(c);
+//   return len + !len;
+//
+// but it is computed as a two-bit field plus one, so the compiler can see
+// that the length always lies within the range [1, 4].
+FMT_CONSTEXPR inline auto code_point_length_impl_2(char c) -> int {
+  // 32 two-bit fields holding `length - 1`, indexed by the top five bits of
+  // c: fields 0-23 are 0, 24-27 are 1, 28-29 are 2, 30 is 3 and 31 is 0.
+  const unsigned long long lengths_minus_one = 0x3a55000000000000ull;
+  const int shift = 2 * (static_cast<unsigned char>(c) >> 3);
+  return static_cast<int>((lengths_minus_one >> shift) & 0x3) + 1;
+}
 int len = code_point_length_impl(*s); 
 // Compute the pointer to the next character early so that the next 
 // iteration can start working on the next character. Neither Clang 
 // nor GCC figure out this reordering on their own. 
 const char* next = s + len + !len; 
 using uchar = unsigned char; 
 // Assume a four-byte character and load four bytes. Unused bits are 
 // shifted out. 
 *c = uint32_t(uchar(s[0]) & masks[len]) << 18; 
 FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int { 
   if (const_check(sizeof(Char) != 1)) return 1; 
   int len = code_point_length_impl(static_cast<char>(*begin)); 
   // Compute the pointer to the next character early so that the next 
   // iteration can start working on the next character. Neither Clang 
   // nor GCC figure out this reordering on their own. 
   return len + !len; 
 } 
 int len = code_point_length_impl(*s); 
 // Compute the pointer to the next character early so that the next 
 // iteration can start working on the next character. Neither Clang 
 // nor GCC figure out this reordering on their own. 
 const char* next = s + len + !len; 
  
 using uchar = unsigned char; 
  
 // Assume a four-byte character and load four bytes. Unused bits are 
 // shifted out. 
 *c = uint32_t(uchar(s[0]) & masks[len]) << 18; 
 FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int { 
   if (const_check(sizeof(Char) != 1)) return 1; 
   int len = code_point_length_impl(static_cast<char>(*begin)); 
  
   // Compute the pointer to the next character early so that the next 
   // iteration can start working on the next character. Neither Clang 
   // nor GCC figure out this reordering on their own. 
   return len + !len; 
 } 
 
 template <typename Char>
 FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
   if (const_check(sizeof(Char) != 1)) return 1;
-  int len = code_point_length_impl(static_cast<char>(*begin));
-
-  // Compute the pointer to the next character early so that the next
-  // iteration can start working on the next character. Neither Clang
-  // nor GCC figure out this reordering on their own.
-  return len + !len;
+  // code_point_length_impl_2 never returns 0, so the `len + !len` fix-up
+  // that code_point_length_impl required is no longer needed here.
+  return code_point_length_impl_2(static_cast<char>(*begin));
 }
 
 // Return the result via the out param to workaround gcc bug 77539.

diff --git a/test/core-test.cc b/test/core-test.cc
@@ -898,3 +898,18 @@ TEST(core_test, has_const_formatter) {
 TEST(core_test, format_nonconst) {
   EXPECT_EQ(fmt::format("{}", nonconst_formattable()), "test");
 }
+
+TEST(core_test, code_point_length_impl) {
+  // code_point_length_impl_2 must agree with code_point_length_impl for every
+  // char value, except that it yields 1 wherever the latter yields 0, which
+  // keeps the reported length within [1..4].
+  //
+  // The counter is an int so that the loop can step past CHAR_MAX and stop.
+  for (int ch = CHAR_MIN; ch <= CHAR_MAX; ++ch) {
+    const char c = static_cast<char>(ch);
+    const int len1 = fmt::detail::code_point_length_impl(c);
+    const int len2 = fmt::detail::code_point_length_impl_2(c);
+
+    ASSERT_EQ(len1 + !len1, len2);
+  }
+}