Consolidate and optimize more ascii character checks

cloudflare · Oct 10, 2024 · 0a4ed82 · 0a4ed82
1 parent 18ee6a7
commit 0a4ed82
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 43 deletions.
diff --git a/src/workerd/api/data-url.c++ b/src/workerd/api/data-url.c++
@@ -1,6 +1,7 @@
 #include "data-url.h"
 
 #include <workerd/api/encoding.h>
+#include <workerd/util/strings.h>
 
 #include <kj/encoding.h>
 
@@ -39,7 +40,7 @@ kj::Maybe<DataUrl> DataUrl::from(const jsg::Url& url) {
   static const auto strip = [](auto label) {
     auto result = kj::heapArray<kj::byte>(label.size());
     size_t len = 0;
-    for (auto c: label) {
+    for (const char c: label) {
       if (!isAsciiWhitespace(c)) {
         result[len++] = c;
       }

diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++
@@ -7,6 +7,7 @@
 #include "util.h"
 
 #include <workerd/jsg/jsg.h>
+#include <workerd/util/strings.h>
 
 #include <unicode/ucnv.h>
 #include <unicode/utf8.h>

diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h
@@ -11,18 +11,6 @@
 
 namespace workerd::api {
 
-constexpr kj::FixedArray<uint8_t, 256> ascii_whitespace_table = []() consteval {
-  kj::FixedArray<uint8_t, 256> result{};
-  for (uint8_t c: {0x09, 0x0a, 0x0c, 0x0d, 0x20}) {
-    result[c] = true;
-  }
-  return result;
-}();
-
-constexpr bool isAsciiWhitespace(uint8_t c) noexcept {
-  return ascii_whitespace_table[c];
-}
-
 // The encodings listed here are defined as required by the Encoding spec.
 // The first label is enum we use to identify the encoding in code, while
 // the second label is the public identifier.

diff --git a/src/workerd/api/pyodide/pyodide.c++ b/src/workerd/api/pyodide/pyodide.c++
@@ -4,6 +4,7 @@
 #include "pyodide.h"
 
 #include <workerd/util/string-buffer.h>
+#include <workerd/util/strings.h>
 
 #include <kj/array.h>
 #include <kj/common.h>
@@ -190,13 +191,13 @@ kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::St
       // We also accept `.` because import idents can contain it.
       // TODO: We don't currently support unicode, but if we see packages that utilise it we will
       // implement that support.
-      if (std::isdigit(str[start])) {
+      if (isDigit(str[start])) {
         return 0;
       }
       int i = 0;
       for (; start + i < str.size(); i++) {
         char c = str[start + i];
-        bool validIdentChar = std::isalpha(c) || std::isdigit(c) || c == '_' || c == '.';
+        bool validIdentChar = isAlpha(c) || isDigit(c) || c == '_' || c == '.';
         if (!validIdentChar) {
           return i;
         }

diff --git a/src/workerd/api/util.c++ b/src/workerd/api/util.c++
@@ -214,27 +214,30 @@ kj::String redactUrl(kj::StringPtr url) {
   };
 
   for (const char& c: url) {
-    bool isUpper = ('A' <= c && c <= 'Z');
-    bool isLower = ('a' <= c && c <= 'z');
-    bool isDigit = ('0' <= c && c <= '9');
-    bool isSep = (c == '+' || c == '-' || c == '_');
+    uint8_t lookup = kCharLookupTable[static_cast<uint8_t>(c)];  // char may be signed
+    bool isSep = lookup & CharAttributeFlag::SEPARATOR;          // '+', '-' or '_'
+    bool isAlphaUpper = lookup & CharAttributeFlag::UPPER_CASE;  // 'A'-'Z'
+    bool isAlphaLower = lookup & CharAttributeFlag::LOWER_CASE;  // 'a'-'z'
+    bool isDigit = lookup & CharAttributeFlag::DIGIT;            // '0'-'9'
+    bool isHex = lookup & CharAttributeFlag::HEX;                // '0'-'9', 'A'-'F', 'a'-'f'
+
     // These extra characters are used in the regular and url-safe versions of
     // base64, but might also be used for GUID-style separators in hex ids.
     // Regular base64 also includes '/', which we don't try to match here due
     // to its prevalence in URLs.  Likewise, we ignore the base64 "=" padding
     // character.
 
-    if (isUpper || isLower || isDigit || isSep) {
-      if (isHexDigit(c)) {
+    if (isAlphaUpper || isAlphaLower || isDigit || isSep) {
+      if (isHex) {
         hexDigitCount++;
       }
-      if (!isHexDigit(c) && !isSep) {
+      if (!isHex && !isSep) {
         sawNonHexChar = true;
       }
-      if (isUpper) {
+      if (isAlphaUpper) {
         upperCount++;
       }
-      if (isLower) {
+      if (isAlphaLower) {
         lowerCount++;
       }
       if (isDigit) {

diff --git a/src/workerd/util/strings.h b/src/workerd/util/strings.h
@@ -3,41 +3,92 @@
 //     https://opensource.org/licenses/Apache-2.0
 #pragma once
 
+#include <kj/debug.h>
 #include <kj/string.h>
 
 namespace workerd {
 
+enum CharAttributeFlag : uint8_t {  // Bit flags stored per byte in kCharLookupTable.
+  NONE = 0,                   // no attribute bits set
+  ALPHA = 1 << 0,             // 'A'-'Z' and 'a'-'z'
+  DIGIT = 1 << 1,             // '0'-'9'
+  HEX = 1 << 2,               // '0'-'9', 'A'-'F', 'a'-'f'
+  ASCII = 1 << 3,             // 0x00-0x7f
+  ASCII_WHITESPACE = 1 << 4,  // 0x09, 0x0a, 0x0c, 0x0d, 0x20
+  UPPER_CASE = 1 << 5,        // 'A'-'Z'
+  LOWER_CASE = 1 << 6,        // 'a'-'z'
+  SEPARATOR = 1 << 7,         // '+', '-', '_'
+};
+
+// Per-byte CharAttributeFlag bitmask table; `inline` so every TU shares one definition.
+inline constexpr kj::FixedArray<uint8_t, 256> kCharLookupTable = []() consteval {
+  kj::FixedArray<uint8_t, 256> result{};  // value-initialized: every entry starts as NONE
+  for (uint8_t c = 'A'; c <= 'Z'; c++) {
+    if (c <= 'F') {
+      result[c] |= CharAttributeFlag::HEX;         // 'A'-'F'
+      result[c + 0x20] |= CharAttributeFlag::HEX;  // 'a'-'f' (lowercase == uppercase + 0x20)
+    }
+    result[c] |= CharAttributeFlag::ALPHA | CharAttributeFlag::UPPER_CASE;
+    result[c + 0x20] |= CharAttributeFlag::ALPHA | CharAttributeFlag::LOWER_CASE;
+  }
+  for (uint8_t c = '0'; c <= '9'; c++) {
+    result[c] |= CharAttributeFlag::DIGIT | CharAttributeFlag::HEX;
+  }
+  for (uint8_t c = 0; c <= 0x7f; c++) {  // the 7-bit ASCII range
+    result[c] |= CharAttributeFlag::ASCII;
+  }
+  for (uint8_t c: {0x09, 0x0a, 0x0c, 0x0d, 0x20}) {  // tab, LF, FF, CR, space
+    result[c] |= CharAttributeFlag::ASCII_WHITESPACE;
+  }
+  result['+'] |= CharAttributeFlag::SEPARATOR;
+  result['-'] |= CharAttributeFlag::SEPARATOR;
+  result['_'] |= CharAttributeFlag::SEPARATOR;
+  return result;
+}();
+
+constexpr bool isAlpha(const kj::byte c) noexcept {  // True for 'A'-'Z' and 'a'-'z'.
+  return (kCharLookupTable[c] & CharAttributeFlag::ALPHA) != 0;
+}
+
+constexpr bool isDigit(const kj::byte c) noexcept {  // True for '0'-'9'.
+  return (kCharLookupTable[c] & CharAttributeFlag::DIGIT) != 0;
+}
+
+// True when `c` is the ASCII code of a hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
+constexpr bool isHexDigit(const kj::byte c) noexcept {
+  return (kCharLookupTable[c] & CharAttributeFlag::HEX) != 0;
+}
+
+constexpr bool isAscii(const kj::byte c) noexcept {  // True for bytes 0x00-0x7f.
+  return (kCharLookupTable[c] & CharAttributeFlag::ASCII) != 0;
+}
+
+constexpr bool isAsciiWhitespace(const kj::byte c) noexcept {  // tab, LF, FF, CR or space
+  return (kCharLookupTable[c] & CharAttributeFlag::ASCII_WHITESPACE) != 0;
+}
+
+constexpr bool isAlphaUpper(const kj::byte c) noexcept {  // True for 'A'-'Z'.
+  return (kCharLookupTable[c] & CharAttributeFlag::UPPER_CASE) != 0;
+}
+
+constexpr bool isAlphaLower(const kj::byte c) noexcept {  // True for 'a'-'z'.
+  return (kCharLookupTable[c] & CharAttributeFlag::LOWER_CASE) != 0;
+}
+
 inline kj::String toLowerCopy(kj::StringPtr ptr) {
   auto str = kj::str(ptr);
   for (char& c: str) {
-    if ('A' <= c && c <= 'Z') c += 'a' - 'A';
+    if (isAlphaUpper(c)) c += 0x20;  // 0x20 == 'a' - 'A'; only 'A'-'Z' are changed
   }
   return kj::mv(str);
 }
 
 inline kj::String toLowerCopy(kj::ArrayPtr<const char> ptr) {
   auto str = kj::str(ptr);
   for (char& c: str) {
-    if ('A' <= c && c <= 'Z') c += 'a' - 'A';
+    if (isAlphaUpper(c)) c += 0x20;  // 0x20 == 'a' - 'A'; only 'A'-'Z' are changed
   }
   return kj::mv(str);
 }
 
-constexpr kj::FixedArray<uint8_t, 256> kHexDigitTable = []() consteval {
-  kj::FixedArray<uint8_t, 256> result{};
-  for (uint8_t c: {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}) {
-    result[c] = true;
-  }
-  for (uint8_t c: {'A', 'B', 'C', 'D', 'E', 'F'}) {
-    result[c] = true;         // Uppercase
-    result[c + 0x20] = true;  // Lowercase
-  }
-  return result;
-}();
-
-// Check if `c` is the ASCII code of a hexadecimal digit.
-constexpr bool isHexDigit(char c) {
-  return kHexDigitTable[static_cast<int>(c)] == 1;
-}
-
 }  // namespace workerd