From 18ee6a75b5c668f3f0d3380d26e1f8816b2f8bd2 Mon Sep 17 00:00:00 2001 From: James M Snell Date: Wed, 9 Oct 2024 11:05:20 -0700 Subject: [PATCH 1/3] Deduplicate multiple isHexDigit impls and optimize --- src/workerd/api/util.c++ | 6 +++--- src/workerd/api/util.h | 5 ----- src/workerd/jsg/BUILD.bazel | 1 + src/workerd/jsg/url.c++ | 7 ++----- src/workerd/util/strings.h | 17 +++++++++++++++++ 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/workerd/api/util.c++ b/src/workerd/api/util.c++ index 03e1b16092a..276f2916da2 100644 --- a/src/workerd/api/util.c++ +++ b/src/workerd/api/util.c++ @@ -7,6 +7,7 @@ #include "simdutf.h" #include +#include #include @@ -216,7 +217,6 @@ kj::String redactUrl(kj::StringPtr url) { bool isUpper = ('A' <= c && c <= 'Z'); bool isLower = ('a' <= c && c <= 'z'); bool isDigit = ('0' <= c && c <= '9'); - bool isHexDigit = (isDigit || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f')); bool isSep = (c == '+' || c == '-' || c == '_'); // These extra characters are used in the regular and url-safe versions of // base64, but might also be used for GUID-style separators in hex ids. @@ -225,10 +225,10 @@ kj::String redactUrl(kj::StringPtr url) { // character. if (isUpper || isLower || isDigit || isSep) { - if (isHexDigit) { + if (isHexDigit(c)) { hexDigitCount++; } - if (!isHexDigit && !isSep) { + if (!isHexDigit(c) && !isSep) { sawNonHexChar = true; } if (isUpper) { diff --git a/src/workerd/api/util.h b/src/workerd/api/util.h index 09edc77812e..5333c807939 100644 --- a/src/workerd/api/util.h +++ b/src/workerd/api/util.h @@ -35,11 +35,6 @@ kj::String toLower(kj::String&& str); // Mutate `str` with all alphabetic ASCII characters uppercased. Returns `str`. kj::String toUpper(kj::String&& str); -inline bool isHexDigit(uint32_t c) { - // Check if `c` is the ASCII code of a hexadecimal digit. - return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'); -} - // Parse `rawText` as application/x-www-form-urlencoded name/value pairs and store in `query`. If // `skipLeadingQuestionMark` is true, any initial '?' will be ignored. Otherwise, it will be // interpreted as part of the first URL-encoded field. diff --git a/src/workerd/jsg/BUILD.bazel b/src/workerd/jsg/BUILD.bazel index b84b00b48b7..01de37e92d5 100644 --- a/src/workerd/jsg/BUILD.bazel +++ b/src/workerd/jsg/BUILD.bazel @@ -92,6 +92,7 @@ wd_cc_library( visibility = ["//visibility:public"], deps = [ ":memory-tracker", + "//src/workerd/util:strings", "@capnp-cpp//src/kj", ], ) diff --git a/src/workerd/jsg/url.c++ b/src/workerd/jsg/url.c++ index 515a43aa4a9..77e258212b7 100644 --- a/src/workerd/jsg/url.c++ +++ b/src/workerd/jsg/url.c++ @@ -1,5 +1,7 @@ #include "url.h" +#include + #include extern "C" { @@ -488,11 +490,6 @@ inline bool isAsciiDigit(char c) { return c >= '0' && c <= '9'; }; -inline bool isHexDigit(char c) { - // Check if `c` is the ASCII code of a hexadecimal digit. - return isAsciiDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'); -} - inline bool isAscii(char codepoint) { return codepoint >= 0x00 && codepoint <= 0x7f; }; diff --git a/src/workerd/util/strings.h b/src/workerd/util/strings.h index c036128db2e..35c80e4c4d6 100644 --- a/src/workerd/util/strings.h +++ b/src/workerd/util/strings.h @@ -23,4 +23,21 @@ inline kj::String toLowerCopy(kj::ArrayPtr ptr) { return kj::mv(str); } +constexpr kj::FixedArray kHexDigitTable = []() consteval { + kj::FixedArray result{}; + for (uint8_t c: {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}) { + result[c] = true; + } + for (uint8_t c: {'A', 'B', 'C', 'D', 'E', 'F'}) { + result[c] = true; // Uppercase + result[c + 0x20] = true; // Lowercase + } + return result; +}(); + +// Check if `c` is the ASCII code of a hexadecimal digit. +constexpr bool isHexDigit(char c) { + return kHexDigitTable[static_cast(c)] == 1; +} + } // namespace workerd From 0a4ed822b18431b2131c65f5a34ea9c98bce778a Mon Sep 17 00:00:00 2001 From: James M Snell Date: Wed, 9 Oct 2024 12:52:05 -0700 Subject: [PATCH 2/3] Consolidate and optimize more ascii character checks --- src/workerd/api/data-url.c++ | 3 +- src/workerd/api/encoding.c++ | 1 + src/workerd/api/encoding.h | 12 ---- src/workerd/api/pyodide/pyodide.c++ | 5 +- src/workerd/api/util.c++ | 21 ++++--- src/workerd/util/strings.h | 89 +++++++++++++++++++++++------ 6 files changed, 88 insertions(+), 43 deletions(-) diff --git a/src/workerd/api/data-url.c++ b/src/workerd/api/data-url.c++ index 61808802b0a..42d55b9509a 100644 --- a/src/workerd/api/data-url.c++ +++ b/src/workerd/api/data-url.c++ @@ -1,6 +1,7 @@ #include "data-url.h" #include +#include #include @@ -39,7 +40,7 @@ kj::Maybe DataUrl::from(const jsg::Url& url) { static const auto strip = [](auto label) { auto result = kj::heapArray(label.size()); size_t len = 0; - for (auto c: label) { + for (const char c: label) { if (!isAsciiWhitespace(c)) { result[len++] = c; } diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 7186ad837a1..2ef849c7abb 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -7,6 +7,7 @@ #include "util.h" #include +#include #include #include diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h index 420ec1c7b53..1dc24be2257 100644 --- a/src/workerd/api/encoding.h +++ b/src/workerd/api/encoding.h @@ -11,18 +11,6 @@ namespace workerd::api { -constexpr kj::FixedArray ascii_whitespace_table = []() consteval { - kj::FixedArray result{}; - for (uint8_t c: {0x09, 0x0a, 0x0c, 0x0d, 0x20}) { - result[c] = true; - } - return result; -}(); - -constexpr bool isAsciiWhitespace(uint8_t c) noexcept { - return ascii_whitespace_table[c]; -} - // The encodings listed here are defined as required by the Encoding spec. // The first label is enum we use to identify the encoding in code, while // the second label is the public identifier. diff --git a/src/workerd/api/pyodide/pyodide.c++ b/src/workerd/api/pyodide/pyodide.c++ index 9c8a5884aab..b09b1b2aecc 100644 --- a/src/workerd/api/pyodide/pyodide.c++ +++ b/src/workerd/api/pyodide/pyodide.c++ @@ -4,6 +4,7 @@ #include "pyodide.h" #include +#include #include #include @@ -190,13 +191,13 @@ kj::Array ArtifactBundler::parsePythonScriptImports(kj::Array