Skip to content

Commit

Permalink
Merge pull request #2876 from cloudflare/jsnell/dedup-optimize-ishexd…
Browse files Browse the repository at this point in the history
…igit
  • Loading branch information
jasnell authored Oct 10, 2024
2 parents c9e01d3 + dc975ed commit 1d6b431
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 37 deletions.
3 changes: 2 additions & 1 deletion src/workerd/api/data-url.c++
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "data-url.h"

#include <workerd/api/encoding.h>
#include <workerd/util/strings.h>

#include <kj/encoding.h>

Expand Down Expand Up @@ -39,7 +40,7 @@ kj::Maybe<DataUrl> DataUrl::from(const jsg::Url& url) {
static const auto strip = [](auto label) {
auto result = kj::heapArray<kj::byte>(label.size());
size_t len = 0;
for (auto c: label) {
for (const kj::byte c: label) {
if (!isAsciiWhitespace(c)) {
result[len++] = c;
}
Expand Down
1 change: 1 addition & 0 deletions src/workerd/api/encoding.c++
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "util.h"

#include <workerd/jsg/jsg.h>
#include <workerd/util/strings.h>

#include <unicode/ucnv.h>
#include <unicode/utf8.h>
Expand Down
12 changes: 0 additions & 12 deletions src/workerd/api/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,6 @@

namespace workerd::api {

constexpr kj::FixedArray<uint8_t, 256> ascii_whitespace_table = []() consteval {
kj::FixedArray<uint8_t, 256> result{};
for (uint8_t c: {0x09, 0x0a, 0x0c, 0x0d, 0x20}) {
result[c] = true;
}
return result;
}();

constexpr bool isAsciiWhitespace(uint8_t c) noexcept {
return ascii_whitespace_table[c];
}

// The encodings listed here are defined as required by the Encoding spec.
// The first label is enum we use to identify the encoding in code, while
// the second label is the public identifier.
Expand Down
5 changes: 3 additions & 2 deletions src/workerd/api/pyodide/pyodide.c++
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "pyodide.h"

#include <workerd/util/string-buffer.h>
#include <workerd/util/strings.h>

#include <kj/array.h>
#include <kj/common.h>
Expand Down Expand Up @@ -190,13 +191,13 @@ kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::St
// We also accept `.` because import idents can contain it.
// TODO: We don't currently support unicode, but if we see packages that utilise it we will
// implement that support.
if (std::isdigit(str[start])) {
if (isDigit(str[start])) {
return 0;
}
int i = 0;
for (; start + i < str.size(); i++) {
char c = str[start + i];
bool validIdentChar = std::isalpha(c) || std::isdigit(c) || c == '_' || c == '.';
bool validIdentChar = isAlpha(c) || isDigit(c) || c == '_' || c == '.';
if (!validIdentChar) {
return i;
}
Expand Down
23 changes: 13 additions & 10 deletions src/workerd/api/util.c++
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "simdutf.h"

#include <workerd/util/mimetype.h>
#include <workerd/util/strings.h>

#include <kj/encoding.h>

Expand Down Expand Up @@ -213,28 +214,30 @@ kj::String redactUrl(kj::StringPtr url) {
};

for (const char& c: url) {
bool isUpper = ('A' <= c && c <= 'Z');
bool isLower = ('a' <= c && c <= 'z');
bool isDigit = ('0' <= c && c <= '9');
bool isHexDigit = (isDigit || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f'));
bool isSep = (c == '+' || c == '-' || c == '_');
uint8_t lookup = kCharLookupTable[c];
bool isSep = lookup & CharAttributeFlag::SEPARATOR;
bool isAlphaUpper = lookup & CharAttributeFlag::UPPER_CASE;
bool isAlphaLower = lookup & CharAttributeFlag::LOWER_CASE;
bool isDigit = lookup & CharAttributeFlag::DIGIT;
bool isHex = lookup & CharAttributeFlag::HEX;

// These extra characters are used in the regular and url-safe versions of
// base64, but might also be used for GUID-style separators in hex ids.
// Regular base64 also includes '/', which we don't try to match here due
// to its prevalence in URLs. Likewise, we ignore the base64 "=" padding
// character.

if (isUpper || isLower || isDigit || isSep) {
if (isHexDigit) {
if (isAlphaUpper || isAlphaLower || isDigit || isSep) {
if (isHex) {
hexDigitCount++;
}
if (!isHexDigit && !isSep) {
if (!isHex && !isSep) {
sawNonHexChar = true;
}
if (isUpper) {
if (isAlphaUpper) {
upperCount++;
}
if (isLower) {
if (isAlphaLower) {
lowerCount++;
}
if (isDigit) {
Expand Down
5 changes: 0 additions & 5 deletions src/workerd/api/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,6 @@ kj::String toLower(kj::String&& str);
// Mutate `str` with all alphabetic ASCII characters uppercased. Returns `str`.
kj::String toUpper(kj::String&& str);

inline bool isHexDigit(uint32_t c) {
// Check if `c` is the ASCII code of a hexadecimal digit.
return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}

// Parse `rawText` as application/x-www-form-urlencoded name/value pairs and store in `query`. If
// `skipLeadingQuestionMark` is true, any initial '?' will be ignored. Otherwise, it will be
// interpreted as part of the first URL-encoded field.
Expand Down
1 change: 1 addition & 0 deletions src/workerd/jsg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ wd_cc_library(
visibility = ["//visibility:public"],
deps = [
":memory-tracker",
"//src/workerd/util:strings",
"@capnp-cpp//src/kj",
],
)
Expand Down
7 changes: 2 additions & 5 deletions src/workerd/jsg/url.c++
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "url.h"

#include <workerd/util/strings.h>

#include <kj/hash.h>

extern "C" {
Expand Down Expand Up @@ -488,11 +490,6 @@ inline bool isAsciiDigit(char c) {
return c >= '0' && c <= '9';
};

inline bool isHexDigit(char c) {
// Check if `c` is the ASCII code of a hexadecimal digit.
return isAsciiDigit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}

inline bool isAscii(char codepoint) {
return codepoint >= 0x00 && codepoint <= 0x7f;
};
Expand Down
72 changes: 70 additions & 2 deletions src/workerd/util/strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,90 @@
// https://opensource.org/licenses/Apache-2.0
#pragma once

#include <kj/debug.h>
#include <kj/string.h>

namespace workerd {

enum CharAttributeFlag : uint8_t {
NONE = 0,
ALPHA = 1 << 0,
DIGIT = 1 << 1,
HEX = 1 << 2,
ASCII = 1 << 3,
ASCII_WHITESPACE = 1 << 4,
UPPER_CASE = 1 << 5,
LOWER_CASE = 1 << 6,
SEPARATOR = 1 << 7,
};

// Construct a lookup table for various interesting character properties.
constexpr kj::FixedArray<uint8_t, 256> kCharLookupTable = []() consteval {
kj::FixedArray<uint8_t, 256> result{};
for (uint8_t c = 'A'; c <= 'Z'; c++) {
if (c <= 'F') {
result[c] |= CharAttributeFlag::HEX;
result[c + 0x20] |= CharAttributeFlag::HEX;
}
result[c] |= CharAttributeFlag::ALPHA | CharAttributeFlag::UPPER_CASE;
result[c + 0x20] |= CharAttributeFlag::ALPHA | CharAttributeFlag::LOWER_CASE;
}
for (uint8_t c = '0'; c <= '9'; c++) {
result[c] |= CharAttributeFlag::DIGIT | CharAttributeFlag::HEX;
}
for (uint8_t c = 0; c <= 0x7f; c++) {
result[c] |= CharAttributeFlag::ASCII;
}
for (uint8_t c: {0x09, 0x0a, 0x0c, 0x0d, 0x20}) {
result[c] |= CharAttributeFlag::ASCII_WHITESPACE;
}
result['+'] |= CharAttributeFlag::SEPARATOR;
result['-'] |= CharAttributeFlag::SEPARATOR;
result['_'] |= CharAttributeFlag::SEPARATOR;
return result;
}();

constexpr bool isAlpha(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::ALPHA;
}

constexpr bool isDigit(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::DIGIT;
}

// Check if `c` is the ASCII code of a hexadecimal digit.
constexpr bool isHexDigit(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::HEX;
}

constexpr bool isAscii(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::ASCII;
}

constexpr bool isAsciiWhitespace(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::ASCII_WHITESPACE;
}

constexpr bool isAlphaUpper(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::UPPER_CASE;
}

constexpr bool isAlphaLower(const kj::byte c) noexcept {
return kCharLookupTable[c] & CharAttributeFlag::LOWER_CASE;
}

inline kj::String toLowerCopy(kj::StringPtr ptr) {
auto str = kj::str(ptr);
for (char& c: str) {
if ('A' <= c && c <= 'Z') c += 'a' - 'A';
if (isAlphaUpper(c)) c += 0x20;
}
return kj::mv(str);
}

inline kj::String toLowerCopy(kj::ArrayPtr<const char> ptr) {
auto str = kj::str(ptr);
for (char& c: str) {
if ('A' <= c && c <= 'Z') c += 'a' - 'A';
if (isAlphaUpper(c)) c += 0x20;
}
return kj::mv(str);
}
Expand Down

0 comments on commit 1d6b431

Please sign in to comment.