Skip to content

Commit

Permalink
Consolidate and optimize more ascii character checks
Browse files Browse the repository at this point in the history
  • Loading branch information
jasnell committed Oct 10, 2024
1 parent 18ee6a7 commit 0a4ed82
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 43 deletions.
3 changes: 2 additions & 1 deletion src/workerd/api/data-url.c++
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "data-url.h"

#include <workerd/api/encoding.h>
#include <workerd/util/strings.h>

#include <kj/encoding.h>

Expand Down Expand Up @@ -39,7 +40,7 @@ kj::Maybe<DataUrl> DataUrl::from(const jsg::Url& url) {
static const auto strip = [](auto label) {
auto result = kj::heapArray<kj::byte>(label.size());
size_t len = 0;
for (auto c: label) {
for (const char c: label) {
if (!isAsciiWhitespace(c)) {
result[len++] = c;
}
Expand Down
1 change: 1 addition & 0 deletions src/workerd/api/encoding.c++
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "util.h"

#include <workerd/jsg/jsg.h>
#include <workerd/util/strings.h>

#include <unicode/ucnv.h>
#include <unicode/utf8.h>
Expand Down
12 changes: 0 additions & 12 deletions src/workerd/api/encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,6 @@

namespace workerd::api {

// Compile-time table indexed by byte value. An entry is 1 for tab (0x09),
// line feed (0x0a), form feed (0x0c), carriage return (0x0d) and space (0x20),
// and 0 for every other byte.
constexpr kj::FixedArray<uint8_t, 256> ascii_whitespace_table = []() consteval {
  kj::FixedArray<uint8_t, 256> table{};
  table['\t'] = 1;
  table['\n'] = 1;
  table['\f'] = 1;
  table['\r'] = 1;
  table[' '] = 1;
  return table;
}();

// Returns true when the entry for `c` in ascii_whitespace_table is set.
constexpr bool isAsciiWhitespace(uint8_t c) noexcept {
  return ascii_whitespace_table[c] != 0;
}

// The encodings listed here are defined as required by the Encoding spec.
// The first label is enum we use to identify the encoding in code, while
// the second label is the public identifier.
Expand Down
5 changes: 3 additions & 2 deletions src/workerd/api/pyodide/pyodide.c++
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "pyodide.h"

#include <workerd/util/string-buffer.h>
#include <workerd/util/strings.h>

#include <kj/array.h>
#include <kj/common.h>
Expand Down Expand Up @@ -190,13 +191,13 @@ kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::St
// We also accept `.` because import idents can contain it.
// TODO: We don't currently support unicode, but if we see packages that utilise it we will
// implement that support.
if (std::isdigit(str[start])) {
if (isDigit(str[start])) {
return 0;
}
int i = 0;
for (; start + i < str.size(); i++) {
char c = str[start + i];
bool validIdentChar = std::isalpha(c) || std::isdigit(c) || c == '_' || c == '.';
bool validIdentChar = isAlpha(c) || isDigit(c) || c == '_' || c == '.';
if (!validIdentChar) {
return i;
}
Expand Down
21 changes: 12 additions & 9 deletions src/workerd/api/util.c++
Original file line number Diff line number Diff line change
Expand Up @@ -214,27 +214,30 @@ kj::String redactUrl(kj::StringPtr url) {
};

for (const char& c: url) {
bool isUpper = ('A' <= c && c <= 'Z');
bool isLower = ('a' <= c && c <= 'z');
bool isDigit = ('0' <= c && c <= '9');
bool isSep = (c == '+' || c == '-' || c == '_');
uint8_t lookup = kCharLookupTable[c];
bool isSep = lookup & CharAttributeFlag::SEPARATOR;
bool isAlphaUpper = lookup & CharAttributeFlag::UPPER_CASE;
bool isAlphaLower = lookup & CharAttributeFlag::LOWER_CASE;
bool isDigit = lookup & CharAttributeFlag::DIGIT;
bool isHex = lookup & CharAttributeFlag::HEX;

// These extra characters are used in the regular and url-safe versions of
// base64, but might also be used for GUID-style separators in hex ids.
// Regular base64 also includes '/', which we don't try to match here due
// to its prevalence in URLs. Likewise, we ignore the base64 "=" padding
// character.

if (isUpper || isLower || isDigit || isSep) {
if (isHexDigit(c)) {
if (isAlphaUpper || isAlphaLower || isDigit || isSep) {
if (isHex) {
hexDigitCount++;
}
if (!isHexDigit(c) && !isSep) {
if (!isHex && !isSep) {
sawNonHexChar = true;
}
if (isUpper) {
if (isAlphaUpper) {
upperCount++;
}
if (isLower) {
if (isAlphaLower) {
lowerCount++;
}
if (isDigit) {
Expand Down
89 changes: 70 additions & 19 deletions src/workerd/util/strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,92 @@
// https://opensource.org/licenses/Apache-2.0
#pragma once

#include <kj/debug.h>
#include <kj/string.h>

namespace workerd {

// Bit flags recorded per byte value in kCharLookupTable. Each flag occupies
// its own bit so that several properties can be combined in one uint8_t.
enum CharAttributeFlag : uint8_t {
  NONE = 0x00,
  ALPHA = 0x01,             // 'A'-'Z' and 'a'-'z'
  DIGIT = 0x02,             // '0'-'9'
  HEX = 0x04,               // '0'-'9', 'A'-'F' and 'a'-'f'
  ASCII = 0x08,             // 0x00-0x7f
  ASCII_WHITESPACE = 0x10,  // 0x09, 0x0a, 0x0c, 0x0d and 0x20
  UPPER_CASE = 0x20,        // 'A'-'Z'
  LOWER_CASE = 0x40,        // 'a'-'z'
  SEPARATOR = 0x80,         // '+', '-' and '_'
};

// Lookup table, indexed by byte value, holding a bitmask of CharAttributeFlag
// values that describes each of the 256 possible bytes. The table is computed
// entirely at compile time.
//
// The variable is declared `inline` so that every translation unit including
// this header refers to the same table object. A plain namespace-scope
// `constexpr` variable has internal linkage, which would give each translation
// unit its own copy and make the inline predicates below refer to different
// objects depending on where they are compiled.
inline constexpr kj::FixedArray<uint8_t, 256> kCharLookupTable = []() consteval {
  kj::FixedArray<uint8_t, 256> result{};
  for (uint8_t c = 'A'; c <= 'Z'; c++) {
    // 'A'-'F' (and the matching lower-case letters, 0x20 higher) are hex digits.
    if (c <= 'F') {
      result[c] |= CharAttributeFlag::HEX;
      result[c + 0x20] |= CharAttributeFlag::HEX;
    }
    // Adding 0x20 to an upper-case ASCII letter yields its lower-case form.
    result[c] |= CharAttributeFlag::ALPHA | CharAttributeFlag::UPPER_CASE;
    result[c + 0x20] |= CharAttributeFlag::ALPHA | CharAttributeFlag::LOWER_CASE;
  }
  for (uint8_t c = '0'; c <= '9'; c++) {
    result[c] |= CharAttributeFlag::DIGIT | CharAttributeFlag::HEX;
  }
  for (uint8_t c = 0; c <= 0x7f; c++) {
    result[c] |= CharAttributeFlag::ASCII;
  }
  // Tab, line feed, form feed, carriage return and space.
  for (uint8_t c: {0x09, 0x0a, 0x0c, 0x0d, 0x20}) {
    result[c] |= CharAttributeFlag::ASCII_WHITESPACE;
  }
  result['+'] |= CharAttributeFlag::SEPARATOR;
  result['-'] |= CharAttributeFlag::SEPARATOR;
  result['_'] |= CharAttributeFlag::SEPARATOR;
  return result;
}();

// Returns true when `c` carries the ALPHA flag in kCharLookupTable
// ('A'-'Z' or 'a'-'z').
constexpr bool isAlpha(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::ALPHA) != 0;
}

// Returns true when `c` carries the DIGIT flag in kCharLookupTable ('0'-'9').
constexpr bool isDigit(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::DIGIT) != 0;
}

// Returns true when `c` is the ASCII code of a hexadecimal digit, i.e. it
// carries the HEX flag in kCharLookupTable ('0'-'9', 'A'-'F' or 'a'-'f').
constexpr bool isHexDigit(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::HEX) != 0;
}

// Returns true when `c` carries the ASCII flag in kCharLookupTable
// (byte values 0x00 through 0x7f).
constexpr bool isAscii(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::ASCII) != 0;
}

// Returns true when `c` carries the ASCII_WHITESPACE flag in kCharLookupTable
// (tab, line feed, form feed, carriage return or space).
constexpr bool isAsciiWhitespace(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::ASCII_WHITESPACE) != 0;
}

// Returns true when `c` carries the UPPER_CASE flag in kCharLookupTable
// ('A'-'Z').
constexpr bool isAlphaUpper(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::UPPER_CASE) != 0;
}

// Returns true when `c` carries the LOWER_CASE flag in kCharLookupTable
// ('a'-'z').
constexpr bool isAlphaLower(const kj::byte c) noexcept {
  const uint8_t flags = kCharLookupTable[c];
  return (flags & CharAttributeFlag::LOWER_CASE) != 0;
}

// Returns a newly allocated copy of `ptr` in which every ASCII upper-case
// letter ('A'-'Z') is replaced by its lower-case counterpart. All other bytes
// are copied unchanged.
inline kj::String toLowerCopy(kj::StringPtr ptr) {
  auto str = kj::str(ptr);
  for (char& c: str) {
    // Upper- and lower-case ASCII letters differ by exactly 0x20.
    if (isAlphaUpper(c)) c += 0x20;
  }
  // Return the local by name so that copy elision / implicit move applies.
  return str;
}

// Returns a newly allocated string built from the characters of `ptr` in
// which every ASCII upper-case letter ('A'-'Z') is replaced by its lower-case
// counterpart. All other bytes are copied unchanged.
inline kj::String toLowerCopy(kj::ArrayPtr<const char> ptr) {
  auto str = kj::str(ptr);
  for (char& c: str) {
    // Upper- and lower-case ASCII letters differ by exactly 0x20.
    if (isAlphaUpper(c)) c += 0x20;
  }
  // Return the local by name so that copy elision / implicit move applies.
  return str;
}

// Compile-time table indexed by byte value. An entry is 1 for the ASCII
// hexadecimal digits ('0'-'9', 'A'-'F', 'a'-'f') and 0 for every other byte.
constexpr kj::FixedArray<uint8_t, 256> kHexDigitTable = []() consteval {
  kj::FixedArray<uint8_t, 256> table{};
  for (uint8_t digit = '0'; digit <= '9'; ++digit) {
    table[digit] = 1;
  }
  for (uint8_t upper = 'A'; upper <= 'F'; ++upper) {
    table[upper] = 1;         // Uppercase
    table[upper + 0x20] = 1;  // Lowercase
  }
  return table;
}();

// Check if `c` is the ASCII code of a hexadecimal digit.
//
// `char` may be a signed type, in which case bytes >= 0x80 are negative.
// The value is therefore converted to uint8_t before indexing so that the
// lookup always stays within the 256-entry table.
constexpr bool isHexDigit(char c) {
  return kHexDigitTable[static_cast<uint8_t>(c)] == 1;
}

} // namespace workerd

0 comments on commit 0a4ed82

Please sign in to comment.