Skip to content

Commit

Permalink
LibUnicode: Tweak code point general category APIs
Browse files Browse the repository at this point in the history
The motivation is to have `code_point_has_*_general_category()`
functions, so that clients don't have to look at `GeneralCategory`
values directly and hence don't have to worry about
`code_point_has_general_category()` returning an empty Optional. This
isn't quite achieved since some code inside LibLocale still uses some of
them, but it removes uses in LibJS, LibRegex, and LibWeb.

This kind of cherry-picks the 2nd commit of
LadybirdBrowser/ladybird#239 (aa3a30870b58c47cb37bce1418d7e6bee7af71d9):
The Lexer.cpp, RegexByteCode.cpp, CharacterTypes.h, KeyboardEvent.cpp
changes are by trflynn.

This does not (yet?) pick up the new type for `GeneralCategory`. In the
future, we might want to make the generator generate a
`GeneralCategoryEnum` type and have the
`AK_TYPEDEF_DISTINCT_NUMERIC_GENERAL` in Forward.h, but at the moment
that makes things more complicated. (It also means we don't pick up the
RegexParser.cpp changes.)

Co-authored-by: Tim Flynn <trflynn89@serenityos.org>
  • Loading branch information
2 people authored and nico committed Nov 26, 2024
1 parent 5311895 commit bf74a49
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 15 deletions.
6 changes: 1 addition & 5 deletions Userland/Libraries/LibJS/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,11 +518,7 @@ bool Lexer::is_whitespace() const
auto code_point = current_code_point();
if (code_point == NO_BREAK_SPACE || code_point == ZERO_WIDTH_NO_BREAK_SPACE)
return true;

static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv);
if (space_separator_category.has_value())
return Unicode::code_point_has_general_category(code_point, *space_separator_category);
return false;
return Unicode::code_point_has_space_separator_general_category(code_point);
}

// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
Expand Down
6 changes: 1 addition & 5 deletions Userland/Libraries/LibRegex/RegexByteCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -776,15 +776,11 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& inp
bool OpCode_Compare::matches_character_class(CharClass character_class, u32 ch, bool insensitive)
{
constexpr auto is_space_or_line_terminator = [](u32 code_point) {
static auto space_separator = Unicode::general_category_from_string("Space_Separator"sv);
if (!space_separator.has_value())
return is_ascii_space(code_point);

if ((code_point == 0x0a) || (code_point == 0x0d) || (code_point == 0x2028) || (code_point == 0x2029))
return true;
if ((code_point == 0x09) || (code_point == 0x0b) || (code_point == 0x0c) || (code_point == 0xfeff))
return true;
return Unicode::code_point_has_general_category(code_point, *space_separator);
return Unicode::code_point_has_space_separator_general_category(code_point);
};

switch (character_class) {
Expand Down
19 changes: 19 additions & 0 deletions Userland/Libraries/LibUnicode/CharacterTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,25 @@ template Optional<size_t> find_ignoring_case(Utf32View, Utf32View);

// Weak fallback definitions for the general category lookups. Because they are weak symbols, a strong
// definition elsewhere replaces them at link time (presumably the generated Unicode data - TODO confirm).
// The fallbacks know no categories: the name lookup yields an empty Optional and the membership test
// yields a value-initialized bool, i.e. false.
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }

bool code_point_has_control_general_category(u32 code_point)
{
#if ENABLE_UNICODE_DATA
return code_point_has_general_category(code_point, Unicode::GeneralCategory::Control);
#else
return false;
#endif
}

// Reports whether the given code point is in the Unicode "Space_Separator" general category.
//
// Builds without Unicode data cannot consult the category tables, so they answer with the ASCII-only
// is_ascii_space() check instead.
// NOTE(review): is_ascii_space() presumably also accepts ASCII whitespace controls (tab, newline, ...),
// which would make the fallback broader than Space_Separator - confirm callers are fine with that.
bool code_point_has_space_separator_general_category(u32 code_point)
{
#if !ENABLE_UNICODE_DATA
    return is_ascii_space(code_point);
#else
    return code_point_has_general_category(code_point, Unicode::GeneralCategory::Space_Separator);
#endif
}

// Weak fallback definitions for the property lookups. Because they are weak symbols, a strong definition
// elsewhere replaces them at link time (presumably the generated Unicode data - TODO confirm).
// The fallbacks know no properties: the name lookup yields an empty Optional and the membership test
// yields a value-initialized bool, i.e. false.
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; }

Expand Down
3 changes: 3 additions & 0 deletions Userland/Libraries/LibUnicode/CharacterTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ Optional<size_t> find_ignoring_case(ViewType, ViewType);
Optional<GeneralCategory> general_category_from_string(StringView);
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);

// Convenience wrappers around code_point_has_general_category() for one fixed category each, so that
// callers do not need to obtain a GeneralCategory value themselves.
// Returns whether the code point is in the Control general category; always false when built without
// ENABLE_UNICODE_DATA.
bool code_point_has_control_general_category(u32 code_point);
// Returns whether the code point is in the Space_Separator general category; falls back to
// is_ascii_space() when built without ENABLE_UNICODE_DATA.
bool code_point_has_space_separator_general_category(u32 code_point);

Optional<Property> property_from_string(StringView);
bool code_point_has_property(u32 code_point, Property property);

Expand Down
6 changes: 1 addition & 5 deletions Userland/Libraries/LibWeb/UIEvents/KeyboardEvent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,11 +246,7 @@ static ErrorOr<Optional<String>> get_event_key_string(u32 code_point)
auto is_non_control_character = [&]() {
// A non-control character is any valid Unicode character except those that are part of the "Other, Control"
// ("Cc") General Category.
static auto control_general_category = Unicode::general_category_from_string("Cc"sv);
if (!control_general_category.has_value())
return true;

return !Unicode::code_point_has_general_category(code_point, *control_general_category);
return !Unicode::code_point_has_control_general_category(code_point);
};

// A key string is a string containing a 0 or 1 non-control characters ("base" characters) followed by 0 or more
Expand Down

0 comments on commit bf74a49

Please sign in to comment.