Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Unicode property escape support to RegExp #1295

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e906034
Add Unicode properties support to genUnicodeTable.py
Feb 6, 2024
fe08a42
Generate Unicode data tables.
Feb 6, 2024
349bb05
Bump Unicode Database version to 15.1.0
Mar 30, 2024
fc556df
Generate Unicode data tables.
Mar 30, 2024
13a7b45
Helper to add Unicode property ranges to a CodePointSet
Feb 6, 2024
8eceeba
Replace Unicode digit and connector punctuation usage
Feb 6, 2024
ab1d616
Add support for Unicode property names to the RegExp parser
Feb 6, 2024
3bd76fd
Tests for Unicode property names in RegExp
Feb 6, 2024
9b207c8
Update RegExp docs to include Unicode property name support
Feb 6, 2024
9cceed9
Improved structures for Unicode properties from genUnicodeTable.py
Feb 15, 2024
c10a01b
Generate Unicode data tables with improved structures
Feb 15, 2024
90a7228
Modify addUnicodePropertyRanges to use new structures
Feb 15, 2024
bf74303
Avoid dynamic reallocation in genUnicodeTable.py Unicode properties
Feb 23, 2024
e3bca6e
Generate Unicode data tables
Feb 23, 2024
cacddc7
Add RegExp Unicode tests for compound General_Category properties
Feb 23, 2024
42592b2
Modify addUnicodePropertyRanges to use offset and size structures
Feb 23, 2024
a689255
Add a CMake option to disable RegExp Unicode property escapes
Mar 13, 2024
53cc304
Skip tests for Unicode property escapes when the feature is disabled
Mar 13, 2024
e3e2d50
Add more docstrings and high-level comments to genUnicodeTable.py
Mar 30, 2024
0b92af0
Manually format range pool output to 3 columns in genUnicodeTable.py
Mar 31, 2024
b31c60b
Generate Unicode data tables
Mar 31, 2024
1b14949
Remove the need for `findRangeMapEntry`
Apr 1, 2024
0f71098
Prefer to pass `string_view` by value
Apr 1, 2024
f4388a0
Add a comment about Script_Extensions sharing names with Scripts
Apr 1, 2024
f1d0057
Treat an empty property name as a parser error right away
Apr 1, 2024
e35ea31
Assert the "not unicode" pre-condition when parsing a Unicode propert…
Apr 1, 2024
68e77a9
Use ArrayRef in addUnicodePropertyRanges to simplify the usage
Apr 1, 2024
10c1264
Generate contiguous script and script extension ranges
Apr 3, 2024
55a36b8
Generate Unicode data tables
Apr 3, 2024
2dcc669
Split the responsibility of `addUnicodeProperty`
Apr 3, 2024
339f5c9
Add more test cases for script extensions
Apr 3, 2024
8ce3a2f
Make the logical flow of `unicodePropertyRanges` more linear
Apr 6, 2024
64a18e7
Replace unnecessary bitfields with standard types
May 4, 2024
b372992
Expand split string of binary property names to a literal list
May 4, 2024
3b223ab
Add instructions for the use of genUnicodeTable.py
May 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,9 @@ set(EMSCRIPTEN_FASTCOMP OFF CACHE BOOL
set(HERMES_ENABLE_INTL OFF CACHE BOOL
"Enable JS Intl support (WIP)")

# Build-time switch for RegExp Unicode property escapes (\p{...} / \P{...}).
# Defaults to ON; when ON the HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES
# preprocessor definition is added further down in this file.
set(HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES ON CACHE BOOL
"Enable RegExp Unicode Property Escapes support")

set(HERMES_ENABLE_TEST_SUITE ON CACHE BOOL
"Enable the test suite")

Expand Down Expand Up @@ -470,6 +473,10 @@ if (HERMES_ENABLE_INTL)
add_definitions(-DHERMES_ENABLE_INTL)
endif()

# Expose the option to C++ as a preprocessor definition so sources can select
# between the real property-escape implementation and the empty stubs.
if (HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES)
add_definitions(-DHERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES)
endif()

if (HERMES_ENABLE_WERROR)
# Turn all warnings into errors on GCC-compatible compilers.
if (GCC_COMPATIBLE)
Expand Down
3 changes: 0 additions & 3 deletions doc/RegExp.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,4 @@ As of this writing, Hermes regexp supports
1. All of ES6, including global, case-insensitive, multiline, sticky, and Unicode (and legacy).
1. ES9 lookbehinds.
1. Named capture groups.

Missing features from ES9 include:

1. Unicode property escapes.
neildhar marked this conversation as resolved.
Show resolved Hide resolved
28 changes: 28 additions & 0 deletions include/hermes/Platform/Unicode/CharacterProperties.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@

#include <cassert>
#include <cstdint>
#include <string>

#include "llvh/ADT/ArrayRef.h"

namespace hermes {

Expand Down Expand Up @@ -96,6 +99,16 @@ inline bool isUnicodeIDContinue(uint32_t cp) {
cp == UNICODE_ZWNJ || cp == UNICODE_ZWJ;
}

/// \return true if \p ch may appear in a Unicode property name, i.e. it is an
/// underscore or an ASCII letter (either case).
inline bool isUnicodePropertyName(uint32_t ch) {
  if (ch == '_') {
    return true;
  }
  // Setting bit 5 folds 'A'..'Z' onto 'a'..'z', so one range test covers both
  // cases.
  const uint32_t folded = ch | 0x20;
  return 'a' <= folded && folded <= 'z';
}

/// \return true if \p ch may appear in a Unicode property value.
/// The ECMAScript grammar defines UnicodePropertyValueCharacter as a
/// UnicodePropertyNameCharacter or a DecimalDigit, and DecimalDigit is only
/// the ASCII digits 0-9. The digit test is therefore an explicit ASCII range
/// check rather than a general Unicode digit classification.
inline bool isUnicodePropertyValue(uint32_t ch) {
  return isUnicodePropertyName(ch) || (ch >= '0' && ch <= '9');
}

/// \return the canonicalized value of \p cp, following ES9 21.2.2.8.2.
uint32_t canonicalize(uint32_t cp, bool unicode);

Expand All @@ -104,6 +117,21 @@ class CodePointSet;
/// any character in \p set, following ES9 21.2.2.8.2.
CodePointSet makeCanonicallyEquivalent(const CodePointSet &set, bool unicode);

struct UnicodeRangePoolRef;

/// Create a codepoint range array from a Unicode \p propertyName and \p
/// propertyValue.
/// If \p propertyValue is empty, \p propertyName is looked up as a binary
/// property name and then as a General_Category value. Otherwise
/// \p propertyName must be one of General_Category/gc, Script/sc or
/// Script_Extensions/scx and \p propertyValue is looked up within it.
/// \return the matching range array, or an empty ArrayRef when nothing
/// matches. Builds without HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES
/// always return an empty ArrayRef.
llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges(
std::string_view propertyName,
std::string_view propertyValue);

/// Add a codepoint range array of codepoints to \p receiver, typically used in
/// conjunction with unicodePropertyRanges.
/// \param receiver the set that receives the code points.
/// \param rangeArrayPool the range array to add, e.g. the result of
///   unicodePropertyRanges.
/// \param inverted whether the ranges are complemented before being added
///   (\P rather than \p).
/// Builds without HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES provide a
/// no-op implementation.
void addRangeArrayPoolToBracket(
CodePointSet *receiver,
const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool,
bool inverted);

} // namespace hermes

#endif // HERMES_PLATFORMUNICODE_CHARACTERPROPERTIES_H
6 changes: 6 additions & 0 deletions include/hermes/Regex/RegexNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,12 @@ class BracketNode : public Node {
classes_.push_back(cls);
}

/// Add the code points described by \p rangeArray to this bracket's code point
/// set by forwarding to addRangeArrayPoolToBracket. \p inverted is passed
/// through unchanged and defaults to false.
void addCodePointRanges(
llvh::ArrayRef<UnicodeRangePoolRef> rangeArray,
bool inverted = false) {
addRangeArrayPoolToBracket(&codePointSet_, rangeArray, inverted);
}

virtual MatchConstraintSet matchConstraints() const override {
MatchConstraintSet result = 0;
if (!canMatchASCII())
Expand Down
23 changes: 22 additions & 1 deletion include/hermes/Regex/RegexTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
#include "llvh/ADT/SmallString.h"

namespace hermes {

struct UnicodeRangePoolRef;

namespace regex {
namespace constants {

Expand Down Expand Up @@ -122,7 +125,10 @@ enum class ErrorType {
InvalidNamedReference,

/// Reference to nonexistent capture group.
NonexistentNamedCaptureReference
NonexistentNamedCaptureReference,

/// Invalid Unicode property name or value
InvalidPropertyName,
};

/// \return an error message for the given \p error.
Expand Down Expand Up @@ -158,6 +164,8 @@ inline const char *messageForError(ErrorType error) {
return "Invalid named reference";
case ErrorType::NonexistentNamedCaptureReference:
return "Nonexistent named capture reference";
case ErrorType::InvalidPropertyName:
return "Invalid property name";
case ErrorType::None:
return "No error";
}
Expand Down Expand Up @@ -209,6 +217,19 @@ struct CharacterClass {
CharacterClass(Type type, bool invert) : type_(type), inverted_(invert) {}
};

/// Pairs a Unicode codepoint range array with the polarity of the escape that
/// selected it.
struct CharacterClassCodepointRanges {
  /// The wrapped codepoint range array.
  llvh::ArrayRef<UnicodeRangePoolRef> rangeArray;

  /// True when the class is inverted (\P instead of \p).
  bool inverted_;

  /// Construct a wrapper around \p ranges with the given polarity \p invert.
  CharacterClassCodepointRanges(
      llvh::ArrayRef<UnicodeRangePoolRef> ranges,
      bool invert)
      : rangeArray(ranges), inverted_(invert) {}
};

// Struct representing flags which may be used when constructing the RegExp
class SyntaxFlags {
private:
Expand Down
3 changes: 3 additions & 0 deletions lib/CompilerDriver/CompilerDriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2184,6 +2184,9 @@ void printHermesVersion(
#endif
#ifdef HERMESVM_CONTIGUOUS_HEAP
<< " Contiguous Heap\n"
#endif
#ifdef HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES
<< " Unicode RegExp Property Escapes\n"
#endif
<< " Zip file input\n";
}
Expand Down
126 changes: 126 additions & 0 deletions lib/Platform/Unicode/CharacterProperties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <algorithm>
#include <climits>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

namespace hermes {
Expand Down Expand Up @@ -210,4 +211,129 @@ uint32_t canonicalize(uint32_t cp, bool unicode) {
}
}

#ifdef HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES

/// Binary-search \p arrayRef for the entry whose \p name field (stored as an
/// offset/size pair into UNICODE_DATA_STRING_POOL) equals \p name. Works for
/// any entry type with such a field, e.g. \p NameMapEntry or \p RangeMapEntry.
/// \return a pointer to the matching entry, or nullptr if there is none.
template <class T>
static const T *findMapEntry(
    const llvh::ArrayRef<T> &arrayRef,
    const std::string_view name) {
  // Three-way comparison of an entry's pooled name against a lookup key.
  const auto compareEntryName = [](const T &entry, std::string_view key) {
    return UNICODE_DATA_STRING_POOL.compare(
        entry.name.offset, entry.name.size, key);
  };

  const auto first = std::begin(arrayRef);
  const auto last = std::end(arrayRef);
  const auto candidate = std::lower_bound(
      first,
      last,
      name,
      [&compareEntryName](const T &entry, std::string_view key) {
        return compareEntryName(entry, key) < 0;
      });

  // lower_bound yields the first entry not ordered before the key; it is a
  // hit only if it exists and compares equal.
  if (candidate != last && compareEntryName(*candidate, name) == 0) {
    return candidate;
  }
  return nullptr;
}

/// Resolve a Unicode property escape to its codepoint range array.
/// With an empty \p propertyValue, \p propertyName is tried as a binary
/// property and then as a General_Category value. With a non-empty
/// \p propertyValue, \p propertyName selects the category (General_Category,
/// Script or Script_Extensions, or their short aliases) in which the value is
/// looked up.
/// \return the range array for the property, or an empty ArrayRef if the name
/// or value is unknown.
llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges(
    std::string_view propertyName,
    std::string_view propertyValue) {
  const NameMapEntry *nameEntry = nullptr;
  llvh::ArrayRef<RangeMapEntry> ranges;

  if (!propertyValue.empty()) {
    // `name=value` form: the name picks the category, the value is resolved
    // through that category's canonical name map.
    if (propertyName == "General_Category" || propertyName == "gc") {
      nameEntry = findMapEntry(
          llvh::ArrayRef(canonicalPropertyNameMap_GeneralCategory),
          propertyValue);
      ranges = unicodePropertyRangeMap_GeneralCategory;
    } else if (propertyName == "Script" || propertyName == "sc") {
      nameEntry = findMapEntry(
          llvh::ArrayRef(canonicalPropertyNameMap_Script), propertyValue);
      ranges = unicodePropertyRangeMap_Script;
    } else if (propertyName == "Script_Extensions" || propertyName == "scx") {
      // Script_Extensions is a superset of Script, so both share one name
      // map; only the range map differs.
      nameEntry = findMapEntry(
          llvh::ArrayRef(canonicalPropertyNameMap_Script), propertyValue);
      ranges = unicodePropertyRangeMap_ScriptExtensions;
    }
  } else {
    // Lone name: either a binary property or a General_Category value, as
    // per `LoneUnicodePropertyNameOrValue`.
    nameEntry = findMapEntry(
        llvh::ArrayRef(canonicalPropertyNameMap_BinaryProperty), propertyName);
    if (nameEntry != nullptr) {
      ranges = unicodePropertyRangeMap_BinaryProperty;
    } else {
      nameEntry = findMapEntry(
          llvh::ArrayRef(canonicalPropertyNameMap_GeneralCategory),
          propertyName);
      ranges = unicodePropertyRangeMap_GeneralCategory;
    }
  }

  // Unknown category, name or value.
  if (nameEntry == nullptr) {
    return llvh::ArrayRef<UnicodeRangePoolRef>();
  }

  // The range maps are keyed by canonical name, so translate first.
  const auto canonicalName = UNICODE_DATA_STRING_POOL.substr(
      nameEntry->canonical.offset, nameEntry->canonical.size);
  const auto rangeEntry = findMapEntry(ranges, canonicalName);
  if (rangeEntry == nullptr) {
    return llvh::ArrayRef<UnicodeRangePoolRef>();
  }

  return llvh::ArrayRef{
      &UNICODE_RANGE_ARRAY_POOL[rangeEntry->rangeArrayPoolOffset],
      rangeEntry->rangeArraySize};
}

/// Add the code points described by \p rangeArrayPool to \p receiver.
///
/// Every UnicodeRangePoolRef selects a run of inclusive [first, second] ranges
/// inside UNICODE_RANGE_POOL. With \p inverted false those ranges are added
/// unchanged. With \p inverted true the complement, within
/// [0, UNICODE_MAX_VALUE], of the union of all referenced ranges is added
/// instead, which is what \P{...} requires.
///
/// \param receiver the set that receives the code points.
/// \param rangeArrayPool references into UNICODE_RANGE_POOL; an empty array
///   adds nothing, regardless of \p inverted.
/// \param inverted whether to add the complement of the ranges.
void addRangeArrayPoolToBracket(
    CodePointSet *receiver,
    const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool,
    bool inverted) {
  if (!inverted) {
    for (auto rangePoolRef : rangeArrayPool) {
      auto rangePool = llvh::ArrayRef<UnicodeRange>{
          &UNICODE_RANGE_POOL[rangePoolRef.offset], rangePoolRef.size};
      for (auto range : rangePool) {
        const uint32_t length = range.second - range.first + 1;
        receiver->add(CodePointRange{range.first, length});
      }
    }
    return;
  }

  // An empty array means "no property"; keep adding nothing in that case.
  if (rangeArrayPool.empty()) {
    return;
  }

  // Gather every referenced range first. Complementing each pool separately
  // and unioning the results would produce the complement of the
  // *intersection* of the pools, which is wrong whenever a property is made
  // up of more than one pool.
  std::vector<std::pair<uint32_t, uint32_t>> covered;
  for (auto rangePoolRef : rangeArrayPool) {
    auto rangePool = llvh::ArrayRef<UnicodeRange>{
        &UNICODE_RANGE_POOL[rangePoolRef.offset], rangePoolRef.size};
    for (auto range : rangePool) {
      covered.emplace_back(range.first, range.second);
    }
  }
  std::sort(covered.begin(), covered.end());

  // Sweep the sorted ranges and emit the gaps between them. `next` is the
  // lowest code point not yet known to be covered.
  uint32_t next = 0;
  for (const auto &range : covered) {
    if (next > UNICODE_MAX_VALUE) {
      break;
    }
    if (range.first > next) {
      receiver->add(CodePointRange{next, range.first - next});
    }
    // Ranges from different pools may overlap; never move `next` backwards.
    if (range.second >= next) {
      next = range.second + 1;
    }
  }

  // Add the tail up to and including UNICODE_MAX_VALUE. Skipping it when the
  // ranges already reach the maximum avoids an unsigned underflow in the
  // length computation.
  if (next <= UNICODE_MAX_VALUE) {
    receiver->add(CodePointRange{next, UNICODE_MAX_VALUE - next + 1});
  }
}

#else

/// Stub used when HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES is not
/// defined: both arguments are ignored and an empty ArrayRef is always
/// returned.
llvh::ArrayRef<UnicodeRangePoolRef> unicodePropertyRanges(
std::string_view propertyName,
std::string_view propertyValue) {
return llvh::ArrayRef<UnicodeRangePoolRef>();
}

/// Stub used when HERMES_ENABLE_UNICODE_REGEXP_PROPERTY_ESCAPES is not
/// defined: does nothing and leaves \p receiver untouched.
void addRangeArrayPoolToBracket(
CodePointSet *receiver,
const llvh::ArrayRef<UnicodeRangePoolRef> rangeArrayPool,
bool inverted) {}

#endif

} // namespace hermes
Loading