Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: derive canonicalize result from UCD #107

Merged
merged 4 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@ jobs:
- name: Set up Node.js 22
uses: actions/setup-node@v3
with:
# Always build using the same Node.js version, to ensure consistent
# results from scripts/iu-mappings.js.
# This version should be the same as the one used in the
# publish-on-tag workflow.
node-version: 22
- name: Install dependencies
run: npm install
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/publish-on-tag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ jobs:
- name: Set up Node.js 22
uses: actions/setup-node@v3
with:
# Always build using the same Node.js version, to ensure consistent
# results from scripts/iu-mappings.js.
# This version should be the same as the one used in the main workflow.
node-version: 22
- name: Install dependencies
run: npm install
Expand Down
10 changes: 10 additions & 0 deletions data/i-bmp-mappings.js
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ module.exports = new Map([
[0x198, 0x199],
[0x199, 0x198],
[0x19A, 0x23D],
[0x19B, 0xA7DC],
[0x19C, 0x26F],
[0x19D, 0x272],
[0x19E, 0x220],
Expand Down Expand Up @@ -398,6 +399,7 @@ module.exports = new Map([
[0x260, 0x193],
[0x261, 0xA7AC],
[0x263, 0x194],
[0x264, 0xA7CB],
[0x265, 0xA78D],
[0x266, 0xA7AA],
[0x268, 0x197],
Expand Down Expand Up @@ -1155,6 +1157,8 @@ module.exports = new Map([
[0x1C86, 0x44A],
[0x1C87, 0x463],
[0x1C88, 0xA64B],
[0x1C89, 0x1C8A],
[0x1C8A, 0x1C89],
[0x1C90, 0x10D0],
[0x1C91, 0x10D1],
[0x1C92, 0x10D2],
Expand Down Expand Up @@ -2174,12 +2178,18 @@ module.exports = new Map([
[0xA7C8, 0xA7C7],
[0xA7C9, 0xA7CA],
[0xA7CA, 0xA7C9],
[0xA7CB, 0x264],
[0xA7CC, 0xA7CD],
[0xA7CD, 0xA7CC],
[0xA7D0, 0xA7D1],
[0xA7D1, 0xA7D0],
[0xA7D6, 0xA7D7],
[0xA7D7, 0xA7D6],
[0xA7D8, 0xA7D9],
[0xA7D9, 0xA7D8],
[0xA7DA, 0xA7DB],
[0xA7DB, 0xA7DA],
[0xA7DC, 0x19B],
Copy link
Collaborator Author

@JLHwung JLHwung Nov 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes in i-bmp-mappings.js are solely due to Unicode 16, which introduced 1C89..1C8A, A7CB..A7CD, and A7DA..A7DC: https://www.unicode.org/Public/UCD/latest/ucd/DerivedAge.txt

[0xA7F5, 0xA7F6],
[0xA7F6, 0xA7F5],
[0xAB53, 0xA7B3],
Expand Down
10 changes: 0 additions & 10 deletions data/iu-mappings.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ module.exports = new Map([
[0xDF, 0x1E9E],
[0xE5, 0x212B],
[0x17F, 0x53],
[0x19B, 0xA7DC],
[0x1C4, 0x1C5],
[0x1C5, 0x1C4],
[0x1C7, 0x1C8],
Expand All @@ -17,7 +16,6 @@ module.exports = new Map([
[0x1CB, 0x1CA],
[0x1F1, 0x1F2],
[0x1F2, 0x1F1],
[0x264, 0xA7CB],
[0x345, 0x1FBE],
[0x390, 0x1FD3],
[0x392, 0x3D0],
Expand Down Expand Up @@ -66,8 +64,6 @@ module.exports = new Map([
[0x1C86, 0x42A],
[0x1C87, 0x462],
[0x1C88, 0xA64A],
[0x1C89, 0x1C8A],
[0x1C8A, 0x1C89],
[0x1E60, 0x1E9B],
[0x1E9B, 0x1E60],
[0x1E9E, 0xDF],
Expand Down Expand Up @@ -141,12 +137,6 @@ module.exports = new Map([
0xE5
]],
[0xA64A, 0x1C88],
[0xA7CB, 0x264],
[0xA7CC, 0xA7CD],
[0xA7CD, 0xA7CC],
[0xA7DA, 0xA7DB],
[0xA7DB, 0xA7DA],
[0xA7DC, 0x19B],
[0xFB05, 0xFB06],
[0xFB06, 0xFB05],
[0x10400, 0x10428],
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,6 @@
},
"devDependencies": {
"jsesc": "^3.0.2",
"@unicode/unicode-16.0.0": "^1.6.0"
"@unicode/unicode-16.0.0": "^1.6.2"
}
}
36 changes: 31 additions & 5 deletions scripts/case-mappings.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,36 @@ const flattenMapping = (mapping, extendFilter) => {
return result;
};

const simpleUppercaseMapping = require('@unicode/unicode-16.0.0/Simple_Case_Mapping/Uppercase/symbols.js');
const specialUppercaseMapping = require('@unicode/unicode-16.0.0/Special_Casing/Uppercase/symbols.js');

const characterToUppercase = (character) => {
	// Full (special) casing takes precedence over simple casing; a character
	// with no entry in either table maps to itself.
	//
	// Note: the spec's locale-insensitive full case conversion also involves the
	// Final_Sigma casing context (available under
	// ./Special_Casing/Uppercase--Final_Sigma/). That data is deliberately left
	// out here, because Final_Sigma should not be activated when the string
	// consists of a single character.
	const fullMapping = specialUppercaseMapping.get(character);
	if (fullMapping !== undefined && fullMapping !== null) {
		return fullMapping;
	}
	const simpleMapping = simpleUppercaseMapping.get(character);
	if (simpleMapping !== undefined && simpleMapping !== null) {
		return simpleMapping;
	}
	return character;
};

// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
// Implements the branch of the Canonicalize abstract operation that applies
// when HasEitherUnicodeFlag is false and rer.[[IgnoreCase]] is true.
// Takes a numeric code point and returns the numeric canonical value.
const canonicalize = (codepoint) => {
	const character = String.fromCodePoint(codepoint);
	const u = characterToUppercase(character);
	// If the uppercase form is not exactly one UTF-16 code unit long, the
	// input is returned unchanged.
	if (u.length !== 1) {
		return codepoint;
	}
	const cu = u.codePointAt(0);
	// Per the spec, a value >= 128 must not canonicalize to a value < 128.
	// The boundary is 128 (0x80), not 127.
	if (codepoint >= 0x80 && cu < 0x80) {
		return codepoint;
	}
	return cu;
};

// From <http://unicode.org/Public/UCD/latest/ucd/CaseFolding.txt>:
//
// The status field is:
Expand Down Expand Up @@ -161,11 +191,7 @@ for (const [from, to] of oneWayMappings) {
extend(filteredMappings, from, to);
} else {
// https://mths.be/es6#sec-runtime-semantics-canonicalize-abstract-operation
if(
// TODO: Make this not depend on the engine in which this build script
// runs. (If V8 has a bug, then the generated data has the same bug.)
!RegExp(String.fromCodePoint(from), 'i').test(String.fromCodePoint(to))
) {
if (canonicalize(from) !== canonicalize(to)) {
extend(filteredMappings, from, to);
} else if (from > 0x80 || to > 0x80) {
extend(filteredBMPMappings, from, to);
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/modifiers.js
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ const modifiersFixtures = [
'matches': ['k', 'K', '\u{212A}', '\u{0131}'],
'nonMatches': ['0', ',']
},
{
// Unicode 16
'pattern': '(?i:\u1C89)',
'expected': '(?:[\\u1C89\\u1C8A])'
},
// +m
{
'pattern': '(?m:^[a-z])',
Expand Down
Loading