Skip to content

Commit

Permalink
fix: derive canonicalize result from UCD (#107)
Browse files Browse the repository at this point in the history
This ensures that the case mappings in data always conform to the latest Unicode version.

Co-authored-by: Mathias Bynens <mathias@qiwi.be>
  • Loading branch information
JLHwung and mathiasbynens authored Nov 4, 2024
1 parent f2f88e6 commit c9db4c2
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 23 deletions.
4 changes: 0 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,6 @@ jobs:
- name: Set up Node.js 22
uses: actions/setup-node@v3
with:
# Always build using the same Node.js version, to ensure consistent
# results from scripts/iu-mappings.js.
# This version should be the same as the one used in the
# publish-on-tag workflow.
node-version: 22
- name: Install dependencies
run: npm install
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/publish-on-tag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ jobs:
- name: Set up Node.js 22
uses: actions/setup-node@v3
with:
# Always build using the same Node.js version, to ensure consistent
# results from scripts/iu-mappings.js.
# This version should be the same as the one used in the main workflow.
node-version: 22
- name: Install dependencies
run: npm install
Expand Down
10 changes: 10 additions & 0 deletions data/i-bmp-mappings.js
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ module.exports = new Map([
[0x198, 0x199],
[0x199, 0x198],
[0x19A, 0x23D],
[0x19B, 0xA7DC],
[0x19C, 0x26F],
[0x19D, 0x272],
[0x19E, 0x220],
Expand Down Expand Up @@ -398,6 +399,7 @@ module.exports = new Map([
[0x260, 0x193],
[0x261, 0xA7AC],
[0x263, 0x194],
[0x264, 0xA7CB],
[0x265, 0xA78D],
[0x266, 0xA7AA],
[0x268, 0x197],
Expand Down Expand Up @@ -1155,6 +1157,8 @@ module.exports = new Map([
[0x1C86, 0x44A],
[0x1C87, 0x463],
[0x1C88, 0xA64B],
[0x1C89, 0x1C8A],
[0x1C8A, 0x1C89],
[0x1C90, 0x10D0],
[0x1C91, 0x10D1],
[0x1C92, 0x10D2],
Expand Down Expand Up @@ -2174,12 +2178,18 @@ module.exports = new Map([
[0xA7C8, 0xA7C7],
[0xA7C9, 0xA7CA],
[0xA7CA, 0xA7C9],
[0xA7CB, 0x264],
[0xA7CC, 0xA7CD],
[0xA7CD, 0xA7CC],
[0xA7D0, 0xA7D1],
[0xA7D1, 0xA7D0],
[0xA7D6, 0xA7D7],
[0xA7D7, 0xA7D6],
[0xA7D8, 0xA7D9],
[0xA7D9, 0xA7D8],
[0xA7DA, 0xA7DB],
[0xA7DB, 0xA7DA],
[0xA7DC, 0x19B],
[0xA7F5, 0xA7F6],
[0xA7F6, 0xA7F5],
[0xAB53, 0xA7B3],
Expand Down
10 changes: 0 additions & 10 deletions data/iu-mappings.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ module.exports = new Map([
[0xDF, 0x1E9E],
[0xE5, 0x212B],
[0x17F, 0x53],
[0x19B, 0xA7DC],
[0x1C4, 0x1C5],
[0x1C5, 0x1C4],
[0x1C7, 0x1C8],
Expand All @@ -17,7 +16,6 @@ module.exports = new Map([
[0x1CB, 0x1CA],
[0x1F1, 0x1F2],
[0x1F2, 0x1F1],
[0x264, 0xA7CB],
[0x345, 0x1FBE],
[0x390, 0x1FD3],
[0x392, 0x3D0],
Expand Down Expand Up @@ -66,8 +64,6 @@ module.exports = new Map([
[0x1C86, 0x42A],
[0x1C87, 0x462],
[0x1C88, 0xA64A],
[0x1C89, 0x1C8A],
[0x1C8A, 0x1C89],
[0x1E60, 0x1E9B],
[0x1E9B, 0x1E60],
[0x1E9E, 0xDF],
Expand Down Expand Up @@ -141,12 +137,6 @@ module.exports = new Map([
0xE5
]],
[0xA64A, 0x1C88],
[0xA7CB, 0x264],
[0xA7CC, 0xA7CD],
[0xA7CD, 0xA7CC],
[0xA7DA, 0xA7DB],
[0xA7DB, 0xA7DA],
[0xA7DC, 0x19B],
[0xFB05, 0xFB06],
[0xFB06, 0xFB05],
[0x10400, 0x10428],
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,6 @@
},
"devDependencies": {
"jsesc": "^3.0.2",
"@unicode/unicode-16.0.0": "^1.6.0"
"@unicode/unicode-16.0.0": "^1.6.2"
}
}
36 changes: 31 additions & 5 deletions scripts/case-mappings.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,36 @@ const flattenMapping = (mapping, extendFilter) => {
return result;
};

const simpleUppercaseMapping = require('@unicode/unicode-16.0.0/Simple_Case_Mapping/Uppercase/symbols.js');
const specialUppercaseMapping = require('@unicode/unicode-16.0.0/Special_Casing/Uppercase/symbols.js');

/**
 * Looks up the uppercase form of a single character in the Unicode data
 * tables, preferring the Special_Casing entry over the Simple_Case_Mapping
 * entry.
 *
 * @param {string} character - The character to convert.
 * @returns {string} The mapped uppercase string, or `character` itself when
 *   neither table has an entry for it.
 */
const characterToUppercase = (character) => {
	// Note: the spec requires the Final_Sigma casing context data (available
	// from ./Special_Casing/Uppercase--Final_Sigma/) for a locale-insensitive
	// full case conversion. It is intentionally skipped here, because
	// Final_Sigma should not be activated when the string holds only one
	// character.
	const tablesByPriority = [specialUppercaseMapping, simpleUppercaseMapping];
	for (const table of tablesByPriority) {
		const mapped = table.get(character);
		// Only a missing entry falls through to the next table.
		if (mapped != null) {
			return mapped;
		}
	}
	return character;
};

/**
 * Canonicalizes a code point the way a case-insensitive regular expression
 * without the `u`/`v` flags does, i.e. the branch of the spec's Canonicalize
 * operation taken when HasEitherUnicodeFlag is false and rer.[[IgnoreCase]]
 * is true.
 *
 * https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch
 *
 * @param {number} codepoint - The code point to canonicalize.
 * @returns {number} The code point of the uppercase form, or `codepoint`
 *   unchanged when the uppercase form is not exactly one UTF-16 code unit or
 *   when it would turn a non-ASCII character into an ASCII one.
 */
const canonicalize = (codepoint) => {
	const character = String.fromCodePoint(codepoint);
	const u = characterToUppercase(character);
	// Uppercase forms that are not exactly one UTF-16 code unit long are
	// ignored and the input is returned as is.
	if (u.length !== 1) {
		return codepoint;
	}
	const cu = u.codePointAt(0);
	// The spec's guard is "ch >= 128 and cu < 128": a non-ASCII character
	// must never canonicalize to an ASCII one. 128 is 0x80, not 0x7F.
	if (codepoint >= 0x80 && cu < 0x80) {
		return codepoint;
	}
	return cu;
};

// From <http://unicode.org/Public/UCD/latest/ucd/CaseFolding.txt>:
//
// The status field is:
Expand Down Expand Up @@ -161,11 +191,7 @@ for (const [from, to] of oneWayMappings) {
extend(filteredMappings, from, to);
} else {
// https://mths.be/es6#sec-runtime-semantics-canonicalize-abstract-operation
if(
// TODO: Make this not depend on the engine in which this build script
// runs. (If V8 has a bug, then the generated data has the same bug.)
!RegExp(String.fromCodePoint(from), 'i').test(String.fromCodePoint(to))
) {
if (canonicalize(from) !== canonicalize(to)) {
extend(filteredMappings, from, to);
} else if (from > 0x80 || to > 0x80) {
extend(filteredBMPMappings, from, to);
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/modifiers.js
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ const modifiersFixtures = [
'matches': ['k', 'K', '\u{212A}', '\u{0131}'],
'nonMatches': ['0', ',']
},
{
// Unicode 16
'pattern': '(?i:\u1C89)',
'expected': '(?:[\\u1C89\\u1C8A])'
},
// +m
{
'pattern': '(?m:^[a-z])',
Expand Down

0 comments on commit c9db4c2

Please sign in to comment.