Skip to content

Commit

Permalink
formatter: render numeric character references like named ones (#490)
Browse files Browse the repository at this point in the history
  • Loading branch information
gibson042 authored Oct 8, 2022
1 parent 7bc7f2e commit 3a4b91d
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 3 deletions.
43 changes: 42 additions & 1 deletion src/formatter/text.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,53 @@
import { LineBuilder } from './line-builder';
const entities = require('../../entities-processed.json');

/**
 * Reports whether a code point parsed from a numeric character reference
 * falls into one of the categories rejected below.
 *
 * The categories follow
 * https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
 * except that control characters get no exemption for tab, line feed,
 * form feed, or space.
 *
 * @param codePoint - numeric value of the reference
 * @returns `true` if the code point is rejected, `false` otherwise
 */
function isBadNumericReference(codePoint: number) {
  // NULL character; NaN is falsy too, so it is rejected here as well
  if (!codePoint) {
    return true;
  }

  // past the last Unicode code point
  if (codePoint > 0x10ffff) {
    return true;
  }

  // noncharacters whose low 16 bits are FFFE or FFFF
  if ((codePoint & 0xfffe) === 0xfffe) {
    return true;
  }

  // inclusive [first, last] ranges that are always rejected
  const rejectedRanges: ReadonlyArray<readonly [number, number]> = [
    [0xd800, 0xdfff], // surrogates
    [0xfdd0, 0xfdef], // noncharacter block
    [0x00, 0x1f], // C0 control characters
    [0x7f, 0x9f], // DELETE and C1 control characters
  ];
  return rejectedRanges.some(([first, last]) => first <= codePoint && codePoint <= last);
}

export function printText(text: string, indent: number): LineBuilder {
const output: LineBuilder = new LineBuilder(indent);
if (text === '') {
return output;
}
text = text.replace(/&[a-zA-Z0-9]+;?/g, m => {
text = text.replace(/&(?:[a-zA-Z0-9]+|#[Xx]([0-9a-fA-F]+)|#([0-9]+));?/g, (m, hex, decimal) => {
if (hex || decimal) {
// pass through bad references,
// normalize '&' and '<' into named references,
// and transform everything else
const codePoint = parseInt(hex || decimal, hex ? 16 : 10);
if (isBadNumericReference(codePoint)) {
return m;
}
const ch = String.fromCodePoint(codePoint);
if (/\p{White_Space}|\p{DI}|\p{gc=M}|\p{gc=C}/u.test(ch)) {
return m;
}
if (ch === '&') {
return '&amp;';
} else if (ch === '<') {
return '&lt;';
}
return ch;
}

// entities[m] is null if the entity expands to '&', '<', or a string which has blank/control/etc characters
if ({}.hasOwnProperty.call(entities, m) && entities[m] !== null) {
return entities[m];
Expand Down
8 changes: 6 additions & 2 deletions test/formatter.js
Original file line number Diff line number Diff line change
Expand Up @@ -657,17 +657,21 @@ describe('entities', () => {
await assertDocFormatsAs(
`
<div>
some entities are transformed: &AMP; &AMP &LT &frac12; &frac12 &fjlig; &CapitalDifferentialD;
some named entities are transformed: &AMP; &AMP &LT &frac12; &frac12 &fjlig; &CapitalDifferentialD;
others are preserved: &amp; &lt; &nbsp; &nbsp &NotAnEntity;
numeric entities are transformed too: &#x26; &#38; &#x3C; &#60; &#X1d306;
unless they're whitespace etc: &#xA0;
</div>
<emu-alg>
1. This also works in algorithms, as in &laquo; 0, 1 &raquo;.
</emu-alg>
`,
dedentKeepingTrailingNewline`
<div>
some entities are transformed: &amp; &amp; &lt; ½ ½ fj ⅅ
some named entities are transformed: &amp; &amp; &lt; ½ ½ fj ⅅ
others are preserved: &amp; &lt; &nbsp; &nbsp &NotAnEntity;
numeric entities are transformed too: &amp; &amp; &lt; &lt; 𝌆
unless they're whitespace etc: &#xA0;
</div>
<emu-alg>
1. This also works in algorithms, as in « 0, 1 ».
Expand Down

0 comments on commit 3a4b91d

Please sign in to comment.