Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: Support full Unicode in lexer #3117

Merged
merged 1 commit into from
Jul 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cspell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ overrides:
- filename: '**/docs/APIReference-*.md'
ignoreRegExpList: ['/href="[^"]*"/']

ignoreRegExpList:
- u\{[0-9a-f]{1,8}\}

words:
- graphiql
- sublinks
Expand Down
255 changes: 224 additions & 31 deletions src/language/__tests__/lexer-test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ function expectSyntaxError(text: string) {
}

describe('Lexer', () => {
it('disallows uncommon control characters', () => {
expectSyntaxError('\u0007').to.deep.equal({
message: 'Syntax Error: Invalid character: U+0007.',
locations: [{ line: 1, column: 1 }],
});
});

it('ignores BOM header', () => {
expect(lexOne('\uFEFF foo')).to.contain({
kind: TokenKind.NAME,
Expand Down Expand Up @@ -269,12 +262,98 @@ describe('Lexer', () => {
value: 'slashes \\ /',
});

expect(lexOne('"unescaped unicode outside BMP \u{1f600}"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 34,
value: 'unescaped unicode outside BMP \u{1f600}',
});

expect(
lexOne('"unescaped maximal unicode outside BMP \u{10ffff}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'unescaped maximal unicode outside BMP \u{10ffff}',
});

expect(lexOne('"unicode \\u1234\\u5678\\u90AB\\uCDEF"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 34,
value: 'unicode \u1234\u5678\u90AB\uCDEF',
});

expect(lexOne('"unicode \\u{1234}\\u{5678}\\u{90AB}\\u{CDEF}"')).to.contain(
{
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'unicode \u1234\u5678\u90AB\uCDEF',
},
);

expect(
lexOne('"string with unicode escape outside BMP \\u{1F600}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 50,
value: 'string with unicode escape outside BMP \u{1f600}',
});

expect(lexOne('"string with minimal unicode escape \\u{0}"')).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 42,
value: 'string with minimal unicode escape \u{0}',
});

expect(
lexOne('"string with maximal unicode escape \\u{10FFFF}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 47,
value: 'string with maximal unicode escape \u{10FFFF}',
});

expect(
lexOne('"string with maximal minimal unicode escape \\u{00000000}"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 57,
value: 'string with maximal minimal unicode escape \u{0}',
});

expect(
lexOne('"string with unicode surrogate pair escape \\uD83D\\uDE00"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with unicode surrogate pair escape \u{1f600}',
});

expect(
lexOne('"string with minimal surrogate pair escape \\uD800\\uDC00"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with minimal surrogate pair escape \u{10000}',
});

expect(
lexOne('"string with maximal surrogate pair escape \\uDBFF\\uDFFF"'),
).to.contain({
kind: TokenKind.STRING,
start: 0,
end: 56,
value: 'string with maximal surrogate pair escape \u{10FFFF}',
});
});

it('lex reports useful string errors', () => {
Expand Down Expand Up @@ -304,16 +383,19 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('"contains unescaped \u0007 control char"').to.deep.equal(
{
message: 'Syntax Error: Invalid character within String: U+0007.',
locations: [{ line: 1, column: 21 }],
},
);
expectSyntaxError('"bad surrogate \uDEAD"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError('"bad high surrogate pair \uDEAD\uDEAD"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 26 }],
});

expectSyntaxError('"null-byte is not \u0000 end of file"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0000.',
locations: [{ line: 1, column: 19 }],
expectSyntaxError('"bad low surrogate pair \uD800\uD800"').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+D800.',
locations: [{ line: 1, column: 25 }],
});

expectSyntaxError('"multi\nline"').to.deep.equal({
Expand Down Expand Up @@ -360,6 +442,93 @@ describe('Lexer', () => {
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uXXXF".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{}".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FXXX} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FX".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FFFF esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF ".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"bad \\u{FFFF"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{FFFF"".',
locations: [{ line: 1, column: 6 }],
});

expectSyntaxError('"too high \\u{110000} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{110000}".',
locations: [{ line: 1, column: 11 }],
});

expectSyntaxError('"way too high \\u{12345678} esc"').to.deep.equal({
message:
'Syntax Error: Invalid Unicode escape sequence: "\\u{12345678}".',
locations: [{ line: 1, column: 15 }],
});

expectSyntaxError('"too long \\u{000000000} esc"').to.deep.equal({
message:
'Syntax Error: Invalid Unicode escape sequence: "\\u{000000000".',
locations: [{ line: 1, column: 11 }],
});

expectSyntaxError('"bad surrogate \\uDEAD esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError('"bad surrogate \\u{DEAD} esc"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{DEAD}".',
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError(
'"cannot use braces for surrogate pair \\u{D83D}\\u{DE00} esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\u{D83D}".',
locations: [{ line: 1, column: 39 }],
});

expectSyntaxError(
'"bad high surrogate pair \\uDEAD\\uDEAD esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uDEAD".',
locations: [{ line: 1, column: 26 }],
});

expectSyntaxError(
'"bad low surrogate pair \\uD800\\uD800 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD800".',
locations: [{ line: 1, column: 25 }],
});

expectSyntaxError(
'"cannot escape half a pair \uD83D\\uDE00 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+D83D.',
locations: [{ line: 1, column: 28 }],
});

expectSyntaxError(
'"cannot escape half a pair \\uD83D\uDE00 esc"',
).to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
locations: [{ line: 1, column: 28 }],
});

expectSyntaxError('"bad \\uD83D\\not an escape"').to.deep.equal({
message: 'Syntax Error: Invalid Unicode escape sequence: "\\uD83D".',
locations: [{ line: 1, column: 6 }],
});
});

it('lexes block strings', () => {
Expand Down Expand Up @@ -419,6 +588,13 @@ describe('Lexer', () => {
value: 'unescaped \\n\\r\\b\\t\\f\\u1234',
});

expect(lexOne('"""unescaped unicode outside BMP \u{1f600}"""')).to.contain({
kind: TokenKind.BLOCK_STRING,
start: 0,
end: 38,
value: 'unescaped unicode outside BMP \u{1f600}',
});

expect(lexOne('"""slashes \\\\ \\/"""')).to.contain({
kind: TokenKind.BLOCK_STRING,
start: 0,
Expand Down Expand Up @@ -491,18 +667,9 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 16 }],
});

expectSyntaxError(
'"""contains unescaped \u0007 control char"""',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0007.',
locations: [{ line: 1, column: 23 }],
});

expectSyntaxError(
'"""null-byte is not \u0000 end of file"""',
).to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+0000.',
locations: [{ line: 1, column: 21 }],
expectSyntaxError('"""contains invalid surrogate \uDEAD"""').to.deep.equal({
message: 'Syntax Error: Invalid character within String: U+DEAD.',
locations: [{ line: 1, column: 31 }],
});
});

Expand Down Expand Up @@ -842,6 +1009,16 @@ describe('Lexer', () => {
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\x00').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+0000.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\b').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+0008.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\u00AA').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+00AA.',
locations: [{ line: 1, column: 1 }],
Expand All @@ -856,6 +1033,16 @@ describe('Lexer', () => {
message: 'Syntax Error: Unexpected character: U+203B.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\u{1f600}').to.deep.equal({
message: 'Syntax Error: Unexpected character: U+1F600.',
locations: [{ line: 1, column: 1 }],
});

expectSyntaxError('\uDEAD').to.deep.equal({
message: 'Syntax Error: Invalid character: U+DEAD.',
locations: [{ line: 1, column: 1 }],
});
});

it('lex reports useful information for dashes in names', () => {
Expand Down Expand Up @@ -936,9 +1123,15 @@ describe('Lexer', () => {
end: 9,
value: ' Comment',
});
expectSyntaxError('# \u0007').to.deep.equal({
message: 'Syntax Error: Invalid character: U+0007.',
locations: [{ line: 1, column: 3 }],
expect(lexOne('# Comment \u{1f600}').prev).to.contain({
kind: TokenKind.COMMENT,
start: 0,
end: 12,
value: ' Comment \u{1f600}',
});
expectSyntaxError('# Invalid surrogate \uDEAD').to.deep.equal({
message: 'Syntax Error: Invalid character: U+DEAD.',
locations: [{ line: 1, column: 21 }],
});
});
});
Expand Down
Loading