Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scanner: implement UTF-32 escape code #19911

Merged
merged 2 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u16_err_a.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u16_err_a.vv:2:15: error: `\u` incomplete 16 bit unicode character value
1 | fn main() {
2 | println('\u')
| ^
3 | }
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u16_err_b.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u16_err_b.vv:2:15: error: `\u` incomplete 16 bit unicode character value
1 | fn main() {
2 | println('\u345')
| ^
3 | }
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_a.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u32_err_a.vv:2:15: error: `\U` incomplete 32 bit unicode character value
1 | fn main() {
2 | println('\U')
| ^
3 | }
3 changes: 3 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_a.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
fn main() {
println('\U')
}
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_b.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u32_err_b.vv:2:15: error: `\U` incomplete 32 bit unicode character value
1 | fn main() {
2 | println('\U345')
| ^
3 | }
3 changes: 3 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_b.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
fn main() {
println('\U345')
}
5 changes: 0 additions & 5 deletions vlib/v/checker/tests/string_escape_u_err_a.out

This file was deleted.

5 changes: 0 additions & 5 deletions vlib/v/checker/tests/string_escape_u_err_b.out

This file was deleted.

7 changes: 5 additions & 2 deletions vlib/v/gen/native/tests/string.vv
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ fn test_escape_codes() {
println(star1)
star2 := '\u2605'
println(star2)
star3 := '\U00002605'
println(star3)

aaa := '\x61\141a'
println(aaa)
Expand All @@ -33,13 +35,14 @@ fn test_runes() {

// should all print `★`
print(`\u2605`)
print(`\U00002605`)
print(`\xe2\x98\x85`)
println(`\xe2\x98\x85`)
println(`\xe2\x98\x85`)
}

fn main() {
test_unicode_characters()
test_escape_codes()
test_raw_string()
test_runes()
}
}
3 changes: 2 additions & 1 deletion vlib/v/gen/native/tests/string.vv.out
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
😀😆😎💻🌎
aaa
## #
### #
hello\tworld\n
V
😀
🚀
★★★
★★★
98 changes: 75 additions & 23 deletions vlib/v/scanner/scanner.v
Original file line number Diff line number Diff line change
Expand Up @@ -1216,7 +1216,8 @@ fn (mut s Scanner) ident_string() string {
s.inc_line_number()
}
s.is_inside_string = false
mut u_escapes_pos := []int{} // pos list of \uXXXX
mut u16_escapes_pos := []int{} // pos list of \uXXXX
mut u32_escapes_pos := []int{} // pos list of \UXXXXXXXX
mut h_escapes_pos := []int{} // pos list of \xXX
mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
for {
Expand Down Expand Up @@ -1247,7 +1248,7 @@ fn (mut s Scanner) ident_string() string {
if c == scanner.b_lf {
s.inc_line_number()
}
// Escape `\x` `\u`
// Escape `\x` `\u` `\U`
if backslash_count % 2 == 1 && !is_raw && !is_cstr {
// Escape `\x`
if c == `x` {
Expand All @@ -1263,9 +1264,23 @@ fn (mut s Scanner) ident_string() string {
|| s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote
|| !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit()
|| !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit() {
s.error(r'`\u` incomplete unicode character value')
s.error(r'`\u` incomplete 16 bit unicode character value')
}
u_escapes_pos << s.pos - 1
u16_escapes_pos << s.pos - 1
}
// Escape `\U`
if c == `U` {
if s.text[s.pos + 1] == s.quote || s.text[s.pos + 2] == s.quote
|| s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote
|| s.text[s.pos + 5] == s.quote || s.text[s.pos + 6] == s.quote
|| s.text[s.pos + 7] == s.quote || s.text[s.pos + 8] == s.quote
|| !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit()
|| !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit()
|| !s.text[s.pos + 5].is_hex_digit() || !s.text[s.pos + 6].is_hex_digit()
|| !s.text[s.pos + 7].is_hex_digit() || !s.text[s.pos + 8].is_hex_digit() {
s.error(r'`\U` incomplete 32 bit unicode character value')
}
u32_escapes_pos << s.pos - 1
}
// Unknown escape sequence
if !is_escape_sequence(c) && !c.is_digit() {
Expand Down Expand Up @@ -1307,19 +1322,26 @@ fn (mut s Scanner) ident_string() string {
if !s.is_fmt {
mut segment_idx := 0
mut str_segments := []string{}
if u_escapes_pos.len + h_escapes_pos.len > 0 {
if u16_escapes_pos.len + h_escapes_pos.len + u32_escapes_pos.len > 0 {
mut all_pos := []int{}
all_pos << u_escapes_pos
all_pos << u16_escapes_pos
all_pos << u32_escapes_pos
all_pos << h_escapes_pos
if u_escapes_pos.len != 0 && h_escapes_pos.len != 0 {
all_pos.sort()
}
all_pos.sort()

for pos in all_pos {
str_segments << string_so_far[segment_idx..(pos - start)]
segment_idx = pos - start

if pos in u_escapes_pos {
end_idx, segment := s.decode_u_escape_single(string_so_far, segment_idx)
if pos in u16_escapes_pos {
end_idx, segment := s.decode_u16_escape_single(string_so_far,
segment_idx)
str_segments << segment
segment_idx = end_idx
}
if pos in u32_escapes_pos {
end_idx, segment := s.decode_u32_escape_single(string_so_far,
segment_idx)
str_segments << segment
segment_idx = end_idx
}
Expand Down Expand Up @@ -1407,7 +1429,7 @@ fn (mut s Scanner) decode_o_escapes(sinput string, start int, escapes_pos []int)
return ss.join('')
}

fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) {
fn (mut s Scanner) decode_u16_escape_single(str string, idx int) (int, string) {
end_idx := idx + 6 // "\uXXXX".len == 6
escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
// Check if Escaped Code Point is invalid or not
Expand All @@ -1418,9 +1440,32 @@ fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) {
return end_idx, utf32_to_str(u32(escaped_code_point))
}

// decode a single unicode escaped rune into its utf-8 bytes
fn (mut s Scanner) decode_uerune(str string) string {
end_idx, segment := s.decode_u_escape_single(str, 0)
// decode_u16erune turns the `\uXXXX` escape located at the very start of `str`
// into its UTF-8 text, and hands back that text followed by any part of `str`
// that trails the escape.
fn (mut s Scanner) decode_u16erune(str string) string {
	consumed, decoded := s.decode_u16_escape_single(str, 0)
	if consumed != str.len {
		// keep whatever follows the escape verbatim
		return decoded + str[consumed..]
	}
	// the escape spans the whole input, so there is nothing to append
	return decoded
}

// decode_u32_escape_single decodes the single `\UXXXXXXXX` escape that starts at
// byte offset `idx` of `str` into its UTF-8 encoding.
// It returns the offset just past the escape, together with the decoded text.
// When fewer than 10 bytes remain in `str` from `idx` on, a scanner error is
// reported and `idx` plus an empty string are returned (nothing is consumed),
// instead of slicing past the end of `str`.
// A value that is not a valid code point is reported as a scanner error too.
fn (mut s Scanner) decode_u32_escape_single(str string, idx int) (int, string) {
	end_idx := idx + 10 // "\UXXXXXXXX".len == 10
	if end_idx > str.len {
		// e.g. a rune literal such as `\U2605` gets here through decode_u32erune,
		// which passes the literal on without checking its length
		s.error(r'`\U` incomplete 32 bit unicode character value')
		return idx, ''
	}
	// the 8 digits are parsed as base 16 into at most 32 bits; a parse failure yields 0
	escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
	// Check if Escaped Code Point is invalid or not
	if rune(escaped_code_point).length_in_bytes() == -1 {
		s.error('invalid unicode point `${str}`')
	}

	return end_idx, utf32_to_str(u32(escaped_code_point))
}

// decode a single 32 bit unicode escaped rune into its utf-8 bytes
fn (mut s Scanner) decode_u32erune(str string) string {
end_idx, segment := s.decode_u32_escape_single(str, 0)
if str.len == end_idx {
return segment
}
Expand Down Expand Up @@ -1448,7 +1493,7 @@ fn trim_slash_line_break(s string) string {
@[inline]
fn is_escape_sequence(c u8) bool {
return c in [`x`, `u`, `e`, `n`, `r`, `t`, `v`, `a`, `f`, `b`, `\\`, `\``, `$`, `@`, `?`, `{`,
`}`, `'`, `"`]
`}`, `'`, `"`, `U`]
}

/// ident_char is called when a backtick "single-char" is parsed from the code
Expand All @@ -1460,6 +1505,7 @@ fn is_escape_sequence(c u8) bool {
/// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
/// escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a'
/// escaped unicode literals like `\u2605`
/// escaped unicode 32 literals like `\U00002605`
/// escaped utf8 runes in hex like `\xe2\x98\x85` => (★)
/// escaped utf8 runes in octal like `\342\230\205` => (★)
fn (mut s Scanner) ident_char() string {
Expand All @@ -1475,8 +1521,10 @@ fn (mut s Scanner) ident_char() string {

// set flags for advanced escapes first
escaped_hex := s.expect('\\x', start + 1)
escaped_unicode := s.expect('\\u', start + 1)
escaped_octal := !escaped_hex && !escaped_unicode && s.expect('\\', start + 1)
escaped_unicode_16 := s.expect('\\u', start + 1)
escaped_unicode_32 := s.expect('\\U', start + 1)
escaped_octal := !escaped_hex && !escaped_unicode_16 && !escaped_unicode_32
&& s.expect('\\', start + 1)

// walk the string to get characters up to the next backtick
for {
Expand Down Expand Up @@ -1505,13 +1553,17 @@ fn (mut s Scanner) ident_char() string {
// the string inside the backticks is longer than one character
// but we might only have one rune... attempt to decode escapes
// if the content expresses an escape code, it will have an even number of characters
// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605
// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605
// we don't handle binary escape codes in rune literals
orig := c
if c.len % 2 == 0 && (escaped_hex || escaped_unicode || escaped_octal) {
if escaped_unicode {
if c.len % 2 == 0
&& (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) {
if escaped_unicode_16 {
// there can only be one, so attempt to decode it now
c = s.decode_u16erune(c)
} else if escaped_unicode_32 {
// there can only be one, so attempt to decode it now
c = s.decode_uerune(c)
c = s.decode_u32erune(c)
} else {
// find escape sequence start positions
mut escapes_pos := []int{}
Expand All @@ -1530,7 +1582,7 @@ fn (mut s Scanner) ident_char() string {

u := c.runes()
if u.len != 1 {
if escaped_hex || escaped_unicode {
if escaped_hex || escaped_unicode_16 || escaped_unicode_32 {
s.error_with_pos('invalid character literal `${orig}` => `${c}` (${u}) (escape sequence did not refer to a singular rune)',
lspos)
} else if u.len == 0 {
Expand Down
55 changes: 51 additions & 4 deletions vlib/v/scanner/scanner_test.v
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ fn test_escape_rune() {
// will not work until v compiler on github is updated
// assert `\x61` == `a`
// assert `\u0061` == `a`
// assert `\U00000061` == `a`

// will not work until PR is accepted
// assert `\141` == `a`
Expand All @@ -180,11 +181,16 @@ fn test_escape_rune() {
assert result[0].kind == .chartoken
assert result[0].lit == r'\\'

// SINGLE CHAR UNICODE ESCAPE
// SINGLE CHAR 16-bit UNICODE ESCAPE
result = scan_tokens(r'`\u2605`')
assert result[0].kind == .chartoken
assert result[0].lit == r'★'

// SINGLE CHAR 32-bit UNICODE ESCAPE
result = scan_tokens(r'`\U00002605`')
assert result[0].kind == .chartoken
assert result[0].lit == r'★'

// SINGLE CHAR ESCAPED ASCII
result = scan_tokens(r'`\x61`')
assert result[0].kind == .chartoken
Expand All @@ -207,6 +213,7 @@ fn test_escape_string() {
assert '\x61' == 'a'
assert '\x62' == 'b'
assert '\u0061' == 'a'
assert '\U00000061' == 'a'
assert '\141' == 'a'
assert '\xe2\x98\x85' == '★'
assert '\342\230\205' == '★'
Expand All @@ -230,14 +237,22 @@ fn test_escape_string() {
assert result[0].kind == .string
assert result[0].lit == r'\\'

// STRING UNICODE ESCAPE
// STRING 16-bit UNICODE ESCAPE
result = scan_tokens(r"'\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'★'
result = scan_tokens(r"'H\u2605H'")
assert result[0].kind == .string
assert result[0].lit == r'H★H'

// STRING 32-bit UNICODE ESCAPE
result = scan_tokens(r"'\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'★'
result = scan_tokens(r"'H\U00002605H'")
assert result[0].kind == .string
assert result[0].lit == r'H★H'

// STRING ESCAPED ASCII
result = scan_tokens(r"'\x61'")
assert result[0].kind == .string
Expand All @@ -249,22 +264,54 @@ fn test_escape_string() {
assert result[0].kind == .string
assert result[0].lit.bytes() == [u8(0xe2), `9`, `8`, `8`, `5`]

// MIX STRING ESCAPES
// MIX STRING ESCAPES with UTF-16 escapes
result = scan_tokens(r"'\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'a★'
result = scan_tokens(r"'\u2605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'★a'

// MIX STRING ESCAPES with offset
// MIX STRING ESCAPES with UTF-16 escapes with offset
result = scan_tokens(r"'x \x61\u2605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a'
result = scan_tokens(r"'x \u2605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'x ★a★'

// MIX STRING ESCAPES with UTF-32 escapes
result = scan_tokens(r"'\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'a★'
result = scan_tokens(r"'\U00002605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'★a'

// MIX STRING ESCAPES with UTF-32 escapes with offset
result = scan_tokens(r"'x \x61\U00002605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a'
result = scan_tokens(r"'x \U00002605\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'x ★a★'

// MIX STRING ESCAPES with UTF-16 and UTF-32 escapes
result = scan_tokens(r"'\u2605\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'★a★'
result = scan_tokens(r"'\U00002605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'★a★'

// MIX STRING ESCAPES with UTF-16 and UTF-32 escapes with offset
result = scan_tokens(r"'x \x61\U00002605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a★'
result = scan_tokens(r"'x \x61\u2605\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a★'

// SHOULD RESULT IN ERRORS
// result = scan_tokens(r'`\x61\x61`') // should always result in an error
// result = scan_tokens(r"'\x'") // should always result in an error
Expand Down
4 changes: 4 additions & 0 deletions vlib/v/scanner/tests/invalid_unicode_16_err.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
vlib/v/scanner/tests/invalid_unicode_16_err.vv:1:13: error: invalid unicode point `\uD8FF`
1 | a := '\uD8FF'
| ^
2 | println(a)
4 changes: 4 additions & 0 deletions vlib/v/scanner/tests/invalid_unicode_32_err.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
vlib/v/scanner/tests/invalid_unicode_32_err.vv:1:17: error: invalid unicode point `\U0000D8FF`
1 | a := '\U0000D8FF'
| ^
2 | println(a)
2 changes: 2 additions & 0 deletions vlib/v/scanner/tests/invalid_unicode_32_err.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a := '\U0000D8FF'
println(a)
Loading
Loading