Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Formatter: escape bi-directional control characters within strings #13067

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions spec/compiler/formatter/formatter_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2608,4 +2608,34 @@ describe Crystal::Formatter do
CRYSTAL
end
end

# CVE-2021-42574
describe "Unicode bi-directional control characters" do
['\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'].each do |char|
assert_format %("#{char}"), %("#{char.unicode_escape}")
assert_format %("\\c#{char}"), %("c#{char.unicode_escape}")
assert_format %("#{char}\#{1}"), %("#{char.unicode_escape}\#{1}")
assert_format %("\\c#{char}\#{1}"), %("c#{char.unicode_escape}\#{1}")
assert_format %(%(#{char})), %(%(#{char.unicode_escape}))
assert_format %(%Q(#{char})), %(%Q(#{char.unicode_escape}))
assert_format %(%Q(#{char}\#{1})), %(%Q(#{char.unicode_escape}\#{1}))
assert_format %(<<-EOS\n#{char}\nEOS), %(<<-EOS\n#{char.unicode_escape}\nEOS)
assert_format %(<<-EOS\n#{char}\#{1}\nEOS), %(<<-EOS\n#{char.unicode_escape}\#{1}\nEOS)
assert_format %(def foo("#{char}" x)\nend), %(def foo("#{char.unicode_escape}" x)\nend)
assert_format %(foo("#{char}": 1)), %(foo("#{char.unicode_escape}": 1))
assert_format %(NamedTuple("#{char}": Int32)), %(NamedTuple("#{char.unicode_escape}": Int32))
assert_format %({"#{char}": 1}), %({"#{char.unicode_escape}": 1})

# the following contexts do not accept escape sequences, escaping these
# control characters would alter the meaning of the source code
assert_format %(/#{char}/)
assert_format %(%r(#{char}))
assert_format %(%q(#{char}))
assert_format %(%w(#{char}))
assert_format %(%i(#{char}))
assert_format %(/#{char}\#{1}/)
assert_format %(%r(#{char}\#{1}))
assert_format %(<<-'EOS'\n#{char}\nEOS)
end
end
end
15 changes: 15 additions & 0 deletions src/compiler/crystal/syntax/lexer.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2735,6 +2735,21 @@ module Crystal
end
end

private UNICODE_BIDI_CONTROL_CHARACTERS = {'\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'}

private FORBIDDEN_ESCAPES = UNICODE_BIDI_CONTROL_CHARACTERS.to_h { |ch| {ch, ch.unicode_escape} }

# used by the formatter
def self.escape_forbidden_characters(string)
string.each_char do |ch|
if ch.in?(UNICODE_BIDI_CONTROL_CHARACTERS)
return string.gsub(FORBIDDEN_ESCAPES)
end
end

string
end

def next_char_no_column_increment
char = @reader.next_char
if error = @reader.error
Expand Down
14 changes: 8 additions & 6 deletions src/compiler/crystal/tools/formatter.cr
Original file line number Diff line number Diff line change
Expand Up @@ -491,11 +491,7 @@ module Crystal
while true
case @token.type
when .string?
if @token.invalid_escape
write @token.value
else
write @token.raw
end
write_sanitized_string_body(@token.delimiter_state.allow_escapes && !is_regex)
next_string_token
when .interpolation_start?
# This is the case of #{__DIR__}
Expand Down Expand Up @@ -534,6 +530,12 @@ module Crystal
false
end

private def write_sanitized_string_body(escape)
body = @token.invalid_escape ? @token.value.as(String) : @token.raw
body = Lexer.escape_forbidden_characters(body) if escape
write body
end

def visit(node : StringInterpolation)
if @token.delimiter_state.kind.heredoc?
# For heredoc, only write the start: on a newline will print it
Expand Down Expand Up @@ -609,7 +611,7 @@ module Crystal
else
loop do
check :STRING
write @token.invalid_escape ? @token.value : @token.raw
write_sanitized_string_body(delimiter_state.allow_escapes && !is_regex)
next_string_token

# On heredoc, pieces of contents are combined due to removing indentation.
Expand Down