Skip to content

Commit

Permalink
Auto merge of rust-lang#128200 - estebank:normalize-whitespace, r=<try>
Browse files Browse the repository at this point in the history
Change output normalization logic to be linear against size of output

I believe the previous code was accidentally quadratic. Let's perf it.
  • Loading branch information
bors committed Jul 26, 2024
2 parents 48bbe12 + e35d147 commit c7620ca
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 46 deletions.
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3873,6 +3873,7 @@ version = "0.0.0"
dependencies = [
"annotate-snippets 0.10.2",
"derive_setters",
"either",
"rustc_ast",
"rustc_ast_pretty",
"rustc_data_structures",
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_errors/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
# tidy-alphabetical-start
annotate-snippets = "0.10"
derive_setters = "0.1.6"
either = "1.5.0"
rustc_ast = { path = "../rustc_ast" }
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
rustc_data_structures = { path = "../rustc_data_structures" }
Expand Down
98 changes: 52 additions & 46 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use crate::{
SuggestionStyle, TerminalUrl,
};
use derive_setters::Setters;
use either::Either;
use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet};
use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc};
use rustc_error_messages::{FluentArgs, SpanLabel};
Expand Down Expand Up @@ -2559,60 +2560,65 @@ fn num_decimal_digits(num: usize) -> usize {

// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
// ATTENTION: keep lexicographically sorted so that the binary search will work
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\t', " "), // We do our own tab replacement
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\0', "␀"),
('\u{1}', "␁"),
('\u{2}', "␂"),
('\u{3}', "␃"),
('\u{4}', "␄"),
('\u{5}', "␅"),
('\u{6}', "␆"),
('\u{7}', "␇"),
('\u{8}', "␈"),
('\t', " "), // We do our own tab replacement
('\u{b}', "␋"),
('\u{c}', "␌"),
('\r', "␍"),
('\u{e}', "␎"),
('\u{f}', "␏"),
('\u{10}', "␐"),
('\u{11}', "␑"),
('\u{12}', "␒"),
('\u{13}', "␓"),
('\u{14}', "␔"),
('\u{15}', "␕"),
('\u{16}', "␖"),
('\u{17}', "␗"),
('\u{18}', "␘"),
('\u{19}', "␙"),
('\u{1a}', "␚"),
('\u{1b}', "␛"),
('\u{1c}', "␜"),
('\u{1d}', "␝"),
('\u{1e}', "␞"),
('\u{1f}', "␟"),
('\u{7f}', "␡"),
('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202d}', "�"),
('\u{202e}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{202C}', "�"),
('\u{2069}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\u{0000}', "␀"),
('\u{0001}', "␁"),
('\u{0002}', "␂"),
('\u{0003}', "␃"),
('\u{0004}', "␄"),
('\u{0005}', "␅"),
('\u{0006}', "␆"),
('\u{0007}', "␇"),
('\u{0008}', "␈"),
('\u{000B}', "␋"),
('\u{000C}', "␌"),
('\u{000D}', "␍"),
('\u{000E}', "␎"),
('\u{000F}', "␏"),
('\u{0010}', "␐"),
('\u{0011}', "␑"),
('\u{0012}', "␒"),
('\u{0013}', "␓"),
('\u{0014}', "␔"),
('\u{0015}', "␕"),
('\u{0016}', "␖"),
('\u{0017}', "␗"),
('\u{0018}', "␘"),
('\u{0019}', "␙"),
('\u{001A}', "␚"),
('\u{001B}', "␛"),
('\u{001C}', "␜"),
('\u{001D}', "␝"),
('\u{001E}', "␞"),
('\u{001F}', "␟"),
('\u{007F}', "␡"),
];

fn normalize_whitespace(str: &str) -> String {
let mut s = str.to_string();
for (c, replacement) in OUTPUT_REPLACEMENTS {
s = s.replace(*c, replacement);
}
s
// Scan the input string for a character in the ordered table above. If it's present, replace
// it with its alternative string (it can be more than 1 char!). Otherwise, retain the input
// char. At the end, allocate all chars into a string in one operation.
str.chars()
.flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()),
_ => Either::Right([c].into_iter()),
})
.collect()
}

fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {
Expand Down

0 comments on commit c7620ca

Please sign in to comment.