Skip to content

Commit

Permalink
Change output normalization logic to be linear against size of output
Browse files Browse the repository at this point in the history
Scan strings to be normalized for printing in a linear scan and collect
the resulting `String` only once.

Use a binary search when looking for chars to be replaced, instead of a
`HashMap::get`.
  • Loading branch information
estebank committed Jul 29, 2024
1 parent 4db3d12 commit 6d9ee6e
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 46 deletions.
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3867,6 +3867,7 @@ version = "0.0.0"
dependencies = [
"annotate-snippets 0.10.2",
"derive_setters",
"either",
"rustc_ast",
"rustc_ast_pretty",
"rustc_data_structures",
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_errors/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
# tidy-alphabetical-start
annotate-snippets = "0.10"
derive_setters = "0.1.6"
either = "1.5.0"
rustc_ast = { path = "../rustc_ast" }
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
rustc_data_structures = { path = "../rustc_data_structures" }
Expand Down
98 changes: 52 additions & 46 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use std::iter;
use std::path::Path;

use derive_setters::Setters;
use either::Either;
use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet};
use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc};
use rustc_error_messages::{FluentArgs, SpanLabel};
Expand Down Expand Up @@ -2559,60 +2560,65 @@ fn num_decimal_digits(num: usize) -> usize {

// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
// ATTENTION: keep lexicografically sorted so that the binary search will work
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\t', " "), // We do our own tab replacement
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\0', "␀"),
('\u{1}', "␁"),
('\u{2}', "␂"),
('\u{3}', "␃"),
('\u{4}', "␄"),
('\u{5}', "␅"),
('\u{6}', "␆"),
('\u{7}', "␇"),
('\u{8}', "␈"),
('\t', " "), // We do our own tab replacement
('\u{b}', "␋"),
('\u{c}', "␌"),
('\r', "␍"),
('\u{e}', "␎"),
('\u{f}', "␏"),
('\u{10}', "␐"),
('\u{11}', "␑"),
('\u{12}', "␒"),
('\u{13}', "␓"),
('\u{14}', "␔"),
('\u{15}', "␕"),
('\u{16}', "␖"),
('\u{17}', "␗"),
('\u{18}', "␘"),
('\u{19}', "␙"),
('\u{1a}', "␚"),
('\u{1b}', "␛"),
('\u{1c}', "␜"),
('\u{1d}', "␝"),
('\u{1e}', "␞"),
('\u{1f}', "␟"),
('\u{7f}', "␡"),
('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202d}', "�"),
('\u{202e}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{202C}', "�"),
('\u{2069}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\u{0000}', "␀"),
('\u{0001}', "␁"),
('\u{0002}', "␂"),
('\u{0003}', "␃"),
('\u{0004}', "␄"),
('\u{0005}', "␅"),
('\u{0006}', "␆"),
('\u{0007}', "␇"),
('\u{0008}', "␈"),
('\u{000B}', "␋"),
('\u{000C}', "␌"),
('\u{000D}', "␍"),
('\u{000E}', "␎"),
('\u{000F}', "␏"),
('\u{0010}', "␐"),
('\u{0011}', "␑"),
('\u{0012}', "␒"),
('\u{0013}', "␓"),
('\u{0014}', "␔"),
('\u{0015}', "␕"),
('\u{0016}', "␖"),
('\u{0017}', "␗"),
('\u{0018}', "␘"),
('\u{0019}', "␙"),
('\u{001A}', "␚"),
('\u{001B}', "␛"),
('\u{001C}', "␜"),
('\u{001D}', "␝"),
('\u{001E}', "␞"),
('\u{001F}', "␟"),
('\u{007F}', "␡"),
];

fn normalize_whitespace(str: &str) -> String {
let mut s = str.to_string();
for (c, replacement) in OUTPUT_REPLACEMENTS {
s = s.replace(*c, replacement);
}
s
// Scan the input string for a character in the ordered table above. If it's present, replace
// it with it's alternative string (it can be more than 1 char!). Otherwise, retain the input
// char. At the end, allocate all chars into a string in one operation.
str.chars()
.flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()),
_ => Either::Right([c].into_iter()),
})
.collect()
}

fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {
Expand Down

0 comments on commit 6d9ee6e

Please sign in to comment.