Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace ASCII control chars with Unicode Control Pictures #127528

Merged
merged 4 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4008,7 +4008,6 @@ dependencies = [
"termcolor",
"termize",
"tracing",
"unicode-width",
"windows",
]

Expand Down
1 change: 0 additions & 1 deletion compiler/rustc_errors/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ serde_json = "1.0.59"
termcolor = "1.2.0"
termize = "0.1.1"
tracing = "0.1"
unicode-width = "0.1.4"
# tidy-alphabetical-end

[target.'cfg(windows)'.dependencies.windows]
Expand Down
77 changes: 51 additions & 26 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
//! The output types are defined in `rustc_session::config::ErrorOutputType`.

use rustc_span::source_map::SourceMap;
use rustc_span::{FileLines, FileName, SourceFile, Span};
use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};

use crate::snippet::{
Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
Expand Down Expand Up @@ -677,10 +677,7 @@ impl HumanEmitter {
.skip(left)
.take_while(|ch| {
// Make sure that the trimming on the right will fall within the terminal width.
// FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
// is. For now, just accept that sometimes the code line will be longer than
// desired.
let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
let next = char_width(*ch);
if taken + next > right - left {
return false;
}
Expand Down Expand Up @@ -742,11 +739,7 @@ impl HumanEmitter {
let left = margin.left(source_string.len());

// Account for unicode characters of width !=0 that were removed.
let left = source_string
.chars()
.take(left)
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.sum();
let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();

self.draw_line(
buffer,
Expand Down Expand Up @@ -2039,7 +2032,7 @@ impl HumanEmitter {
let sub_len: usize =
if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
.chars()
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.map(|ch| char_width(ch))
.sum();

let offset: isize = offsets
Expand Down Expand Up @@ -2076,11 +2069,8 @@ impl HumanEmitter {
}

// length of the code after substitution
let full_sub_len = part
.snippet
.chars()
.map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
.sum::<usize>() as isize;
let full_sub_len =
part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;

// length of the code to be substituted
let snippet_len = span_end_pos as isize - span_start_pos as isize;
Expand Down Expand Up @@ -2568,18 +2558,53 @@ fn num_decimal_digits(num: usize) -> usize {
}

// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\t', " "), // We do our own tab replacement
('\t', " "), // We do our own tab replacement
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', ""),
('\u{2066}', ""),
('\u{2067}', ""),
('\u{2068}', ""),
('\u{202C}', ""),
('\u{2069}', ""),
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{202C}', "�"),
('\u{2069}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\u{0000}', "␀"),
('\u{0001}', "␁"),
('\u{0002}', "␂"),
('\u{0003}', "␃"),
('\u{0004}', "␄"),
('\u{0005}', "␅"),
('\u{0006}', "␆"),
('\u{0007}', "␇"),
('\u{0008}', "␈"),
('\u{000B}', "␋"),
('\u{000C}', "␌"),
('\u{000D}', "␍"),
('\u{000E}', "␎"),
('\u{000F}', "␏"),
('\u{0010}', "␐"),
('\u{0011}', "␑"),
('\u{0012}', "␒"),
('\u{0013}', "␓"),
('\u{0014}', "␔"),
('\u{0015}', "␕"),
('\u{0016}', "␖"),
('\u{0017}', "␗"),
('\u{0018}', "␘"),
('\u{0019}', "␙"),
('\u{001A}', "␚"),
('\u{001B}', "␛"),
('\u{001C}', "␜"),
('\u{001D}', "␝"),
('\u{001E}', "␞"),
('\u{001F}', "␟"),
('\u{007F}', "␡"),
];

fn normalize_whitespace(str: &str) -> String {
Expand Down
2 changes: 0 additions & 2 deletions compiler/rustc_metadata/src/rmeta/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1727,7 +1727,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
source_len,
lines,
multibyte_chars,
non_narrow_chars,
normalized_pos,
stable_id,
..
Expand Down Expand Up @@ -1779,7 +1778,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
self.cnum,
lines,
multibyte_chars,
non_narrow_chars,
normalized_pos,
source_file_index,
);
Expand Down
6 changes: 0 additions & 6 deletions compiler/rustc_query_system/src/ich/impls_syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
source_len: _,
lines: _,
ref multibyte_chars,
ref non_narrow_chars,
ref normalized_pos,
} = *self;

Expand All @@ -98,11 +97,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
char_pos.hash_stable(hcx, hasher);
}

non_narrow_chars.len().hash_stable(hcx, hasher);
for &char_pos in non_narrow_chars.iter() {
char_pos.hash_stable(hcx, hasher);
}

normalized_pos.len().hash_stable(hcx, hasher);
for &char_pos in normalized_pos.iter() {
char_pos.hash_stable(hcx, hasher);
Expand Down
40 changes: 6 additions & 34 deletions compiler/rustc_span/src/analyze_source_file.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use super::*;
use unicode_width::UnicodeWidthChar;

#[cfg(test)]
mod tests;
Expand All @@ -9,15 +8,12 @@ mod tests;
///
/// This function will use an SSE2 enhanced implementation if hardware support
/// is detected at runtime.
pub fn analyze_source_file(
src: &str,
) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
let mut lines = vec![RelativeBytePos::from_u32(0)];
let mut multi_byte_chars = vec![];
let mut non_narrow_chars = vec![];

// Calls the right implementation, depending on hardware support available.
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);

// The code above optimistically registers a new line *after* each \n
// it encounters. If that point is already outside the source_file, remove
Expand All @@ -30,7 +26,7 @@ pub fn analyze_source_file(
}
}

(lines, multi_byte_chars, non_narrow_chars)
(lines, multi_byte_chars)
}

cfg_match! {
Expand All @@ -39,11 +35,10 @@ cfg_match! {
src: &str,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) {
if is_x86_feature_detected!("sse2") {
unsafe {
analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
analyze_source_file_sse2(src, lines, multi_byte_chars);
}
} else {
analyze_source_file_generic(
Expand All @@ -52,7 +47,6 @@ cfg_match! {
RelativeBytePos::from_u32(0),
lines,
multi_byte_chars,
non_narrow_chars,
);
}
}
Expand All @@ -66,7 +60,6 @@ cfg_match! {
src: &str,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) {
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
Expand Down Expand Up @@ -159,7 +152,6 @@ cfg_match! {
RelativeBytePos::from_usize(scan_start),
lines,
multi_byte_chars,
non_narrow_chars,
);
}

Expand All @@ -172,7 +164,6 @@ cfg_match! {
RelativeBytePos::from_usize(tail_start),
lines,
multi_byte_chars,
non_narrow_chars,
);
}
}
Expand All @@ -183,15 +174,13 @@ cfg_match! {
src: &str,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) {
analyze_source_file_generic(
src,
src.len(),
RelativeBytePos::from_u32(0),
lines,
multi_byte_chars,
non_narrow_chars,
);
}
}
Expand All @@ -205,7 +194,6 @@ fn analyze_source_file_generic(
output_offset: RelativeBytePos,
lines: &mut Vec<RelativeBytePos>,
multi_byte_chars: &mut Vec<MultiByteChar>,
non_narrow_chars: &mut Vec<NonNarrowChar>,
) -> usize {
assert!(src.len() >= scan_len);
let mut i = 0;
Expand All @@ -227,16 +215,8 @@ fn analyze_source_file_generic(

let pos = RelativeBytePos::from_usize(i) + output_offset;

match byte {
b'\n' => {
lines.push(pos + RelativeBytePos(1));
}
b'\t' => {
non_narrow_chars.push(NonNarrowChar::Tab(pos));
}
_ => {
non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
}
if let b'\n' = byte {
lines.push(pos + RelativeBytePos(1));
}
} else if byte >= 127 {
// The slow path:
Expand All @@ -252,14 +232,6 @@ fn analyze_source_file_generic(
let mbc = MultiByteChar { pos, bytes: char_len as u8 };
multi_byte_chars.push(mbc);
}

// Assume control characters are zero width.
// FIXME: How can we decide between `width` and `width_cjk`?
let char_width = UnicodeWidthChar::width(c).unwrap_or(0);

if char_width != 1 {
non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
}
}

i += char_len;
Expand Down
Loading
Loading