Rollup merge of #127528 - estebank:ascii-control-chars, r=oli-obk

Replace ASCII control chars with Unicode Control Pictures Replace ASCII control chars like `CR` with Unicode Control Pictures like `␍`: ``` error: bare CR not allowed in doc-comment --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:3:32 | LL | /// doc comment with bare CR: '␍' | ^ ``` Centralize the checking of unicode char width for the purposes of CLI display in one place. Account for the new replacements. Remove unneeded tracking of "zero-width" unicode chars, as we calculate these in the `SourceMap` as needed now.
rust-lang · Jul 25, 2024 · cce2db0 · cce2db0
2 parents cfc5f25 + 9bd7680
commit cce2db0
Show file tree

Hide file tree

Showing 67 changed files with 216 additions and 308 deletions.
diff --git a/Cargo.lock b/Cargo.lock
@@ -3882,7 +3882,6 @@ dependencies = [
  "termcolor",
  "termize",
  "tracing",
- "unicode-width",
  "windows",
 ]
 

diff --git a/compiler/rustc_errors/Cargo.toml b/compiler/rustc_errors/Cargo.toml
@@ -26,7 +26,6 @@ serde_json = "1.0.59"
 termcolor = "1.2.0"
 termize = "0.1.1"
 tracing = "0.1"
-unicode-width = "0.1.4"
 # tidy-alphabetical-end
 
 [target.'cfg(windows)'.dependencies.windows]

diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs
@@ -8,7 +8,7 @@
 //! The output types are defined in `rustc_session::config::ErrorOutputType`.
 
 use rustc_span::source_map::SourceMap;
-use rustc_span::{FileLines, FileName, SourceFile, Span};
+use rustc_span::{char_width, FileLines, FileName, SourceFile, Span};
 
 use crate::snippet::{
     Annotation, AnnotationColumn, AnnotationType, Line, MultilineAnnotation, Style, StyledString,
@@ -677,10 +677,7 @@ impl HumanEmitter {
             .skip(left)
             .take_while(|ch| {
                 // Make sure that the trimming on the right will fall within the terminal width.
-                // FIXME: `unicode_width` sometimes disagrees with terminals on how wide a `char`
-                // is. For now, just accept that sometimes the code line will be longer than
-                // desired.
-                let next = unicode_width::UnicodeWidthChar::width(*ch).unwrap_or(1);
+                let next = char_width(*ch);
                 if taken + next > right - left {
                     return false;
                 }
@@ -742,11 +739,7 @@ impl HumanEmitter {
         let left = margin.left(source_string.len());
 
         // Account for unicode characters of width !=0 that were removed.
-        let left = source_string
-            .chars()
-            .take(left)
-            .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
-            .sum();
+        let left = source_string.chars().take(left).map(|ch| char_width(ch)).sum();
 
         self.draw_line(
             buffer,
@@ -2039,7 +2032,7 @@ impl HumanEmitter {
                     let sub_len: usize =
                         if is_whitespace_addition { &part.snippet } else { part.snippet.trim() }
                             .chars()
-                            .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
+                            .map(|ch| char_width(ch))
                             .sum();
 
                     let offset: isize = offsets
@@ -2076,11 +2069,8 @@ impl HumanEmitter {
                     }
 
                     // length of the code after substitution
-                    let full_sub_len = part
-                        .snippet
-                        .chars()
-                        .map(|ch| unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1))
-                        .sum::<usize>() as isize;
+                    let full_sub_len =
+                        part.snippet.chars().map(|ch| char_width(ch)).sum::<usize>() as isize;
 
                     // length of the code to be substituted
                     let snippet_len = span_end_pos as isize - span_start_pos as isize;
@@ -2568,18 +2558,53 @@ fn num_decimal_digits(num: usize) -> usize {
 }
 
 // We replace some characters so the CLI output is always consistent and underlines aligned.
+// Keep the following list in sync with `rustc_span::char_width`.
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
-    ('\t', "    "),   // We do our own tab replacement
+    ('\t', "    "),    // We do our own tab replacement
     ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
-    ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
-    ('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
-    ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
-    ('\u{202E}', ""),
-    ('\u{2066}', ""),
-    ('\u{2067}', ""),
-    ('\u{2068}', ""),
-    ('\u{202C}', ""),
-    ('\u{2069}', ""),
+    ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
+    ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
+    ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
+    ('\u{202E}', "�"),
+    ('\u{2066}', "�"),
+    ('\u{2067}', "�"),
+    ('\u{2068}', "�"),
+    ('\u{202C}', "�"),
+    ('\u{2069}', "�"),
+    // In terminals without Unicode support the following will be garbled, but in *all* terminals
+    // the underlying codepoint will be as well. We could gate this replacement behind a "unicode
+    // support" gate.
+    ('\u{0000}', "␀"),
+    ('\u{0001}', "␁"),
+    ('\u{0002}', "␂"),
+    ('\u{0003}', "␃"),
+    ('\u{0004}', "␄"),
+    ('\u{0005}', "␅"),
+    ('\u{0006}', "␆"),
+    ('\u{0007}', "␇"),
+    ('\u{0008}', "␈"),
+    ('\u{000B}', "␋"),
+    ('\u{000C}', "␌"),
+    ('\u{000D}', "␍"),
+    ('\u{000E}', "␎"),
+    ('\u{000F}', "␏"),
+    ('\u{0010}', "␐"),
+    ('\u{0011}', "␑"),
+    ('\u{0012}', "␒"),
+    ('\u{0013}', "␓"),
+    ('\u{0014}', "␔"),
+    ('\u{0015}', "␕"),
+    ('\u{0016}', "␖"),
+    ('\u{0017}', "␗"),
+    ('\u{0018}', "␘"),
+    ('\u{0019}', "␙"),
+    ('\u{001A}', "␚"),
+    ('\u{001B}', "␛"),
+    ('\u{001C}', "␜"),
+    ('\u{001D}', "␝"),
+    ('\u{001E}', "␞"),
+    ('\u{001F}', "␟"),
+    ('\u{007F}', "␡"),
 ];
 
 fn normalize_whitespace(str: &str) -> String {

diff --git a/compiler/rustc_metadata/src/rmeta/decoder.rs b/compiler/rustc_metadata/src/rmeta/decoder.rs
@@ -1728,7 +1728,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
                     source_len,
                     lines,
                     multibyte_chars,
-                    non_narrow_chars,
                     normalized_pos,
                     stable_id,
                     ..
@@ -1780,7 +1779,6 @@ impl<'a, 'tcx> CrateMetadataRef<'a> {
                     self.cnum,
                     lines,
                     multibyte_chars,
-                    non_narrow_chars,
                     normalized_pos,
                     source_file_index,
                 );

diff --git a/compiler/rustc_query_system/src/ich/impls_syntax.rs b/compiler/rustc_query_system/src/ich/impls_syntax.rs
@@ -73,7 +73,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
             source_len: _,
             lines: _,
             ref multibyte_chars,
-            ref non_narrow_chars,
             ref normalized_pos,
         } = *self;
 
@@ -98,11 +97,6 @@ impl<'a> HashStable<StableHashingContext<'a>> for SourceFile {
             char_pos.hash_stable(hcx, hasher);
         }
 
-        non_narrow_chars.len().hash_stable(hcx, hasher);
-        for &char_pos in non_narrow_chars.iter() {
-            char_pos.hash_stable(hcx, hasher);
-        }
-
         normalized_pos.len().hash_stable(hcx, hasher);
         for &char_pos in normalized_pos.iter() {
             char_pos.hash_stable(hcx, hasher);

diff --git a/compiler/rustc_span/src/analyze_source_file.rs b/compiler/rustc_span/src/analyze_source_file.rs
@@ -1,5 +1,4 @@
 use super::*;
-use unicode_width::UnicodeWidthChar;
 
 #[cfg(test)]
 mod tests;
@@ -9,15 +8,12 @@ mod tests;
 ///
 /// This function will use an SSE2 enhanced implementation if hardware support
 /// is detected at runtime.
-pub fn analyze_source_file(
-    src: &str,
-) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) {
+pub fn analyze_source_file(src: &str) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>) {
     let mut lines = vec![RelativeBytePos::from_u32(0)];
     let mut multi_byte_chars = vec![];
-    let mut non_narrow_chars = vec![];
 
     // Calls the right implementation, depending on hardware support available.
-    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars);
+    analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars);
 
     // The code above optimistically registers a new line *after* each \n
     // it encounters. If that point is already outside the source_file, remove
@@ -30,7 +26,7 @@ pub fn analyze_source_file(
         }
     }
 
-    (lines, multi_byte_chars, non_narrow_chars)
+    (lines, multi_byte_chars)
 }
 
 cfg_match! {
@@ -39,11 +35,10 @@ cfg_match! {
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
-            non_narrow_chars: &mut Vec<NonNarrowChar>,
         ) {
             if is_x86_feature_detected!("sse2") {
                 unsafe {
-                    analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars);
+                    analyze_source_file_sse2(src, lines, multi_byte_chars);
                 }
             } else {
                 analyze_source_file_generic(
@@ -52,7 +47,6 @@ cfg_match! {
                     RelativeBytePos::from_u32(0),
                     lines,
                     multi_byte_chars,
-                    non_narrow_chars,
                 );
             }
         }
@@ -66,7 +60,6 @@ cfg_match! {
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
-            non_narrow_chars: &mut Vec<NonNarrowChar>,
         ) {
             #[cfg(target_arch = "x86")]
             use std::arch::x86::*;
@@ -159,7 +152,6 @@ cfg_match! {
                     RelativeBytePos::from_usize(scan_start),
                     lines,
                     multi_byte_chars,
-                    non_narrow_chars,
                 );
             }
 
@@ -172,7 +164,6 @@ cfg_match! {
                     RelativeBytePos::from_usize(tail_start),
                     lines,
                     multi_byte_chars,
-                    non_narrow_chars,
                 );
             }
         }
@@ -183,15 +174,13 @@ cfg_match! {
             src: &str,
             lines: &mut Vec<RelativeBytePos>,
             multi_byte_chars: &mut Vec<MultiByteChar>,
-            non_narrow_chars: &mut Vec<NonNarrowChar>,
         ) {
             analyze_source_file_generic(
                 src,
                 src.len(),
                 RelativeBytePos::from_u32(0),
                 lines,
                 multi_byte_chars,
-                non_narrow_chars,
             );
         }
     }
@@ -205,7 +194,6 @@ fn analyze_source_file_generic(
     output_offset: RelativeBytePos,
     lines: &mut Vec<RelativeBytePos>,
     multi_byte_chars: &mut Vec<MultiByteChar>,
-    non_narrow_chars: &mut Vec<NonNarrowChar>,
 ) -> usize {
     assert!(src.len() >= scan_len);
     let mut i = 0;
@@ -227,16 +215,8 @@ fn analyze_source_file_generic(
 
             let pos = RelativeBytePos::from_usize(i) + output_offset;
 
-            match byte {
-                b'\n' => {
-                    lines.push(pos + RelativeBytePos(1));
-                }
-                b'\t' => {
-                    non_narrow_chars.push(NonNarrowChar::Tab(pos));
-                }
-                _ => {
-                    non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos));
-                }
+            if let b'\n' = byte {
+                lines.push(pos + RelativeBytePos(1));
             }
         } else if byte >= 127 {
             // The slow path:
@@ -252,14 +232,6 @@ fn analyze_source_file_generic(
                 let mbc = MultiByteChar { pos, bytes: char_len as u8 };
                 multi_byte_chars.push(mbc);
             }
-
-            // Assume control characters are zero width.
-            // FIXME: How can we decide between `width` and `width_cjk`?
-            let char_width = UnicodeWidthChar::width(c).unwrap_or(0);
-
-            if char_width != 1 {
-                non_narrow_chars.push(NonNarrowChar::new(pos, char_width));
-            }
         }
 
         i += char_len;