From f2335fe69278cb30ca338bf3ba234013c3e2125b Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 3 Nov 2023 09:17:28 -0700 Subject: [PATCH] Make Unicode-to-Unicode confusables a preview change (#8473) --- crates/ruff_linter/src/rules/ruff/mod.rs | 20 ++- .../ruff/rules/ambiguous_unicode_character.rs | 50 +++++- ...nter__rules__ruff__tests__confusables.snap | 6 - ...les__ruff__tests__preview_confusables.snap | 164 ++++++++++++++++++ 4 files changed, 225 insertions(+), 15 deletions(-) create mode 100644 crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview_confusables.snap diff --git a/crates/ruff_linter/src/rules/ruff/mod.rs b/crates/ruff_linter/src/rules/ruff/mod.rs index 6e89cc19c82eb..3993d0e14e187 100644 --- a/crates/ruff_linter/src/rules/ruff/mod.rs +++ b/crates/ruff_linter/src/rules/ruff/mod.rs @@ -17,7 +17,7 @@ mod tests { use crate::pyproject_toml::lint_pyproject_toml; use crate::registry::Rule; use crate::settings::resolve_per_file_ignores; - use crate::settings::types::{PerFileIgnore, PythonVersion}; + use crate::settings::types::{PerFileIgnore, PreviewMode, PythonVersion}; use crate::test::{test_path, test_resource_path}; use crate::{assert_messages, settings}; @@ -88,6 +88,24 @@ mod tests { Ok(()) } + #[test] + fn preview_confusables() -> Result<()> { + let diagnostics = test_path( + Path::new("ruff/confusables.py"), + &settings::LinterSettings { + preview: PreviewMode::Enabled, + allowed_confusables: FxHashSet::from_iter(['−', 'ρ', '∗']), + ..settings::LinterSettings::for_rules(vec![ + Rule::AmbiguousUnicodeCharacterString, + Rule::AmbiguousUnicodeCharacterDocstring, + Rule::AmbiguousUnicodeCharacterComment, + ]) + }, + )?; + assert_messages!(diagnostics); + Ok(()) + } + #[test] fn noqa() -> Result<()> { let diagnostics = test_path( diff --git a/crates/ruff_linter/src/rules/ruff/rules/ambiguous_unicode_character.rs b/crates/ruff_linter/src/rules/ruff/rules/ambiguous_unicode_character.rs index 
3e7d6c2c13720..0710c106caa05 100644 --- a/crates/ruff_linter/src/rules/ruff/rules/ambiguous_unicode_character.rs +++ b/crates/ruff_linter/src/rules/ruff/rules/ambiguous_unicode_character.rs @@ -13,12 +13,20 @@ use crate::rules::ruff::rules::Context; use crate::settings::LinterSettings; /// ## What it does -/// Checks for ambiguous unicode characters in strings. +/// Checks for ambiguous Unicode characters in strings. /// /// ## Why is this bad? -/// The use of ambiguous unicode characters can confuse readers and cause +/// Some Unicode characters are visually similar to ASCII characters, but have +/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`) is +/// visually similar, but not identical, to the ASCII character `A`. +/// +/// The use of ambiguous Unicode characters can confuse readers and cause /// subtle bugs. /// +/// In [preview], this rule will also flag Unicode characters that are +/// confusable with other, non-preferred Unicode characters. For example, the +/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`. +/// /// ## Example /// ```python /// print("Ηello, world!") # "Η" is the Greek eta (`U+0397`). @@ -28,6 +36,8 @@ use crate::settings::LinterSettings; /// ```python /// print("Hello, world!") # "H" is the Latin capital H (`U+0048`). /// ``` +/// +/// [preview]: https://docs.astral.sh/ruff/preview/ #[violation] pub struct AmbiguousUnicodeCharacterString { confusable: char, @@ -50,12 +60,20 @@ impl Violation for AmbiguousUnicodeCharacterString { } /// ## What it does -/// Checks for ambiguous unicode characters in docstrings. +/// Checks for ambiguous Unicode characters in docstrings. /// /// ## Why is this bad? -/// The use of ambiguous unicode characters can confuse readers and cause +/// Some Unicode characters are visually similar to ASCII characters, but have +/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`) is +/// visually similar, but not identical, to the ASCII character `A`. 
+/// +/// The use of ambiguous Unicode characters can confuse readers and cause /// subtle bugs. /// +/// In [preview], this rule will also flag Unicode characters that are +/// confusable with other, non-preferred Unicode characters. For example, the +/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`. +/// /// ## Example /// ```python /// """A lovely docstring (with a `U+FF09` parenthesis).""" @@ -65,6 +83,8 @@ impl Violation for AmbiguousUnicodeCharacterString { /// ```python /// """A lovely docstring (with no strange parentheses).""" /// ``` +/// +/// [preview]: https://docs.astral.sh/ruff/preview/ #[violation] pub struct AmbiguousUnicodeCharacterDocstring { confusable: char, @@ -87,12 +107,20 @@ impl Violation for AmbiguousUnicodeCharacterDocstring { } /// ## What it does -/// Checks for ambiguous unicode characters in comments. +/// Checks for ambiguous Unicode characters in comments. /// /// ## Why is this bad? -/// The use of ambiguous unicode characters can confuse readers and cause +/// Some Unicode characters are visually similar to ASCII characters, but have +/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`) is +/// visually similar, but not identical, to the ASCII character `A`. +/// +/// The use of ambiguous Unicode characters can confuse readers and cause /// subtle bugs. /// +/// In [preview], this rule will also flag Unicode characters that are +/// confusable with other, non-preferred Unicode characters. For example, the +/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`. 
+/// /// ## Example /// ```python /// foo() # nоqa # "о" is Cyrillic (`U+043E`) @@ -102,6 +130,8 @@ impl Violation for AmbiguousUnicodeCharacterDocstring { /// ```python /// foo() # noqa # "o" is Latin (`U+006F`) /// ``` +/// +/// [preview]: https://docs.astral.sh/ruff/preview/ #[violation] pub struct AmbiguousUnicodeCharacterComment { confusable: char, @@ -159,7 +189,9 @@ pub(crate) fn ambiguous_unicode_character( // Check if the boundary character is itself an ambiguous unicode character, in which // case, it's always included as a diagnostic. if !current_char.is_ascii() { - if let Some(representant) = confusable(current_char as u32) { + if let Some(representant) = confusable(current_char as u32) + .filter(|representant| settings.preview.is_enabled() || representant.is_ascii()) + { let candidate = Candidate::new( TextSize::try_from(relative_offset).unwrap() + range.start(), current_char, @@ -173,7 +205,9 @@ pub(crate) fn ambiguous_unicode_character( } else if current_char.is_ascii() { // The current word contains at least one ASCII character. word_flags |= WordFlags::ASCII; - } else if let Some(representant) = confusable(current_char as u32) { + } else if let Some(representant) = confusable(current_char as u32) + .filter(|representant| settings.preview.is_enabled() || representant.is_ascii()) + { // The current word contains an ambiguous unicode character. 
word_candidates.push(Candidate::new( TextSize::try_from(relative_offset).unwrap() + range.start(), diff --git a/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__confusables.snap b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__confusables.snap index 1a7b2d480542d..541fc82af67a1 100644 --- a/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__confusables.snap +++ b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__confusables.snap @@ -155,10 +155,4 @@ confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE 47 | }" | -confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)? - | -55 | assert getattr(Labware(), "µL") == 1.5 - | ^ RUF001 - | - diff --git a/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview_confusables.snap b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview_confusables.snap new file mode 100644 index 0000000000000..1a7b2d480542d --- /dev/null +++ b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview_confusables.snap @@ -0,0 +1,164 @@ +--- +source: crates/ruff_linter/src/rules/ruff/mod.rs +--- +confusables.py:1:6: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)? + | +1 | x = "𝐁ad string" + | ^ RUF001 +2 | y = "−" + | + +confusables.py:6:56: RUF002 Docstring contains ambiguous `)` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)? + | +5 | def f(): +6 | """Here's a docstring with an unusual parenthesis: )""" + | ^^ RUF002 +7 | # And here's a comment with an unusual punctuation mark: ᜵ +8 | ... + | + +confusables.py:7:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE PUNCTUATION). Did you mean `/` (SOLIDUS)? 
+ | +5 | def f(): +6 | """Here's a docstring with an unusual parenthesis: )""" +7 | # And here's a comment with an unusual punctuation mark: ᜵ + | ^ RUF003 +8 | ... + | + +confusables.py:17:6: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)? + | +17 | x = "𝐁ad string" + | ^ RUF001 +18 | x = "−" + | + +confusables.py:26:10: RUF001 String contains ambiguous `α` (GREEK SMALL LETTER ALPHA). Did you mean `a` (LATIN SMALL LETTER A)? + | +24 | # The first word should be ignored, while the second should be included, since it +25 | # contains ASCII. +26 | x = "βα Bαd" + | ^ RUF001 +27 | +28 | # The two characters should be flagged here. The first character is a "word" + | + +confusables.py:31:6: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)? + | +29 | # consisting of a single ambiguous character, while the second character is a "word +30 | # boundary" (whitespace) that it itself ambiguous. +31 | x = "Р усский" + | ^ RUF001 +32 | +33 | # Same test cases as above but using f-strings instead: + | + +confusables.py:31:7: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)? + | +29 | # consisting of a single ambiguous character, while the second character is a "word +30 | # boundary" (whitespace) that it itself ambiguous. +31 | x = "Р усский" + | ^ RUF001 +32 | +33 | # Same test cases as above but using f-strings instead: + | + +confusables.py:34:7: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)? + | +33 | # Same test cases as above but using f-strings instead: +34 | x = f"𝐁ad string" + | ^ RUF001 +35 | x = f"−" +36 | x = f"Русский" + | + +confusables.py:37:11: RUF001 String contains ambiguous `α` (GREEK SMALL LETTER ALPHA). Did you mean `a` (LATIN SMALL LETTER A)? 
+ | +35 | x = f"−" +36 | x = f"Русский" +37 | x = f"βα Bαd" + | ^ RUF001 +38 | x = f"Р усский" + | + +confusables.py:38:7: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)? + | +36 | x = f"Русский" +37 | x = f"βα Bαd" +38 | x = f"Р усский" + | ^ RUF001 +39 | +40 | # Nested f-strings + | + +confusables.py:38:8: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)? + | +36 | x = f"Русский" +37 | x = f"βα Bαd" +38 | x = f"Р усский" + | ^ RUF001 +39 | +40 | # Nested f-strings + | + +confusables.py:41:7: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)? + | +40 | # Nested f-strings +41 | x = f"𝐁ad string {f" {f"Р усский"}"}" + | ^ RUF001 +42 | +43 | # Comments inside f-strings + | + +confusables.py:41:21: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)? + | +40 | # Nested f-strings +41 | x = f"𝐁ad string {f" {f"Р усский"}"}" + | ^ RUF001 +42 | +43 | # Comments inside f-strings + | + +confusables.py:41:25: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)? + | +40 | # Nested f-strings +41 | x = f"𝐁ad string {f" {f"Р усский"}"}" + | ^ RUF001 +42 | +43 | # Comments inside f-strings + | + +confusables.py:41:26: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)? + | +40 | # Nested f-strings +41 | x = f"𝐁ad string {f" {f"Р усский"}"}" + | ^ RUF001 +42 | +43 | # Comments inside f-strings + | + +confusables.py:44:68: RUF003 Comment contains ambiguous `)` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)? 
+ | +43 | # Comments inside f-strings +44 | x = f"string { # And here's a comment with an unusual parenthesis: ) + | ^^ RUF003 +45 | # And here's a comment with a greek alpha: ∗ +46 | foo # And here's a comment with an unusual punctuation mark: ᜵ + | + +confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE PUNCTUATION). Did you mean `/` (SOLIDUS)? + | +44 | x = f"string { # And here's a comment with an unusual parenthesis: ) +45 | # And here's a comment with a greek alpha: ∗ +46 | foo # And here's a comment with an unusual punctuation mark: ᜵ + | ^ RUF003 +47 | }" + | + +confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)? + | +55 | assert getattr(Labware(), "µL") == 1.5 + | ^ RUF001 + | + +