Skip to content

Commit

Permalink
Make Unicode-to-Unicode confusables a preview change (#8473)
Browse files Browse the repository at this point in the history
  • Loading branch information
charliermarsh committed Nov 3, 2023
1 parent b0f9a14 commit f2335fe
Show file tree
Hide file tree
Showing 4 changed files with 225 additions and 15 deletions.
20 changes: 19 additions & 1 deletion crates/ruff_linter/src/rules/ruff/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mod tests {
use crate::pyproject_toml::lint_pyproject_toml;
use crate::registry::Rule;
use crate::settings::resolve_per_file_ignores;
use crate::settings::types::{PerFileIgnore, PythonVersion};
use crate::settings::types::{PerFileIgnore, PreviewMode, PythonVersion};
use crate::test::{test_path, test_resource_path};
use crate::{assert_messages, settings};

Expand Down Expand Up @@ -88,6 +88,24 @@ mod tests {
Ok(())
}

/// Snapshot test: lints the `ruff/confusables.py` fixture with preview mode
/// enabled and only the three ambiguous-Unicode rules selected, while
/// allow-listing `−`, `ρ`, and `∗` as permitted confusables.
#[test]
fn preview_confusables() -> Result<()> {
    // Only the ambiguous-Unicode rules for strings, docstrings, and comments.
    let selected_rules = vec![
        Rule::AmbiguousUnicodeCharacterString,
        Rule::AmbiguousUnicodeCharacterDocstring,
        Rule::AmbiguousUnicodeCharacterComment,
    ];

    // Start from the rule-only defaults, then turn on preview mode and
    // register the allow-listed characters.
    let linter_settings = settings::LinterSettings {
        preview: PreviewMode::Enabled,
        allowed_confusables: FxHashSet::from_iter(['−', 'ρ', '∗']),
        ..settings::LinterSettings::for_rules(selected_rules)
    };

    let fixture = Path::new("ruff/confusables.py");
    let diagnostics = test_path(fixture, &linter_settings)?;
    assert_messages!(diagnostics);
    Ok(())
}

#[test]
fn noqa() -> Result<()> {
let diagnostics = test_path(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@ use crate::rules::ruff::rules::Context;
use crate::settings::LinterSettings;

/// ## What it does
/// Checks for ambiguous unicode characters in strings.
/// Checks for ambiguous Unicode characters in strings.
///
/// ## Why is this bad?
/// The use of ambiguous unicode characters can confuse readers and cause
/// Some Unicode characters are visually similar to ASCII characters, but have
/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`)
/// is visually similar, but not identical, to the ASCII character `A`.
///
/// The use of ambiguous Unicode characters can confuse readers and cause
/// subtle bugs.
///
/// In [preview], this rule will also flag Unicode characters that are
/// confusable with other, non-preferred Unicode characters. For example, the
/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`.
///
/// ## Example
/// ```python
/// print("Ηello, world!") # "Η" is the Greek eta (`U+0397`).
Expand All @@ -28,6 +36,8 @@ use crate::settings::LinterSettings;
/// ```python
/// print("Hello, world!") # "H" is the Latin capital H (`U+0048`).
/// ```
///
/// [preview]: https://docs.astral.sh/ruff/preview/
#[violation]
pub struct AmbiguousUnicodeCharacterString {
confusable: char,
Expand All @@ -50,12 +60,20 @@ impl Violation for AmbiguousUnicodeCharacterString {
}

/// ## What it does
/// Checks for ambiguous unicode characters in docstrings.
/// Checks for ambiguous Unicode characters in docstrings.
///
/// ## Why is this bad?
/// The use of ambiguous unicode characters can confuse readers and cause
/// Some Unicode characters are visually similar to ASCII characters, but have
/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`)
/// is visually similar, but not identical, to the ASCII character `A`.
///
/// The use of ambiguous Unicode characters can confuse readers and cause
/// subtle bugs.
///
/// In [preview], this rule will also flag Unicode characters that are
/// confusable with other, non-preferred Unicode characters. For example, the
/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`.
///
/// ## Example
/// ```python
/// """A lovely docstring (with a `U+FF09` parenthesis)."""
Expand All @@ -65,6 +83,8 @@ impl Violation for AmbiguousUnicodeCharacterString {
/// ```python
/// """A lovely docstring (with no strange parentheses)."""
/// ```
///
/// [preview]: https://docs.astral.sh/ruff/preview/
#[violation]
pub struct AmbiguousUnicodeCharacterDocstring {
confusable: char,
Expand All @@ -87,12 +107,20 @@ impl Violation for AmbiguousUnicodeCharacterDocstring {
}

/// ## What it does
/// Checks for ambiguous unicode characters in comments.
/// Checks for ambiguous Unicode characters in comments.
///
/// ## Why is this bad?
/// The use of ambiguous unicode characters can confuse readers and cause
/// Some Unicode characters are visually similar to ASCII characters, but have
/// different code points. For example, `GREEK CAPITAL LETTER ALPHA` (`U+0391`)
/// is visually similar, but not identical, to the ASCII character `A`.
///
/// The use of ambiguous Unicode characters can confuse readers and cause
/// subtle bugs.
///
/// In [preview], this rule will also flag Unicode characters that are
/// confusable with other, non-preferred Unicode characters. For example, the
/// spec recommends `GREEK CAPITAL LETTER OMEGA` over `OHM SIGN`.
///
/// ## Example
/// ```python
/// foo() # nоqa # "о" is Cyrillic (`U+043E`)
Expand All @@ -102,6 +130,8 @@ impl Violation for AmbiguousUnicodeCharacterDocstring {
/// ```python
/// foo() # noqa # "o" is Latin (`U+006F`)
/// ```
///
/// [preview]: https://docs.astral.sh/ruff/preview/
#[violation]
pub struct AmbiguousUnicodeCharacterComment {
confusable: char,
Expand Down Expand Up @@ -159,7 +189,9 @@ pub(crate) fn ambiguous_unicode_character(
// Check if the boundary character is itself an ambiguous unicode character, in which
// case, it's always included as a diagnostic.
if !current_char.is_ascii() {
if let Some(representant) = confusable(current_char as u32) {
if let Some(representant) = confusable(current_char as u32)
.filter(|representant| settings.preview.is_enabled() || representant.is_ascii())
{
let candidate = Candidate::new(
TextSize::try_from(relative_offset).unwrap() + range.start(),
current_char,
Expand All @@ -173,7 +205,9 @@ pub(crate) fn ambiguous_unicode_character(
} else if current_char.is_ascii() {
// The current word contains at least one ASCII character.
word_flags |= WordFlags::ASCII;
} else if let Some(representant) = confusable(current_char as u32) {
} else if let Some(representant) = confusable(current_char as u32)
.filter(|representant| settings.preview.is_enabled() || representant.is_ascii())
{
// The current word contains an ambiguous unicode character.
word_candidates.push(Candidate::new(
TextSize::try_from(relative_offset).unwrap() + range.start(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,4 @@ confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE
47 | }"
|

confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)?
|
55 | assert getattr(Labware(), "µL") == 1.5
| ^ RUF001
|


Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
---
source: crates/ruff_linter/src/rules/ruff/mod.rs
---
confusables.py:1:6: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
|
1 | x = "𝐁ad string"
| ^ RUF001
2 | y = ""
|

confusables.py:6:56: RUF002 Docstring contains ambiguous `）` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)?
|
5 | def f():
6 | """Here's a docstring with an unusual parenthesis: )"""
| ^^ RUF002
7 | # And here's a comment with an unusual punctuation mark:
8 | ...
|

confusables.py:7:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE PUNCTUATION). Did you mean `/` (SOLIDUS)?
|
5 | def f():
6 | """Here's a docstring with an unusual parenthesis: )"""
7 | # And here's a comment with an unusual punctuation mark:
| ^ RUF003
8 | ...
|

confusables.py:17:6: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
|
17 | x = "𝐁ad string"
| ^ RUF001
18 | x = ""
|

confusables.py:26:10: RUF001 String contains ambiguous `α` (GREEK SMALL LETTER ALPHA). Did you mean `a` (LATIN SMALL LETTER A)?
|
24 | # The first word should be ignored, while the second should be included, since it
25 | # contains ASCII.
26 | x = "βα Bαd"
| ^ RUF001
27 |
28 | # The two characters should be flagged here. The first character is a "word"
|

confusables.py:31:6: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)?
|
29 | # consisting of a single ambiguous character, while the second character is a "word
30 | # boundary" (whitespace) that it itself ambiguous.
31 | x = "Р усский"
| ^ RUF001
32 |
33 | # Same test cases as above but using f-strings instead:
|

confusables.py:31:7: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
|
29 | # consisting of a single ambiguous character, while the second character is a "word
30 | # boundary" (whitespace) that it itself ambiguous.
31 | x = "Р усский"
| ^ RUF001
32 |
33 | # Same test cases as above but using f-strings instead:
|

confusables.py:34:7: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
|
33 | # Same test cases as above but using f-strings instead:
34 | x = f"𝐁ad string"
| ^ RUF001
35 | x = f""
36 | x = f"Русский"
|

confusables.py:37:11: RUF001 String contains ambiguous `α` (GREEK SMALL LETTER ALPHA). Did you mean `a` (LATIN SMALL LETTER A)?
|
35 | x = f""
36 | x = f"Русский"
37 | x = f"βα Bαd"
| ^ RUF001
38 | x = f"Р усский"
|

confusables.py:38:7: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)?
|
36 | x = f"Русский"
37 | x = f"βα Bαd"
38 | x = f"Р усский"
| ^ RUF001
39 |
40 | # Nested f-strings
|

confusables.py:38:8: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
|
36 | x = f"Русский"
37 | x = f"βα Bαd"
38 | x = f"Р усский"
| ^ RUF001
39 |
40 | # Nested f-strings
|

confusables.py:41:7: RUF001 String contains ambiguous `𝐁` (MATHEMATICAL BOLD CAPITAL B). Did you mean `B` (LATIN CAPITAL LETTER B)?
|
40 | # Nested f-strings
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
| ^ RUF001
42 |
43 | # Comments inside f-strings
|

confusables.py:41:21: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
|
40 | # Nested f-strings
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
| ^ RUF001
42 |
43 | # Comments inside f-strings
|

confusables.py:41:25: RUF001 String contains ambiguous `Р` (CYRILLIC CAPITAL LETTER ER). Did you mean `P` (LATIN CAPITAL LETTER P)?
|
40 | # Nested f-strings
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
| ^ RUF001
42 |
43 | # Comments inside f-strings
|

confusables.py:41:26: RUF001 String contains ambiguous ` ` (EN QUAD). Did you mean ` ` (SPACE)?
|
40 | # Nested f-strings
41 | x = f"𝐁ad string {f" {f"Р усский"}"}"
| ^ RUF001
42 |
43 | # Comments inside f-strings
|

confusables.py:44:68: RUF003 Comment contains ambiguous `）` (FULLWIDTH RIGHT PARENTHESIS). Did you mean `)` (RIGHT PARENTHESIS)?
|
43 | # Comments inside f-strings
44 | x = f"string { # And here's a comment with an unusual parenthesis:
| ^^ RUF003
45 | # And here's a comment with a greek alpha:
46 | foo # And here's a comment with an unusual punctuation mark:
|

confusables.py:46:62: RUF003 Comment contains ambiguous `᜵` (PHILIPPINE SINGLE PUNCTUATION). Did you mean `/` (SOLIDUS)?
|
44 | x = f"string { # And here's a comment with an unusual parenthesis:
45 | # And here's a comment with a greek alpha:
46 | foo # And here's a comment with an unusual punctuation mark:
| ^ RUF003
47 | }"
|

confusables.py:55:28: RUF001 String contains ambiguous `µ` (MICRO SIGN). Did you mean `μ` (GREEK SMALL LETTER MU)?
|
55 | assert getattr(Labware(), "µL") == 1.5
| ^ RUF001
|


0 comments on commit f2335fe

Please sign in to comment.