From 7893be7dc699dcf7f6a72f69b3b8aae18634f010 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Fri, 9 Feb 2024 10:22:20 -0500 Subject: [PATCH] Introduce `Indent` datatype --- .../src/string/docstring.rs | 205 ++++++++++++------ 1 file changed, 142 insertions(+), 63 deletions(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index b09324a10f479..e9a2863967167 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -2,11 +2,13 @@ // "reStructuredText." #![allow(clippy::doc_markdown)] +use std::cmp::Ordering; use std::{borrow::Cow, collections::VecDeque}; +use itertools::Itertools; + use ruff_formatter::printer::SourceMapGeneration; use ruff_python_parser::ParseError; - use {once_cell::sync::Lazy, regex::Regex}; use { ruff_formatter::{write, FormatOptions, IndentStyle, LineWidth, Printed}, @@ -180,7 +182,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form .clone() // We don't want to count whitespace-only lines as miss-indented .filter(|line| !line.trim().is_empty()) - .map(indentation_length) + .map(|line| Indent::from_str(line).len()) .min() .unwrap_or_default(); @@ -345,7 +347,7 @@ impl<'ast, 'buf, 'fmt, 'src> DocstringLinePrinter<'ast, 'buf, 'fmt, 'src> { }; // This looks suspicious, but it's consistent with the whitespace // normalization that will occur anyway. - let indent = " ".repeat(min_indent); + let indent = " ".repeat(min_indent.len()); for docline in formatted_lines { self.print_one( &docline.map(|line| std::format!("{indent}{line}")), @@ -355,7 +357,7 @@ impl<'ast, 'buf, 'fmt, 'src> DocstringLinePrinter<'ast, 'buf, 'fmt, 'src> { CodeExampleKind::Markdown(fenced) => { // This looks suspicious, but it's consistent with the whitespace // normalization that will occur anyway. 
- let indent = " ".repeat(fenced.opening_fence_indent); + let indent = " ".repeat(fenced.opening_fence_indent.len()); for docline in formatted_lines { self.print_one( &docline.map(|line| std::format!("{indent}{line}")), @@ -400,7 +402,7 @@ impl<'ast, 'buf, 'fmt, 'src> DocstringLinePrinter<'ast, 'buf, 'fmt, 'src> { // overindented, in which case we strip the additional whitespace // (see example in [`format_docstring`] doc comment). We then // prepend the in-docstring indentation to the string. - let indent_len = indentation_length(trim_end) - self.stripped_indentation_length; + let indent_len = Indent::from_str(trim_end).len() - self.stripped_indentation_length; let in_docstring_indent = " ".repeat(indent_len) + trim_end.trim_start(); text(&in_docstring_indent).fmt(self.f)?; } else { @@ -907,7 +909,7 @@ struct CodeExampleRst<'src> { /// The content body of a block needs to be indented more than the line /// opening the block, so we use this indentation to look for indentation /// that is "more than" it. - opening_indent: usize, + opening_indent: Indent, /// The minimum indent of the block measured via `indentation_length`. /// @@ -926,7 +928,7 @@ struct CodeExampleRst<'src> { /// When the code snippet has been extracted, it is re-built before being /// reformatted. The minimum indent is stripped from each line when it is /// re-built. - min_indent: Option<usize>, + min_indent: Option<Indent>, /// Whether this is a directive block or not. When not a directive, this is /// a literal block. 
The main difference between them is that they start @@ -975,7 +977,7 @@ impl<'src> CodeExampleRst<'src> { } Some(CodeExampleRst { lines: vec![], - opening_indent: indentation_length(opening_indent), + opening_indent: Indent::from_str(opening_indent), min_indent: None, is_directive: false, }) @@ -1013,7 +1015,7 @@ impl<'src> CodeExampleRst<'src> { } Some(CodeExampleRst { lines: vec![], - opening_indent: indentation_length(original.line), + opening_indent: Indent::from_str(original.line), min_indent: None, is_directive: true, }) @@ -1033,7 +1035,7 @@ impl<'src> CodeExampleRst<'src> { line.code = if line.original.line.trim().is_empty() { "" } else { - indentation_trim(min_indent, line.original.line) + min_indent.trim(line.original.line) }; } &self.lines @@ -1070,7 +1072,7 @@ impl<'src> CodeExampleRst<'src> { // an empty line followed by an unindented non-empty line. if let Some(next) = original.next { let (next_indent, next_rest) = indent_with_suffix(next); - if !next_rest.is_empty() && indentation_length(next_indent) <= self.opening_indent { + if !next_rest.is_empty() && Indent::from_str(next_indent) <= self.opening_indent { self.push_format_action(queue); return None; } @@ -1082,7 +1084,7 @@ impl<'src> CodeExampleRst<'src> { queue.push_back(CodeExampleAddAction::Kept); return Some(self); } - let indent_len = indentation_length(indent); + let indent_len = Indent::from_str(indent); if indent_len <= self.opening_indent { // If we find an unindented non-empty line at the same (or less) // indentation of the opening line at this point, then we know it @@ -1144,7 +1146,7 @@ impl<'src> CodeExampleRst<'src> { queue.push_back(CodeExampleAddAction::Print { original }); return Some(self); } - let min_indent = indentation_length(indent); + let min_indent = Indent::from_str(indent); // At this point, we found a non-empty line. The only thing we require // is that its indentation is strictly greater than the indentation of // the line containing the `::`. 
Otherwise, we treat this as an invalid @@ -1223,7 +1225,7 @@ struct CodeExampleMarkdown<'src> { /// /// This indentation is trimmed from the indentation of every line in the /// body of the code block, - opening_fence_indent: usize, + opening_fence_indent: Indent, /// The kind of fence, backticks or tildes, used for this block. We need to /// keep track of which kind was used to open the block in order to look @@ -1292,7 +1294,7 @@ impl<'src> CodeExampleMarkdown<'src> { }; Some(CodeExampleMarkdown { lines: vec![], - opening_fence_indent: indentation_length(opening_fence_indent), + opening_fence_indent: Indent::from_str(opening_fence_indent), fence_kind, fence_len, }) @@ -1325,7 +1327,7 @@ impl<'src> CodeExampleMarkdown<'src> { // its indent normalized. And, at the time of writing, a subsequent // formatting run undoes this indentation, thus violating idempotency. if !original.line.trim_whitespace().is_empty() - && indentation_length(original.line) < self.opening_fence_indent + && Indent::from_str(original.line) < self.opening_fence_indent { queue.push_back(self.into_reset_action()); queue.push_back(CodeExampleAddAction::Print { original }); @@ -1371,7 +1373,7 @@ impl<'src> CodeExampleMarkdown<'src> { // Unlike reStructuredText blocks, for Markdown fenced code blocks, the // indentation that we want to strip from each line is known when the // block is opened. So we can strip it as we collect lines. - let code = indentation_trim(self.opening_fence_indent, original.line); + let code = self.opening_fence_indent.trim(original.line); self.lines.push(CodeExampleLine { original, code }); } @@ -1537,53 +1539,131 @@ fn needs_chaperone_space(normalized: &NormalizedString, trim_end: &str) -> bool || trim_end.chars().rev().take_while(|c| *c == '\\').count() % 2 == 1 } -/// Returns the indentation's visual width in columns/spaces. -/// -/// For docstring indentation, black counts spaces as 1 and tabs by increasing the indentation up -/// to the next multiple of 8. 
This is effectively a port of -/// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs), -/// which black [calls with the default tab width of 8](https://github.com/psf/black/blob/c36e468794f9256d5e922c399240d49782ba04f1/src/black/strings.py#L61). -fn indentation_length(line: &str) -> usize { - let mut indentation = 0usize; - for char in line.chars() { - if char == '\t' { - // Pad to the next multiple of tab_width - indentation += 8 - (indentation.rem_euclid(8)); - } else if char.is_whitespace() { - indentation += char.len_utf8(); +#[derive(Copy, Clone, Debug)] +enum Indent { + /// Space only indentation or an empty indentation. + /// + /// The value is the number of spaces. + Spaces(usize), + + /// Tabs only indentation. + Tabs { count: usize }, + + /// Smart tab indentation that uses tabs for indents, and spaces for alignment. + Align { tabs: usize, spaces: usize }, + + /// Mixed indentation of tabs and spaces. + Mixed(usize), +} + +impl Indent { + fn from_str(s: &str) -> Indent { + let mut iter = s.chars().peekable(); + + let spaces = iter.peeking_take_while(|c| *c == ' ').count(); + let tabs = iter.peeking_take_while(|c| *c == '\t').count(); + + if tabs == 0 { + // No indent, or spaces only indent + return Indent::Spaces(spaces); + } + + // Test if there are any spaces following the tabs + let spaces = iter.peeking_take_while(|c| *c == ' ').count(); + + if spaces == 0 { + return Indent::Tabs { count: tabs }; + } + + // At this point it's either a smart tab (tabs followed by spaces) or a wild mix of tabs and spaces. + if iter.peek().copied() == Some('\t') { + // Sequence of tabs, spaces, tabs... 
+ let indent_width = 8; + let mut indentation = tabs * indent_width + spaces; + + for char in iter { + if char == '\t' { + // Pad to the next multiple of tab_width + indentation += indent_width - (indentation.rem_euclid(indent_width)); + } else if char.is_whitespace() { + indentation += char.len_utf8(); + } else { + break; + } + } + + // Mixed tabs and spaces + Indent::Mixed(indentation) } else { - break; + Indent::Align { tabs, spaces } } } - indentation -} -/// Trims at most `indent_len` indentation from the beginning of `line`. -/// -/// This treats indentation in precisely the same way as `indentation_length`. -/// As such, it is expected that `indent_len` is computed from -/// `indentation_length`. This is useful when one needs to trim some minimum -/// level of indentation from a code snippet collected from a docstring before -/// attempting to reformat it. -fn indentation_trim(indent_len: usize, line: &str) -> &str { - let mut seen_indent_len = 0; - let mut trimmed = line; - for char in line.chars() { - if seen_indent_len >= indent_len { - return trimmed; + /// Returns the indentation's visual width in columns/spaces. + /// + /// For docstring indentation, black counts spaces as 1 and tabs by increasing the indentation up + /// to the next multiple of 8. This is effectively a port of + /// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs), + /// which black [calls with the default tab width of 8](https://github.com/psf/black/blob/c36e468794f9256d5e922c399240d49782ba04f1/src/black/strings.py#L61). 
+ const fn len(self) -> usize { + let indent_width = 8usize; + match self { + Indent::Spaces(count) => count, + Indent::Tabs { count } => count * indent_width, + Indent::Align { tabs, spaces } => tabs * indent_width + spaces, + Indent::Mixed(width) => width, } - if char == '\t' { - // Pad to the next multiple of tab_width - seen_indent_len += 8 - (seen_indent_len.rem_euclid(8)); - trimmed = &trimmed[1..]; - } else if char.is_whitespace() { - seen_indent_len += char.len_utf8(); - trimmed = &trimmed[char.len_utf8()..]; - } else { - break; + } + + /// Trims at most `indent_len` indentation from the beginning of `line`. + /// + /// This treats indentation in precisely the same way as `indentation_length`. + /// As such, it is expected that `indent_len` is computed from + /// `indentation_length`. This is useful when one needs to trim some minimum + /// level of indentation from a code snippet collected from a docstring before + /// attempting to reformat it. + fn trim(self, line: &str) -> &str { + let indent_width = 8usize; + + let mut seen_indent_len = 0; + let mut trimmed = line; + let indent_len = self.len(); + + for char in line.chars() { + if seen_indent_len >= indent_len { + return trimmed; + } + if char == '\t' { + // Pad to the next multiple of tab_width + seen_indent_len += indent_width - (seen_indent_len.rem_euclid(indent_width)); + trimmed = &trimmed[1..]; + } else if char.is_whitespace() { + seen_indent_len += char.len_utf8(); + trimmed = &trimmed[char.len_utf8()..]; + } else { + break; + } } + trimmed + } +} + +impl PartialOrd for Indent { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.len().cmp(&other.len())) + } +} + +impl PartialEq for Indent { + fn eq(&self, other: &Self) -> bool { + self.len() == other.len() + } +} + +impl Default for Indent { + fn default() -> Self { + Indent::Spaces(0) } - trimmed } /// Returns the indentation of the given line and everything following it. 
@@ -1613,14 +1693,13 @@ fn is_rst_option(line: &str) -> bool { #[cfg(test)] mod tests { - - use super::indentation_length; + use crate::string::docstring::Indent; #[test] fn test_indentation_like_black() { - assert_eq!(indentation_length("\t \t \t"), 24); - assert_eq!(indentation_length("\t \t"), 24); - assert_eq!(indentation_length("\t\t\t"), 24); - assert_eq!(indentation_length(" "), 4); + assert_eq!(Indent::from_str("\t \t \t").len(), 24); + assert_eq!(Indent::from_str("\t \t").len(), 24); + assert_eq!(Indent::from_str("\t\t\t").len(), 24); + assert_eq!(Indent::from_str(" ").len(), 4); } }