diff --git a/crates/ruff_python_formatter/src/expression/string.rs b/crates/ruff_python_formatter/src/expression/string/docstring.rs similarity index 52% rename from crates/ruff_python_formatter/src/expression/string.rs rename to crates/ruff_python_formatter/src/expression/string/docstring.rs index 2c711ac60ef1f..8f1f335218f6d 100644 --- a/crates/ruff_python_formatter/src/expression/string.rs +++ b/crates/ruff_python_formatter/src/expression/string/docstring.rs @@ -1,832 +1,14 @@ use std::borrow::Cow; -use bitflags::bitflags; - -use ruff_formatter::{format_args, write, Printed}; -use ruff_python_ast::AnyNodeRef; -use ruff_python_ast::{ - self as ast, ExprBytesLiteral, ExprFString, ExprStringLiteral, ExpressionRef, -}; -use ruff_source_file::Locator; -use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; - -use crate::comments::{leading_comments, trailing_comments}; -use crate::expression::parentheses::{ - in_parentheses_only_group, in_parentheses_only_soft_line_break_or_space, +use { + ruff_formatter::{write, Printed}, + ruff_source_file::Locator, + ruff_text_size::{Ranged, TextLen, TextRange, TextSize}, }; -use crate::expression::Expr; -use crate::prelude::*; -use crate::FormatModuleError; -use crate::QuoteStyle; - -#[derive(Copy, Clone, Debug)] -enum Quoting { - CanChange, - Preserve, -} - -#[derive(Clone, Debug)] -pub(super) enum AnyString<'a> { - String(&'a ExprStringLiteral), - Bytes(&'a ExprBytesLiteral), - FString(&'a ExprFString), -} - -impl<'a> AnyString<'a> { - pub(crate) fn from_expression(expression: &'a Expr) -> Option> { - match expression { - Expr::StringLiteral(string) => Some(AnyString::String(string)), - Expr::BytesLiteral(bytes) => Some(AnyString::Bytes(bytes)), - Expr::FString(fstring) => Some(AnyString::FString(fstring)), - _ => None, - } - } - - fn quoting(&self, locator: &Locator) -> Quoting { - match self { - Self::String(_) | Self::Bytes(_) => Quoting::CanChange, - Self::FString(f_string) => { - let unprefixed = locator - .slice(f_string.range) - .trim_start_matches(|c| c != '"' && c != '\''); - let triple_quoted = - unprefixed.starts_with(r#"""""#) || unprefixed.starts_with(r"'''"); - if f_string.value.elements().any(|value| match value { - Expr::FormattedValue(ast::ExprFormattedValue { range, .. }) => { - let string_content = locator.slice(*range); - if triple_quoted { - string_content.contains(r#"""""#) || string_content.contains("'''") - } else { - string_content.contains(['"', '\'']) - } - } - _ => false, - }) { - Quoting::Preserve - } else { - Quoting::CanChange - } - } - } - } - - /// Returns `true` if the string is implicitly concatenated. - pub(super) fn is_implicit_concatenated(&self) -> bool { - match self { - Self::String(ExprStringLiteral { value, .. }) => value.is_implicit_concatenated(), - Self::Bytes(ExprBytesLiteral { value, .. }) => value.is_implicit_concatenated(), - Self::FString(ExprFString { value, .. }) => value.is_implicit_concatenated(), - } - } - - fn parts(&self) -> Vec> { - match self { - Self::String(ExprStringLiteral { value, .. }) => { - value.parts().map(AnyStringPart::String).collect() - } - Self::Bytes(ExprBytesLiteral { value, .. }) => { - value.parts().map(AnyStringPart::Bytes).collect() - } - Self::FString(ExprFString { value, .. }) => value - .parts() - .map(|f_string_part| match f_string_part { - ast::FStringPart::Literal(string_literal) => { - AnyStringPart::String(string_literal) - } - ast::FStringPart::FString(f_string) => AnyStringPart::FString(f_string), - }) - .collect(), - } - } -} - -impl Ranged for AnyString<'_> { - fn range(&self) -> TextRange { - match self { - Self::String(expr) => expr.range(), - Self::Bytes(expr) => expr.range(), - Self::FString(expr) => expr.range(), - } - } -} - -impl<'a> From<&AnyString<'a>> for AnyNodeRef<'a> { - fn from(value: &AnyString<'a>) -> Self { - match value { - AnyString::String(expr) => AnyNodeRef::ExprStringLiteral(expr), - AnyString::Bytes(expr) => AnyNodeRef::ExprBytesLiteral(expr), - AnyString::FString(expr) => AnyNodeRef::ExprFString(expr), - } - } -} - -impl<'a> From<&AnyString<'a>> for ExpressionRef<'a> { - fn from(value: &AnyString<'a>) -> Self { - match value { - AnyString::String(expr) => ExpressionRef::StringLiteral(expr), - AnyString::Bytes(expr) => ExpressionRef::BytesLiteral(expr), - AnyString::FString(expr) => ExpressionRef::FString(expr), - } - } -} - -#[derive(Clone, Debug)] -enum AnyStringPart<'a> { - String(&'a ast::StringLiteral), - Bytes(&'a ast::BytesLiteral), - FString(&'a ast::FString), -} - -impl<'a> From<&AnyStringPart<'a>> for AnyNodeRef<'a> { - fn from(value: &AnyStringPart<'a>) -> Self { - match value { - AnyStringPart::String(part) => AnyNodeRef::StringLiteral(part), - AnyStringPart::Bytes(part) => AnyNodeRef::BytesLiteral(part), - AnyStringPart::FString(part) => AnyNodeRef::FString(part), - } - } -} - -impl Ranged for AnyStringPart<'_> { - fn range(&self) -> TextRange { - match self { - Self::String(part) => part.range(), - Self::Bytes(part) => part.range(), - Self::FString(part) => part.range(), - } - } -} - -pub(super) struct FormatString<'a> { - string: &'a AnyString<'a>, - layout: StringLayout, -} - -#[derive(Default, Copy, Clone, Debug)] -pub enum StringLayout { - #[default] - Default, - DocString, - /// An implicit concatenated string in a binary like (e.g. `a + b` or `a < b`) expression. - /// - /// Formats the implicit concatenated string parts without the enclosing group because the group - /// is added by the binary like formatting. - ImplicitConcatenatedStringInBinaryLike, -} - -impl<'a> FormatString<'a> { - pub(super) fn new(string: &'a AnyString<'a>) -> Self { - Self { - string, - layout: StringLayout::Default, - } - } - - pub(super) fn with_layout(mut self, layout: StringLayout) -> Self { - self.layout = layout; - self - } -} - -impl<'a> Format> for FormatString<'a> { - fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { - let parent_docstring_quote_style = f.context().docstring(); - let locator = f.context().locator(); - let result = match self.layout { - StringLayout::Default => { - if self.string.is_implicit_concatenated() { - in_parentheses_only_group(&FormatStringContinuation::new(self.string)).fmt(f) - } else { - StringPart::from_source(self.string.range(), &locator) - .normalize( - self.string.quoting(&locator), - &locator, - f.options().quote_style(), - parent_docstring_quote_style, - ) - .fmt(f) - } - } - StringLayout::DocString => { - let string_part = StringPart::from_source(self.string.range(), &locator); - let normalized = string_part.normalize( - Quoting::CanChange, - &locator, - f.options().quote_style(), - parent_docstring_quote_style, - ); - format_docstring(&normalized, f) - } - StringLayout::ImplicitConcatenatedStringInBinaryLike => { - FormatStringContinuation::new(self.string).fmt(f) - } - }; - // TODO(dhruvmanila): With PEP 701, comments can be inside f-strings. - // This is to mark all of those comments as formatted but we need to - // figure out how to handle them. Note that this needs to be done only - // after the f-string is formatted, so only for all the non-formatted - // comments. - if let AnyString::FString(fstring) = self.string { - let comments = f.context().comments(); - fstring.value.elements().for_each(|value| { - comments.mark_verbatim_node_comments_formatted(value.into()); - }); - } - result - } -} - -struct FormatStringContinuation<'a> { - string: &'a AnyString<'a>, -} - -impl<'a> FormatStringContinuation<'a> { - fn new(string: &'a AnyString<'a>) -> Self { - Self { string } - } -} - -impl Format> for FormatStringContinuation<'_> { - fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { - let comments = f.context().comments().clone(); - let locator = f.context().locator(); - let in_docstring = f.context().docstring(); - let quote_style = f.options().quote_style(); - - let mut joiner = f.join_with(in_parentheses_only_soft_line_break_or_space()); - - for part in self.string.parts() { - let normalized = StringPart::from_source(part.range(), &locator).normalize( - self.string.quoting(&locator), - &locator, - quote_style, - in_docstring, - ); - - joiner.entry(&format_args![ - line_suffix_boundary(), - leading_comments(comments.leading(&part)), - normalized, - trailing_comments(comments.trailing(&part)) - ]); - } - - joiner.finish() - } -} - -#[derive(Debug)] -struct StringPart { - /// The prefix. - prefix: StringPrefix, - - /// The actual quotes of the string in the source - quotes: StringQuotes, - - /// The range of the string's content (full range minus quotes and prefix) - content_range: TextRange, -} - -impl StringPart { - fn from_source(range: TextRange, locator: &Locator) -> Self { - let string_content = locator.slice(range); - - let prefix = StringPrefix::parse(string_content); - let after_prefix = &string_content[usize::from(prefix.text_len())..]; - - let quotes = - StringQuotes::parse(after_prefix).expect("Didn't find string quotes after prefix"); - let relative_raw_content_range = TextRange::new( - prefix.text_len() + quotes.text_len(), - string_content.text_len() - quotes.text_len(), - ); - let raw_content_range = relative_raw_content_range + range.start(); - - Self { - prefix, - content_range: raw_content_range, - quotes, - } - } - - /// Computes the strings preferred quotes and normalizes its content. - /// - /// The parent docstring quote style should be set when formatting a code - /// snippet within the docstring. The quote style should correspond to the - /// style of quotes used by said docstring. Normalization will ensure the - /// quoting styles don't conflict. - fn normalize<'a>( - self, - quoting: Quoting, - locator: &'a Locator, - configured_style: QuoteStyle, - parent_docstring_quote_style: Option, - ) -> NormalizedString<'a> { - // Per PEP 8 and PEP 257, always prefer double quotes for docstrings - // and triple-quoted strings. (We assume docstrings are always - // triple-quoted.) - let preferred_style = if self.quotes.triple { - // ... unless we're formatting a code snippet inside a docstring, - // then we specifically want to invert our quote style to avoid - // writing out invalid Python. - // - // It's worth pointing out that we can actually wind up being - // somewhat out of sync with PEP8 in this case. Consider this - // example: - // - // def foo(): - // ''' - // Something. - // - // >>> """tricksy""" - // ''' - // pass - // - // Ideally, this would be reformatted as: - // - // def foo(): - // """ - // Something. - // - // >>> '''tricksy''' - // """ - // pass - // - // But the logic here results in the original quoting being - // preserved. This is because the quoting style of the outer - // docstring is determined, in part, by looking at its contents. In - // this case, it notices that it contains a `"""` and thus infers - // that using `'''` would overall read better because it avoids - // the need to escape the interior `"""`. Except... in this case, - // the `"""` is actually part of a code snippet that could get - // reformatted to using a different quoting style itself. - // - // Fixing this would, I believe, require some fairly seismic - // changes to how formatting strings works. Namely, we would need - // to look for code snippets before normalizing the docstring, and - // then figure out the quoting style more holistically by looking - // at the various kinds of quotes used in the code snippets and - // what reformatting them might look like. - // - // Overall this is a bit of a corner case and just inverting the - // style from what the parent ultimately decided upon works, even - // if it doesn't have perfect alignment with PEP8. - if let Some(style) = parent_docstring_quote_style { - style.invert() - } else { - QuoteStyle::Double - } - } else { - configured_style - }; - - let raw_content = locator.slice(self.content_range); - - let quotes = match quoting { - Quoting::Preserve => self.quotes, - Quoting::CanChange => { - if self.prefix.is_raw_string() { - choose_quotes_raw(raw_content, self.quotes, preferred_style) - } else { - choose_quotes(raw_content, self.quotes, preferred_style) - } - } - }; - - let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix); - - NormalizedString { - prefix: self.prefix, - content_range: self.content_range, - text: normalized, - quotes, - } - } -} - -#[derive(Debug)] -struct NormalizedString<'a> { - prefix: StringPrefix, - - /// The quotes of the normalized string (preferred quotes) - quotes: StringQuotes, - - /// The range of the string's content in the source (minus prefix and quotes). - content_range: TextRange, - - /// The normalized text - text: Cow<'a, str>, -} - -impl Ranged for NormalizedString<'_> { - fn range(&self) -> TextRange { - self.content_range - } -} - -impl Format> for NormalizedString<'_> { - fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { - write!(f, [self.prefix, self.quotes])?; - match &self.text { - Cow::Borrowed(_) => { - source_text_slice(self.range()).fmt(f)?; - } - Cow::Owned(normalized) => { - text(normalized, Some(self.start())).fmt(f)?; - } - } - self.quotes.fmt(f) - } -} - -bitflags! { - #[derive(Copy, Clone, Debug, PartialEq, Eq)] - pub(super) struct StringPrefix: u8 { - const UNICODE = 0b0000_0001; - /// `r"test"` - const RAW = 0b0000_0010; - /// `R"test" - const RAW_UPPER = 0b0000_0100; - const BYTE = 0b0000_1000; - const F_STRING = 0b0001_0000; - } -} - -impl StringPrefix { - pub(super) fn parse(input: &str) -> StringPrefix { - let chars = input.chars(); - let mut prefix = StringPrefix::empty(); - - for c in chars { - let flag = match c { - 'u' | 'U' => StringPrefix::UNICODE, - 'f' | 'F' => StringPrefix::F_STRING, - 'b' | 'B' => StringPrefix::BYTE, - 'r' => StringPrefix::RAW, - 'R' => StringPrefix::RAW_UPPER, - '\'' | '"' => break, - c => { - unreachable!( - "Unexpected character '{c}' terminating the prefix of a string literal" - ); - } - }; - - prefix |= flag; - } - - prefix - } - - pub(super) const fn text_len(self) -> TextSize { - TextSize::new(self.bits().count_ones()) - } - - pub(super) const fn is_raw_string(self) -> bool { - self.contains(StringPrefix::RAW) || self.contains(StringPrefix::RAW_UPPER) - } - - pub(super) const fn is_fstring(self) -> bool { - self.contains(StringPrefix::F_STRING) - } -} - -impl Format> for StringPrefix { - fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { - // Retain the casing for the raw prefix: - // https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#r-strings-and-r-strings - if self.contains(StringPrefix::RAW) { - token("r").fmt(f)?; - } else if self.contains(StringPrefix::RAW_UPPER) { - token("R").fmt(f)?; - } - - if self.contains(StringPrefix::BYTE) { - token("b").fmt(f)?; - } - - if self.contains(StringPrefix::F_STRING) { - token("f").fmt(f)?; - } - - // Remove the unicode prefix `u` if any because it is meaningless in Python 3+. - - Ok(()) - } -} - -/// Choose the appropriate quote style for a raw string. -/// -/// The preferred quote style is chosen unless the string contains unescaped quotes of the -/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote -/// style is double quotes. -fn choose_quotes_raw( - input: &str, - quotes: StringQuotes, - preferred_style: QuoteStyle, -) -> StringQuotes { - let preferred_quote_char = preferred_style.as_char(); - let mut chars = input.chars().peekable(); - let contains_unescaped_configured_quotes = loop { - match chars.next() { - Some('\\') => { - // Ignore escaped characters - chars.next(); - } - // `"` or `'` - Some(c) if c == preferred_quote_char => { - if !quotes.triple { - break true; - } - - match chars.peek() { - // We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser - // about where the closing triple quotes start - None => break true, - Some(next) if *next == preferred_quote_char => { - // `""` or `''` - chars.next(); - - // We can't turn `r'''""'''` into `r""""""""`, nor can we have - // `"""` or `'''` respectively inside the string - if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) { - break true; - } - } - _ => {} - } - } - Some(_) => continue, - None => break false, - } - }; - - StringQuotes { - triple: quotes.triple, - style: if contains_unescaped_configured_quotes { - quotes.style - } else { - preferred_style - }, - } -} -/// Choose the appropriate quote style for a string. -/// -/// For single quoted strings, the preferred quote style is used, unless the alternative quote style -/// would require fewer escapes. -/// -/// For triple quoted strings, the preferred quote style is always used, unless the string contains -/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be -/// used unless the string contains `"""`). -fn choose_quotes(input: &str, quotes: StringQuotes, preferred_style: QuoteStyle) -> StringQuotes { - let style = if quotes.triple { - // True if the string contains a triple quote sequence of the configured quote style. - let mut uses_triple_quotes = false; - let mut chars = input.chars().peekable(); +use crate::{prelude::*, FormatModuleError, QuoteStyle}; - while let Some(c) = chars.next() { - let preferred_quote_char = preferred_style.as_char(); - match c { - '\\' => { - if matches!(chars.peek(), Some('"' | '\\')) { - chars.next(); - } - } - // `"` or `'` - c if c == preferred_quote_char => { - match chars.peek().copied() { - Some(c) if c == preferred_quote_char => { - // `""` or `''` - chars.next(); - - match chars.peek().copied() { - Some(c) if c == preferred_quote_char => { - // `"""` or `'''` - chars.next(); - uses_triple_quotes = true; - break; - } - Some(_) => {} - None => { - // Handle `''' ""'''`. At this point we have consumed both - // double quotes, so on the next iteration the iterator is empty - // and we'd miss the string ending with a preferred quote - uses_triple_quotes = true; - break; - } - } - } - Some(_) => { - // A single quote char, this is ok - } - None => { - // Trailing quote at the end of the comment - uses_triple_quotes = true; - break; - } - } - } - _ => continue, - } - } - - if uses_triple_quotes { - // String contains a triple quote sequence of the configured quote style. - // Keep the existing quote style. - quotes.style - } else { - preferred_style - } - } else { - let mut single_quotes = 0u32; - let mut double_quotes = 0u32; - - for c in input.chars() { - match c { - '\'' => { - single_quotes += 1; - } - - '"' => { - double_quotes += 1; - } - - _ => continue, - } - } - - match preferred_style { - QuoteStyle::Single => { - if single_quotes > double_quotes { - QuoteStyle::Double - } else { - QuoteStyle::Single - } - } - QuoteStyle::Double => { - if double_quotes > single_quotes { - QuoteStyle::Single - } else { - QuoteStyle::Double - } - } - } - }; - - StringQuotes { - triple: quotes.triple, - style, - } -} - -#[derive(Copy, Clone, Debug)] -pub(super) struct StringQuotes { - triple: bool, - style: QuoteStyle, -} - -impl StringQuotes { - pub(super) fn parse(input: &str) -> Option { - let mut chars = input.chars(); - - let quote_char = chars.next()?; - let style = QuoteStyle::try_from(quote_char).ok()?; - - let triple = chars.next() == Some(quote_char) && chars.next() == Some(quote_char); - - Some(Self { triple, style }) - } - - pub(super) const fn is_triple(self) -> bool { - self.triple - } - - const fn text_len(self) -> TextSize { - if self.triple { - TextSize::new(3) - } else { - TextSize::new(1) - } - } -} - -impl Format> for StringQuotes { - fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { - let quotes = match (self.style, self.triple) { - (QuoteStyle::Single, false) => "'", - (QuoteStyle::Single, true) => "'''", - (QuoteStyle::Double, false) => "\"", - (QuoteStyle::Double, true) => "\"\"\"", - }; - - token(quotes).fmt(f) - } -} - -/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input` -/// with the provided [`StringQuotes`] style. -/// -/// Returns the normalized string and whether it contains new lines. -fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow { - // The normalized string if `input` is not yet normalized. - // `output` must remain empty if `input` is already normalized. - let mut output = String::new(); - // Tracks the last index of `input` that has been written to `output`. - // If `last_index` is `0` at the end, then the input is already normalized and can be returned as is. - let mut last_index = 0; - - let style = quotes.style; - let preferred_quote = style.as_char(); - let opposite_quote = style.invert().as_char(); - - let mut chars = input.char_indices().peekable(); - - let is_raw = prefix.is_raw_string(); - let is_fstring = prefix.is_fstring(); - let mut formatted_value_nesting = 0u32; - - while let Some((index, c)) = chars.next() { - if is_fstring && matches!(c, '{' | '}') { - if chars.peek().copied().is_some_and(|(_, next)| next == c) { - // Skip over the second character of the double braces - chars.next(); - } else if c == '{' { - formatted_value_nesting += 1; - } else { - // Safe to assume that `c == '}'` here because of the matched pattern above - formatted_value_nesting = formatted_value_nesting.saturating_sub(1); - } - continue; - } - if c == '\r' { - output.push_str(&input[last_index..index]); - - // Skip over the '\r' character, keep the `\n` - if chars.peek().copied().is_some_and(|(_, next)| next == '\n') { - chars.next(); - } - // Replace the `\r` with a `\n` - else { - output.push('\n'); - } - - last_index = index + '\r'.len_utf8(); - } else if !quotes.triple && !is_raw { - if c == '\\' { - if let Some((_, next)) = chars.peek().copied() { - #[allow(clippy::if_same_then_else)] - if next == opposite_quote && formatted_value_nesting == 0 { - // Remove the escape by ending before the backslash and starting again with the quote - chars.next(); - output.push_str(&input[last_index..index]); - last_index = index + '\\'.len_utf8(); - } else if next == preferred_quote { - // Quote is already escaped, skip over it. - chars.next(); - } else if next == '\\' { - // Skip over escaped backslashes - chars.next(); - } - } - } else if c == preferred_quote && formatted_value_nesting == 0 { - // Escape the quote - output.push_str(&input[last_index..index]); - output.push('\\'); - output.push(c); - last_index = index + preferred_quote.len_utf8(); - } - } - } - - let normalized = if last_index == 0 { - Cow::Borrowed(input) - } else { - output.push_str(&input[last_index..]); - Cow::Owned(output) - }; - - normalized -} - -/// For docstring indentation, black counts spaces as 1 and tabs by increasing the indentation up -/// to the next multiple of 8. This is effectively a port of -/// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs), -/// which black [calls with the default tab width of 8](https://github.com/psf/black/blob/c36e468794f9256d5e922c399240d49782ba04f1/src/black/strings.py#L61). -fn indentation_length(line: &str) -> TextSize { - let mut indentation = 0u32; - for char in line.chars() { - if char == '\t' { - // Pad to the next multiple of tab_width - indentation += 8 - (indentation.rem_euclid(8)); - } else if char.is_whitespace() { - indentation += u32::from(char.text_len()); - } else { - break; - } - } - TextSize::new(indentation) -} +use super::NormalizedString; /// Format a docstring by trimming whitespace and adjusting the indentation. /// @@ -913,7 +95,7 @@ fn indentation_length(line: &str) -> TextSize { /// line c /// """ /// ``` -fn format_docstring(normalized: &NormalizedString, f: &mut PyFormatter) -> FormatResult<()> { +pub(super) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> FormatResult<()> { let docstring = &normalized.text; // Black doesn't change the indentation of docstrings that contain an escaped newline @@ -1551,11 +733,31 @@ fn needs_chaperone_space(normalized: &NormalizedString, trim_end: &str) -> bool || trim_end.chars().rev().take_while(|c| *c == '\\').count() % 2 == 1 } +/// For docstring indentation, black counts spaces as 1 and tabs by increasing the indentation up +/// to the next multiple of 8. This is effectively a port of +/// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs), +/// which black [calls with the default tab width of 8](https://github.com/psf/black/blob/c36e468794f9256d5e922c399240d49782ba04f1/src/black/strings.py#L61). +fn indentation_length(line: &str) -> TextSize { + let mut indentation = 0u32; + for char in line.chars() { + if char == '\t' { + // Pad to the next multiple of tab_width + indentation += 8 - (indentation.rem_euclid(8)); + } else if char.is_whitespace() { + indentation += u32::from(char.text_len()); + } else { + break; + } + } + TextSize::new(indentation) +} + #[cfg(test)] mod tests { - use crate::expression::string::indentation_length; use ruff_text_size::TextSize; + use super::indentation_length; + #[test] fn test_indentation_like_black() { assert_eq!(indentation_length("\t \t \t"), TextSize::new(24)); diff --git a/crates/ruff_python_formatter/src/expression/string/mod.rs b/crates/ruff_python_formatter/src/expression/string/mod.rs new file mode 100644 index 0000000000000..5a7a6a1b4677b --- /dev/null +++ b/crates/ruff_python_formatter/src/expression/string/mod.rs @@ -0,0 +1,811 @@ +use std::borrow::Cow; + +use bitflags::bitflags; + +use ruff_formatter::{format_args, write}; +use ruff_python_ast::AnyNodeRef; +use ruff_python_ast::{ + self as ast, ExprBytesLiteral, ExprFString, ExprStringLiteral, ExpressionRef, +}; +use ruff_source_file::Locator; +use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; + +use crate::comments::{leading_comments, trailing_comments}; +use crate::expression::parentheses::{ + in_parentheses_only_group, in_parentheses_only_soft_line_break_or_space, +}; +use crate::expression::Expr; +use crate::prelude::*; +use crate::QuoteStyle; + +mod docstring; + +#[derive(Copy, Clone, Debug)] +enum Quoting { + CanChange, + Preserve, +} + +#[derive(Clone, Debug)] +pub(super) enum AnyString<'a> { + String(&'a ExprStringLiteral), + Bytes(&'a ExprBytesLiteral), + FString(&'a ExprFString), +} + +impl<'a> AnyString<'a> { + pub(crate) fn from_expression(expression: &'a Expr) -> Option> { + match expression { + Expr::StringLiteral(string) => Some(AnyString::String(string)), + Expr::BytesLiteral(bytes) => Some(AnyString::Bytes(bytes)), + Expr::FString(fstring) => Some(AnyString::FString(fstring)), + _ => None, + } + } + + fn quoting(&self, locator: &Locator) -> Quoting { + match self { + Self::String(_) | Self::Bytes(_) => Quoting::CanChange, + Self::FString(f_string) => { + let unprefixed = locator + .slice(f_string.range) + .trim_start_matches(|c| c != '"' && c != '\''); + let triple_quoted = + unprefixed.starts_with(r#"""""#) || unprefixed.starts_with(r"'''"); + if f_string.value.elements().any(|value| match value { + Expr::FormattedValue(ast::ExprFormattedValue { range, .. }) => { + let string_content = locator.slice(*range); + if triple_quoted { + string_content.contains(r#"""""#) || string_content.contains("'''") + } else { + string_content.contains(['"', '\'']) + } + } + _ => false, + }) { + Quoting::Preserve + } else { + Quoting::CanChange + } + } + } + } + + /// Returns `true` if the string is implicitly concatenated. + pub(super) fn is_implicit_concatenated(&self) -> bool { + match self { + Self::String(ExprStringLiteral { value, .. }) => value.is_implicit_concatenated(), + Self::Bytes(ExprBytesLiteral { value, .. }) => value.is_implicit_concatenated(), + Self::FString(ExprFString { value, .. }) => value.is_implicit_concatenated(), + } + } + + fn parts(&self) -> Vec> { + match self { + Self::String(ExprStringLiteral { value, .. }) => { + value.parts().map(AnyStringPart::String).collect() + } + Self::Bytes(ExprBytesLiteral { value, .. }) => { + value.parts().map(AnyStringPart::Bytes).collect() + } + Self::FString(ExprFString { value, .. }) => value + .parts() + .map(|f_string_part| match f_string_part { + ast::FStringPart::Literal(string_literal) => { + AnyStringPart::String(string_literal) + } + ast::FStringPart::FString(f_string) => AnyStringPart::FString(f_string), + }) + .collect(), + } + } +} + +impl Ranged for AnyString<'_> { + fn range(&self) -> TextRange { + match self { + Self::String(expr) => expr.range(), + Self::Bytes(expr) => expr.range(), + Self::FString(expr) => expr.range(), + } + } +} + +impl<'a> From<&AnyString<'a>> for AnyNodeRef<'a> { + fn from(value: &AnyString<'a>) -> Self { + match value { + AnyString::String(expr) => AnyNodeRef::ExprStringLiteral(expr), + AnyString::Bytes(expr) => AnyNodeRef::ExprBytesLiteral(expr), + AnyString::FString(expr) => AnyNodeRef::ExprFString(expr), + } + } +} + +impl<'a> From<&AnyString<'a>> for ExpressionRef<'a> { + fn from(value: &AnyString<'a>) -> Self { + match value { + AnyString::String(expr) => ExpressionRef::StringLiteral(expr), + AnyString::Bytes(expr) => ExpressionRef::BytesLiteral(expr), + AnyString::FString(expr) => ExpressionRef::FString(expr), + } + } +} + +#[derive(Clone, Debug)] +enum AnyStringPart<'a> { + String(&'a ast::StringLiteral), + Bytes(&'a ast::BytesLiteral), + FString(&'a ast::FString), +} + +impl<'a> From<&AnyStringPart<'a>> for AnyNodeRef<'a> { + fn from(value: &AnyStringPart<'a>) -> Self { + match value { + AnyStringPart::String(part) => AnyNodeRef::StringLiteral(part), + AnyStringPart::Bytes(part) => AnyNodeRef::BytesLiteral(part), + AnyStringPart::FString(part) => AnyNodeRef::FString(part), + } + } +} + +impl Ranged for AnyStringPart<'_> { + fn range(&self) -> TextRange { + match self { + Self::String(part) => part.range(), + Self::Bytes(part) => part.range(), + Self::FString(part) => part.range(), + } + } +} + +pub(super) struct FormatString<'a> { + string: &'a AnyString<'a>, + layout: StringLayout, +} + +#[derive(Default, Copy, Clone, Debug)] +pub enum StringLayout { + #[default] + Default, + DocString, + /// An implicit concatenated string in a binary like (e.g. `a + b` or `a < b`) expression. + /// + /// Formats the implicit concatenated string parts without the enclosing group because the group + /// is added by the binary like formatting. + ImplicitConcatenatedStringInBinaryLike, +} + +impl<'a> FormatString<'a> { + pub(super) fn new(string: &'a AnyString<'a>) -> Self { + Self { + string, + layout: StringLayout::Default, + } + } + + pub(super) fn with_layout(mut self, layout: StringLayout) -> Self { + self.layout = layout; + self + } +} + +impl<'a> Format> for FormatString<'a> { + fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + let parent_docstring_quote_style = f.context().docstring(); + let locator = f.context().locator(); + let result = match self.layout { + StringLayout::Default => { + if self.string.is_implicit_concatenated() { + in_parentheses_only_group(&FormatStringContinuation::new(self.string)).fmt(f) + } else { + StringPart::from_source(self.string.range(), &locator) + .normalize( + self.string.quoting(&locator), + &locator, + f.options().quote_style(), + parent_docstring_quote_style, + ) + .fmt(f) + } + } + StringLayout::DocString => { + let string_part = StringPart::from_source(self.string.range(), &locator); + let normalized = string_part.normalize( + Quoting::CanChange, + &locator, + f.options().quote_style(), + parent_docstring_quote_style, + ); + docstring::format(&normalized, f) + } + StringLayout::ImplicitConcatenatedStringInBinaryLike => { + FormatStringContinuation::new(self.string).fmt(f) + } + }; + // TODO(dhruvmanila): With PEP 701, comments can be inside f-strings. + // This is to mark all of those comments as formatted but we need to + // figure out how to handle them. Note that this needs to be done only + // after the f-string is formatted, so only for all the non-formatted + // comments. + if let AnyString::FString(fstring) = self.string { + let comments = f.context().comments(); + fstring.value.elements().for_each(|value| { + comments.mark_verbatim_node_comments_formatted(value.into()); + }); + } + result + } +} + +struct FormatStringContinuation<'a> { + string: &'a AnyString<'a>, +} + +impl<'a> FormatStringContinuation<'a> { + fn new(string: &'a AnyString<'a>) -> Self { + Self { string } + } +} + +impl Format> for FormatStringContinuation<'_> { + fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + let comments = f.context().comments().clone(); + let locator = f.context().locator(); + let in_docstring = f.context().docstring(); + let quote_style = f.options().quote_style(); + + let mut joiner = f.join_with(in_parentheses_only_soft_line_break_or_space()); + + for part in self.string.parts() { + let normalized = StringPart::from_source(part.range(), &locator).normalize( + self.string.quoting(&locator), + &locator, + quote_style, + in_docstring, + ); + + joiner.entry(&format_args![ + line_suffix_boundary(), + leading_comments(comments.leading(&part)), + normalized, + trailing_comments(comments.trailing(&part)) + ]); + } + + joiner.finish() + } +} + +#[derive(Debug)] +struct StringPart { + /// The prefix. + prefix: StringPrefix, + + /// The actual quotes of the string in the source + quotes: StringQuotes, + + /// The range of the string's content (full range minus quotes and prefix) + content_range: TextRange, +} + +impl StringPart { + fn from_source(range: TextRange, locator: &Locator) -> Self { + let string_content = locator.slice(range); + + let prefix = StringPrefix::parse(string_content); + let after_prefix = &string_content[usize::from(prefix.text_len())..]; + + let quotes = + StringQuotes::parse(after_prefix).expect("Didn't find string quotes after prefix"); + let relative_raw_content_range = TextRange::new( + prefix.text_len() + quotes.text_len(), + string_content.text_len() - quotes.text_len(), + ); + let raw_content_range = relative_raw_content_range + range.start(); + + Self { + prefix, + content_range: raw_content_range, + quotes, + } + } + + /// Computes the strings preferred quotes and normalizes its content. + /// + /// The parent docstring quote style should be set when formatting a code + /// snippet within the docstring. The quote style should correspond to the + /// style of quotes used by said docstring. Normalization will ensure the + /// quoting styles don't conflict. + fn normalize<'a>( + self, + quoting: Quoting, + locator: &'a Locator, + configured_style: QuoteStyle, + parent_docstring_quote_style: Option, + ) -> NormalizedString<'a> { + // Per PEP 8 and PEP 257, always prefer double quotes for docstrings + // and triple-quoted strings. (We assume docstrings are always + // triple-quoted.) + let preferred_style = if self.quotes.triple { + // ... unless we're formatting a code snippet inside a docstring, + // then we specifically want to invert our quote style to avoid + // writing out invalid Python. + // + // It's worth pointing out that we can actually wind up being + // somewhat out of sync with PEP8 in this case. Consider this + // example: + // + // def foo(): + // ''' + // Something. + // + // >>> """tricksy""" + // ''' + // pass + // + // Ideally, this would be reformatted as: + // + // def foo(): + // """ + // Something. + // + // >>> '''tricksy''' + // """ + // pass + // + // But the logic here results in the original quoting being + // preserved. This is because the quoting style of the outer + // docstring is determined, in part, by looking at its contents. In + // this case, it notices that it contains a `"""` and thus infers + // that using `'''` would overall read better because it avoids + // the need to escape the interior `"""`. Except... in this case, + // the `"""` is actually part of a code snippet that could get + // reformatted to using a different quoting style itself. + // + // Fixing this would, I believe, require some fairly seismic + // changes to how formatting strings works. Namely, we would need + // to look for code snippets before normalizing the docstring, and + // then figure out the quoting style more holistically by looking + // at the various kinds of quotes used in the code snippets and + // what reformatting them might look like. + // + // Overall this is a bit of a corner case and just inverting the + // style from what the parent ultimately decided upon works, even + // if it doesn't have perfect alignment with PEP8. + if let Some(style) = parent_docstring_quote_style { + style.invert() + } else { + QuoteStyle::Double + } + } else { + configured_style + }; + + let raw_content = locator.slice(self.content_range); + + let quotes = match quoting { + Quoting::Preserve => self.quotes, + Quoting::CanChange => { + if self.prefix.is_raw_string() { + choose_quotes_raw(raw_content, self.quotes, preferred_style) + } else { + choose_quotes(raw_content, self.quotes, preferred_style) + } + } + }; + + let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix); + + NormalizedString { + prefix: self.prefix, + content_range: self.content_range, + text: normalized, + quotes, + } + } +} + +#[derive(Debug)] +struct NormalizedString<'a> { + prefix: StringPrefix, + + /// The quotes of the normalized string (preferred quotes) + quotes: StringQuotes, + + /// The range of the string's content in the source (minus prefix and quotes). + content_range: TextRange, + + /// The normalized text + text: Cow<'a, str>, +} + +impl Ranged for NormalizedString<'_> { + fn range(&self) -> TextRange { + self.content_range + } +} + +impl Format> for NormalizedString<'_> { + fn fmt(&self, f: &mut Formatter>) -> FormatResult<()> { + write!(f, [self.prefix, self.quotes])?; + match &self.text { + Cow::Borrowed(_) => { + source_text_slice(self.range()).fmt(f)?; + } + Cow::Owned(normalized) => { + text(normalized, Some(self.start())).fmt(f)?; + } + } + self.quotes.fmt(f) + } +} + +bitflags! { + #[derive(Copy, Clone, Debug, PartialEq, Eq)] + pub(super) struct StringPrefix: u8 { + const UNICODE = 0b0000_0001; + /// `r"test"` + const RAW = 0b0000_0010; + /// `R"test" + const RAW_UPPER = 0b0000_0100; + const BYTE = 0b0000_1000; + const F_STRING = 0b0001_0000; + } +} + +impl StringPrefix { + pub(super) fn parse(input: &str) -> StringPrefix { + let chars = input.chars(); + let mut prefix = StringPrefix::empty(); + + for c in chars { + let flag = match c { + 'u' | 'U' => StringPrefix::UNICODE, + 'f' | 'F' => StringPrefix::F_STRING, + 'b' | 'B' => StringPrefix::BYTE, + 'r' => StringPrefix::RAW, + 'R' => StringPrefix::RAW_UPPER, + '\'' | '"' => break, + c => { + unreachable!( + "Unexpected character '{c}' terminating the prefix of a string literal" + ); + } + }; + + prefix |= flag; + } + + prefix + } + + pub(super) const fn text_len(self) -> TextSize { + TextSize::new(self.bits().count_ones()) + } + + pub(super) const fn is_raw_string(self) -> bool { + self.contains(StringPrefix::RAW) || self.contains(StringPrefix::RAW_UPPER) + } + + pub(super) const fn is_fstring(self) -> bool { + self.contains(StringPrefix::F_STRING) + } +} + +impl Format> for StringPrefix { + fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + // Retain the casing for the raw prefix: + // https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#r-strings-and-r-strings + if self.contains(StringPrefix::RAW) { + token("r").fmt(f)?; + } else if self.contains(StringPrefix::RAW_UPPER) { + token("R").fmt(f)?; + } + + if self.contains(StringPrefix::BYTE) { + token("b").fmt(f)?; + } + + if self.contains(StringPrefix::F_STRING) { + token("f").fmt(f)?; + } + + // Remove the unicode prefix `u` if any because it is meaningless in Python 3+. + + Ok(()) + } +} + +/// Choose the appropriate quote style for a raw string. +/// +/// The preferred quote style is chosen unless the string contains unescaped quotes of the +/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote +/// style is double quotes. +fn choose_quotes_raw( + input: &str, + quotes: StringQuotes, + preferred_style: QuoteStyle, +) -> StringQuotes { + let preferred_quote_char = preferred_style.as_char(); + let mut chars = input.chars().peekable(); + let contains_unescaped_configured_quotes = loop { + match chars.next() { + Some('\\') => { + // Ignore escaped characters + chars.next(); + } + // `"` or `'` + Some(c) if c == preferred_quote_char => { + if !quotes.triple { + break true; + } + + match chars.peek() { + // We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser + // about where the closing triple quotes start + None => break true, + Some(next) if *next == preferred_quote_char => { + // `""` or `''` + chars.next(); + + // We can't turn `r'''""'''` into `r""""""""`, nor can we have + // `"""` or `'''` respectively inside the string + if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) { + break true; + } + } + _ => {} + } + } + Some(_) => continue, + None => break false, + } + }; + + StringQuotes { + triple: quotes.triple, + style: if contains_unescaped_configured_quotes { + quotes.style + } else { + preferred_style + }, + } +} + +/// Choose the appropriate quote style for a string. +/// +/// For single quoted strings, the preferred quote style is used, unless the alternative quote style +/// would require fewer escapes. +/// +/// For triple quoted strings, the preferred quote style is always used, unless the string contains +/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be +/// used unless the string contains `"""`). +fn choose_quotes(input: &str, quotes: StringQuotes, preferred_style: QuoteStyle) -> StringQuotes { + let style = if quotes.triple { + // True if the string contains a triple quote sequence of the configured quote style. + let mut uses_triple_quotes = false; + let mut chars = input.chars().peekable(); + + while let Some(c) = chars.next() { + let preferred_quote_char = preferred_style.as_char(); + match c { + '\\' => { + if matches!(chars.peek(), Some('"' | '\\')) { + chars.next(); + } + } + // `"` or `'` + c if c == preferred_quote_char => { + match chars.peek().copied() { + Some(c) if c == preferred_quote_char => { + // `""` or `''` + chars.next(); + + match chars.peek().copied() { + Some(c) if c == preferred_quote_char => { + // `"""` or `'''` + chars.next(); + uses_triple_quotes = true; + break; + } + Some(_) => {} + None => { + // Handle `''' ""'''`. At this point we have consumed both + // double quotes, so on the next iteration the iterator is empty + // and we'd miss the string ending with a preferred quote + uses_triple_quotes = true; + break; + } + } + } + Some(_) => { + // A single quote char, this is ok + } + None => { + // Trailing quote at the end of the comment + uses_triple_quotes = true; + break; + } + } + } + _ => continue, + } + } + + if uses_triple_quotes { + // String contains a triple quote sequence of the configured quote style. + // Keep the existing quote style. + quotes.style + } else { + preferred_style + } + } else { + let mut single_quotes = 0u32; + let mut double_quotes = 0u32; + + for c in input.chars() { + match c { + '\'' => { + single_quotes += 1; + } + + '"' => { + double_quotes += 1; + } + + _ => continue, + } + } + + match preferred_style { + QuoteStyle::Single => { + if single_quotes > double_quotes { + QuoteStyle::Double + } else { + QuoteStyle::Single + } + } + QuoteStyle::Double => { + if double_quotes > single_quotes { + QuoteStyle::Single + } else { + QuoteStyle::Double + } + } + } + }; + + StringQuotes { + triple: quotes.triple, + style, + } +} + +#[derive(Copy, Clone, Debug)] +pub(super) struct StringQuotes { + triple: bool, + style: QuoteStyle, +} + +impl StringQuotes { + pub(super) fn parse(input: &str) -> Option { + let mut chars = input.chars(); + + let quote_char = chars.next()?; + let style = QuoteStyle::try_from(quote_char).ok()?; + + let triple = chars.next() == Some(quote_char) && chars.next() == Some(quote_char); + + Some(Self { triple, style }) + } + + pub(super) const fn is_triple(self) -> bool { + self.triple + } + + const fn text_len(self) -> TextSize { + if self.triple { + TextSize::new(3) + } else { + TextSize::new(1) + } + } +} + +impl Format> for StringQuotes { + fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> { + let quotes = match (self.style, self.triple) { + (QuoteStyle::Single, false) => "'", + (QuoteStyle::Single, true) => "'''", + (QuoteStyle::Double, false) => "\"", + (QuoteStyle::Double, true) => "\"\"\"", + }; + + token(quotes).fmt(f) + } +} + +/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input` +/// with the provided [`StringQuotes`] style. +/// +/// Returns the normalized string and whether it contains new lines. +fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow { + // The normalized string if `input` is not yet normalized. + // `output` must remain empty if `input` is already normalized. + let mut output = String::new(); + // Tracks the last index of `input` that has been written to `output`. + // If `last_index` is `0` at the end, then the input is already normalized and can be returned as is. + let mut last_index = 0; + + let style = quotes.style; + let preferred_quote = style.as_char(); + let opposite_quote = style.invert().as_char(); + + let mut chars = input.char_indices().peekable(); + + let is_raw = prefix.is_raw_string(); + let is_fstring = prefix.is_fstring(); + let mut formatted_value_nesting = 0u32; + + while let Some((index, c)) = chars.next() { + if is_fstring && matches!(c, '{' | '}') { + if chars.peek().copied().is_some_and(|(_, next)| next == c) { + // Skip over the second character of the double braces + chars.next(); + } else if c == '{' { + formatted_value_nesting += 1; + } else { + // Safe to assume that `c == '}'` here because of the matched pattern above + formatted_value_nesting = formatted_value_nesting.saturating_sub(1); + } + continue; + } + if c == '\r' { + output.push_str(&input[last_index..index]); + + // Skip over the '\r' character, keep the `\n` + if chars.peek().copied().is_some_and(|(_, next)| next == '\n') { + chars.next(); + } + // Replace the `\r` with a `\n` + else { + output.push('\n'); + } + + last_index = index + '\r'.len_utf8(); + } else if !quotes.triple && !is_raw { + if c == '\\' { + if let Some((_, next)) = chars.peek().copied() { + #[allow(clippy::if_same_then_else)] + if next == opposite_quote && formatted_value_nesting == 0 { + // Remove the escape by ending before the backslash and starting again with the quote + chars.next(); + output.push_str(&input[last_index..index]); + last_index = index + '\\'.len_utf8(); + } else if next == preferred_quote { + // Quote is already escaped, skip over it. + chars.next(); + } else if next == '\\' { + // Skip over escaped backslashes + chars.next(); + } + } + } else if c == preferred_quote && formatted_value_nesting == 0 { + // Escape the quote + output.push_str(&input[last_index..index]); + output.push('\\'); + output.push(c); + last_index = index + preferred_quote.len_utf8(); + } + } + } + + let normalized = if last_index == 0 { + Cow::Borrowed(input) + } else { + output.push_str(&input[last_index..]); + Cow::Owned(output) + }; + + normalized +}