Skip to content

Commit

Permalink
Perf: Skip string normalization when possible (astral-sh#10116)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaReiser authored Feb 26, 2024
1 parent 15b87ea commit 8dc22d5
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 88 deletions.
8 changes: 4 additions & 4 deletions crates/ruff_python_formatter/src/other/f_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,16 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
return result;
}

let quotes = normalizer.choose_quotes(&string, &locator);
let quote_selection = normalizer.choose_quotes(&string, &locator);

let context = FStringContext::new(
string.prefix(),
quotes,
quote_selection.quotes(),
FStringLayout::from_f_string(self.value, &locator),
);

// Starting prefix and quote
write!(f, [string.prefix(), quotes])?;
write!(f, [string.prefix(), quote_selection.quotes()])?;

f.join()
.entries(
Expand All @@ -80,7 +80,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
.finish()?;

// Ending quote
quotes.fmt(f)
quote_selection.quotes().fmt(f)
}
}

Expand Down
1 change: 1 addition & 0 deletions crates/ruff_python_formatter/src/other/f_string_element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ impl Format<PyFormatContext<'_>> for FormatFStringLiteralElement<'_> {
let literal_content = f.context().locator().slice(self.element.range());
let normalized = normalize_string(
literal_content,
0,
self.context.quotes(),
self.context.prefix(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
Expand Down
249 changes: 165 additions & 84 deletions crates/ruff_python_formatter/src/string/normalize.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::iter::FusedIterator;

use ruff_formatter::FormatContext;
use ruff_source_file::Locator;
Expand Down Expand Up @@ -44,68 +45,8 @@ impl StringNormalizer {
self
}

/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string.quotes().triple {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
//
// It's worth pointing out that we can actually wind up being
// somewhat out of sync with PEP8 in this case. Consider this
// example:
//
// def foo():
// '''
// Something.
//
// >>> """tricksy"""
// '''
// pass
//
// Ideally, this would be reformatted as:
//
// def foo():
// """
// Something.
//
// >>> '''tricksy'''
// """
// pass
//
// But the logic here results in the original quoting being
// preserved. This is because the quoting style of the outer
// docstring is determined, in part, by looking at its contents. In
// this case, it notices that it contains a `"""` and thus infers
// that using `'''` would overall read better because it avoids
// the need to escape the interior `"""`. Except... in this case,
// the `"""` is actually part of a code snippet that could get
// reformatted to using a different quoting style itself.
//
// Fixing this would, I believe, require some fairly seismic
// changes to how formatting strings works. Namely, we would need
// to look for code snippets before normalizing the docstring, and
// then figure out the quoting style more holistically by looking
// at the various kinds of quotes used in the code snippets and
// what reformatting them might look like.
//
// Overall this is a bit of a corner case and just inverting the
// style from what the parent ultimately decided upon works, even
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.parent_docstring_quote_char {
QuoteStyle::from(quote.invert())
} else if self.preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};

let quoting = if let FStringState::InsideExpressionElement(context) = self.f_string_state {
fn quoting(&self, string: &StringPart) -> Quoting {
if let FStringState::InsideExpressionElement(context) = self.f_string_state {
// If we're inside an f-string, we need to make sure to preserve the
// existing quotes unless we're inside a triple-quoted f-string and
// the inner string itself isn't triple-quoted. For example:
Expand All @@ -129,22 +70,110 @@ impl StringNormalizer {
}
} else {
self.quoting
};
}
}

/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> QuoteSelection {
let raw_content = locator.slice(string.content_range());
let first_quote_or_normalized_char_offset = raw_content
.bytes()
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));

match quoting {
let quotes = match self.quoting(string) {
Quoting::Preserve => string.quotes(),
Quoting::CanChange => {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string.quotes().triple {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
//
// It's worth pointing out that we can actually wind up being
// somewhat out of sync with PEP8 in this case. Consider this
// example:
//
// def foo():
// '''
// Something.
//
// >>> """tricksy"""
// '''
// pass
//
// Ideally, this would be reformatted as:
//
// def foo():
// """
// Something.
//
// >>> '''tricksy'''
// """
// pass
//
// But the logic here results in the original quoting being
// preserved. This is because the quoting style of the outer
// docstring is determined, in part, by looking at its contents. In
// this case, it notices that it contains a `"""` and thus infers
// that using `'''` would overall read better because it avoids
// the need to escape the interior `"""`. Except... in this case,
// the `"""` is actually part of a code snippet that could get
// reformatted to using a different quoting style itself.
//
// Fixing this would, I believe, require some fairly seismic
// changes to how formatting strings works. Namely, we would need
// to look for code snippets before normalizing the docstring, and
// then figure out the quoting style more holistically by looking
// at the various kinds of quotes used in the code snippets and
// what reformatting them might look like.
//
// Overall this is a bit of a corner case and just inverting the
// style from what the parent ultimately decided upon works, even
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.parent_docstring_quote_char {
QuoteStyle::from(quote.invert())
} else if self.preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};

if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
let raw_content = locator.slice(string.content_range());
if string.prefix().is_raw_string() {
choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote)
if let Some(first_quote_or_normalized_char_offset) =
first_quote_or_normalized_char_offset
{
if string.prefix().is_raw_string() {
choose_quotes_for_raw_string(
&raw_content[first_quote_or_normalized_char_offset..],
string.quotes(),
preferred_quote,
)
} else {
choose_quotes_impl(
&raw_content[first_quote_or_normalized_char_offset..],
string.quotes(),
preferred_quote,
)
}
} else {
choose_quotes_impl(raw_content, string.quotes(), preferred_quote)
StringQuotes {
quote_char: preferred_quote,
triple: string.quotes().is_triple(),
}
}
} else {
string.quotes()
}
}
};

QuoteSelection {
quotes,
first_quote_or_normalized_char_offset,
}
}

Expand All @@ -156,25 +185,48 @@ impl StringNormalizer {
) -> NormalizedString<'a> {
let raw_content = locator.slice(string.content_range());

let quotes = self.choose_quotes(string, locator);

let normalized = normalize_string(
raw_content,
quotes,
string.prefix(),
self.normalize_hex,
self.format_fstring,
);
let quote_selection = self.choose_quotes(string, locator);

let normalized = if let Some(first_quote_or_escape_offset) =
quote_selection.first_quote_or_normalized_char_offset
{
normalize_string(
raw_content,
first_quote_or_escape_offset,
quote_selection.quotes,
string.prefix(),
self.normalize_hex,
// TODO: Remove the `b'{'` in `choose_quotes` when promoting the
// `format_fstring` preview style
self.format_fstring,
)
} else {
Cow::Borrowed(raw_content)
};

NormalizedString {
prefix: string.prefix(),
content_range: string.content_range(),
text: normalized,
quotes,
quotes: quote_selection.quotes,
}
}
}

#[derive(Debug)]
pub(crate) struct QuoteSelection {
quotes: StringQuotes,

/// Offset to the first quote character or character that needs special handling in [`normalize_string`].
first_quote_or_normalized_char_offset: Option<usize>,
}

impl QuoteSelection {
pub(crate) fn quotes(&self) -> StringQuotes {
self.quotes
}
}

#[derive(Debug)]
pub(crate) struct NormalizedString<'a> {
prefix: crate::string::StringPrefix,
Expand Down Expand Up @@ -399,6 +451,7 @@ fn choose_quotes_impl(
/// Returns the normalized string and whether it contains new lines.
pub(crate) fn normalize_string(
input: &str,
start_offset: usize,
quotes: StringQuotes,
prefix: StringPrefix,
normalize_hex: bool,
Expand All @@ -415,7 +468,7 @@ pub(crate) fn normalize_string(
let preferred_quote = quote.as_char();
let opposite_quote = quote.invert().as_char();

let mut chars = input.char_indices().peekable();
let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable();

let is_raw = prefix.is_raw_string();
let is_fstring = !format_fstring && prefix.is_fstring();
Expand Down Expand Up @@ -454,13 +507,11 @@ pub(crate) fn normalize_string(
// Skip over escaped backslashes
chars.next();
} else if normalize_hex {
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
.and_then(|escape| {
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
})
.and_then(|escape| escape.normalize(&input[index + escape_start_len..]))
{
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised {
output.push_str(&input[last_index..escape_start_offset]);
Expand Down Expand Up @@ -510,6 +561,35 @@ pub(crate) fn normalize_string(
normalized
}

#[derive(Clone, Debug)]
struct CharIndicesWithOffset<'str> {
chars: std::str::Chars<'str>,
next_offset: usize,
}

impl<'str> CharIndicesWithOffset<'str> {
fn new(input: &'str str, start_offset: usize) -> Self {
Self {
chars: input[start_offset..].chars(),
next_offset: start_offset,
}
}
}

impl<'str> Iterator for CharIndicesWithOffset<'str> {
type Item = (usize, char);

fn next(&mut self) -> Option<Self::Item> {
self.chars.next().map(|c| {
let index = self.next_offset;
self.next_offset += c.len_utf8();
(index, c)
})
}
}

impl FusedIterator for CharIndicesWithOffset<'_> {}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
Expand Down Expand Up @@ -651,6 +731,7 @@ mod tests {

let normalized = normalize_string(
input,
0,
StringQuotes {
triple: false,
quote_char: QuoteChar::Double,
Expand Down

0 comments on commit 8dc22d5

Please sign in to comment.