Skip to content

Commit

Permalink
Perf: Skip string normalization when possible
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaReiser committed Feb 25, 2024
1 parent 51ce88b commit 1c7e244
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 77 deletions.
8 changes: 4 additions & 4 deletions crates/ruff_python_formatter/src/other/f_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,16 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
return result;
}

let quotes = normalizer.choose_quotes(&string, &locator);
let quote_selection = normalizer.choose_quotes(&string, &locator);

let context = FStringContext::new(
string.prefix(),
quotes,
quote_selection.quotes(),
FStringLayout::from_f_string(self.value, &locator),
);

// Starting prefix and quote
write!(f, [string.prefix(), quotes])?;
write!(f, [string.prefix(), quote_selection.quotes()])?;

f.join()
.entries(
Expand All @@ -80,7 +80,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
.finish()?;

// Ending quote
quotes.fmt(f)
quote_selection.quotes().fmt(f)
}
}

Expand Down
1 change: 1 addition & 0 deletions crates/ruff_python_formatter/src/other/f_string_element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ impl Format<PyFormatContext<'_>> for FormatFStringLiteralElement<'_> {
let literal_content = f.context().locator().slice(self.element.range());
let normalized = normalize_string(
literal_content,
0,
self.context.quotes(),
self.context.prefix(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
Expand Down
234 changes: 161 additions & 73 deletions crates/ruff_python_formatter/src/string/normalize.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::iter::FusedIterator;

use ruff_formatter::FormatContext;
use ruff_source_file::Locator;
Expand Down Expand Up @@ -42,68 +43,8 @@ impl StringNormalizer {
self
}

/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string.quotes().triple {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
//
// It's worth pointing out that we can actually wind up being
// somewhat out of sync with PEP8 in this case. Consider this
// example:
//
// def foo():
// '''
// Something.
//
// >>> """tricksy"""
// '''
// pass
//
// Ideally, this would be reformatted as:
//
// def foo():
// """
// Something.
//
// >>> '''tricksy'''
// """
// pass
//
// But the logic here results in the original quoting being
// preserved. This is because the quoting style of the outer
// docstring is determined, in part, by looking at its contents. In
// this case, it notices that it contains a `"""` and thus infers
// that using `'''` would overall read better because it avoids
// the need to escape the interior `"""`. Except... in this case,
// the `"""` is actually part of a code snippet that could get
// reformatted to using a different quoting style itself.
//
// Fixing this would, I believe, require some fairly seismic
// changes to how formatting strings works. Namely, we would need
// to look for code snippets before normalizing the docstring, and
// then figure out the quoting style more holistically by looking
// at the various kinds of quotes used in the code snippets and
// what reformatting them might look like.
//
// Overall this is a bit of a corner case and just inverting the
// style from what the parent ultimately decided upon works, even
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.parent_docstring_quote_char {
QuoteStyle::from(quote.invert())
} else if self.preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};

let quoting = if let FStringState::InsideExpressionElement(context) = self.f_string_state {
fn quoting(&self, string: &StringPart) -> Quoting {
if let FStringState::InsideExpressionElement(context) = self.f_string_state {
// If we're inside an f-string, we need to make sure to preserve the
// existing quotes unless we're inside a triple-quoted f-string and
// the inner string itself isn't triple-quoted. For example:
Expand All @@ -127,22 +68,110 @@ impl StringNormalizer {
}
} else {
self.quoting
};
}
}

match quoting {
/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> QuoteSelection {
let raw_content = locator.slice(string.content_range());
let first_quote_or_normalized_char_offset = raw_content
.bytes()
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));

let quotes = match self.quoting(string) {
Quoting::Preserve => string.quotes(),
Quoting::CanChange => {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string.quotes().triple {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
//
// It's worth pointing out that we can actually wind up being
// somewhat out of sync with PEP8 in this case. Consider this
// example:
//
// def foo():
// '''
// Something.
//
// >>> """tricksy"""
// '''
// pass
//
// Ideally, this would be reformatted as:
//
// def foo():
// """
// Something.
//
// >>> '''tricksy'''
// """
// pass
//
// But the logic here results in the original quoting being
// preserved. This is because the quoting style of the outer
// docstring is determined, in part, by looking at its contents. In
// this case, it notices that it contains a `"""` and thus infers
// that using `'''` would overall read better because it avoids
// the need to escape the interior `"""`. Except... in this case,
// the `"""` is actually part of a code snippet that could get
// reformatted to using a different quoting style itself.
//
// Fixing this would, I believe, require some fairly seismic
// changes to how formatting strings works. Namely, we would need
// to look for code snippets before normalizing the docstring, and
// then figure out the quoting style more holistically by looking
// at the various kinds of quotes used in the code snippets and
// what reformatting them might look like.
//
// Overall this is a bit of a corner case and just inverting the
// style from what the parent ultimately decided upon works, even
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.parent_docstring_quote_char {
QuoteStyle::from(quote.invert())
} else if self.preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};

if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
let raw_content = locator.slice(string.content_range());
if string.prefix().is_raw_string() {
choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote)
if let Some(first_quote_or_normalized_char_offset) =
first_quote_or_normalized_char_offset
{
if string.prefix().is_raw_string() {
choose_quotes_for_raw_string(
&raw_content[first_quote_or_normalized_char_offset..],
string.quotes(),
preferred_quote,
)
} else {
choose_quotes_impl(
&raw_content[first_quote_or_normalized_char_offset..],
string.quotes(),
preferred_quote,
)
}
} else {
choose_quotes_impl(raw_content, string.quotes(), preferred_quote)
StringQuotes {
quote_char: preferred_quote,
triple: string.quotes().is_triple(),
}
}
} else {
string.quotes()
}
}
};

QuoteSelection {
quotes,
first_quote_or_normalized_char_offset,
}
}

Expand All @@ -154,19 +183,45 @@ impl StringNormalizer {
) -> NormalizedString<'a> {
let raw_content = locator.slice(string.content_range());

let quotes = self.choose_quotes(string, locator);

let normalized = normalize_string(raw_content, quotes, string.prefix(), self.normalize_hex);
let quote_selection = self.choose_quotes(string, locator);

let normalized = if let Some(first_quote_or_escape_offset) =
quote_selection.first_quote_or_normalized_char_offset
{
normalize_string(
raw_content,
first_quote_or_escape_offset,
quote_selection.quotes,
string.prefix(),
self.normalize_hex,
)
} else {
Cow::Borrowed(raw_content)
};

NormalizedString {
prefix: string.prefix(),
content_range: string.content_range(),
text: normalized,
quotes,
quotes: quote_selection.quotes,
}
}
}

#[derive(Debug)]
pub(crate) struct QuoteSelection {
quotes: StringQuotes,

/// Offset to the first quote character or character that needs special handling in [`normalize_string`].
first_quote_or_normalized_char_offset: Option<usize>,
}

impl QuoteSelection {
pub(crate) fn quotes(&self) -> StringQuotes {
self.quotes
}
}

#[derive(Debug)]
pub(crate) struct NormalizedString<'a> {
prefix: crate::string::StringPrefix,
Expand Down Expand Up @@ -391,6 +446,7 @@ fn choose_quotes_impl(
/// Returns the normalized string and whether it contains new lines.
pub(crate) fn normalize_string(
input: &str,
start_offset: usize,
quotes: StringQuotes,
prefix: StringPrefix,
normalize_hex: bool,
Expand All @@ -406,7 +462,7 @@ pub(crate) fn normalize_string(
let preferred_quote = quote.as_char();
let opposite_quote = quote.invert().as_char();

let mut chars = input.char_indices().peekable();
let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable();

let is_raw = prefix.is_raw_string();
let is_fstring = prefix.is_fstring();
Expand Down Expand Up @@ -501,6 +557,37 @@ pub(crate) fn normalize_string(
normalized
}

#[derive(Clone, Debug)]
struct CharIndicesWithOffset<'str> {
chars: std::str::Chars<'str>,
start_offset: usize,
offset: usize,
}

impl<'str> CharIndicesWithOffset<'str> {
fn new(input: &'str str, start_offset: usize) -> Self {
Self {
chars: input.chars(),
start_offset,
offset: 0,
}
}
}

impl<'str> Iterator for CharIndicesWithOffset<'str> {
type Item = (usize, char);

fn next(&mut self) -> Option<Self::Item> {
self.chars.next().map(|c| {
let index = self.start_offset + self.offset;
self.offset += c.len_utf8();
(index, c)
})
}
}

impl FusedIterator for CharIndicesWithOffset<'_> {}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
Expand Down Expand Up @@ -642,6 +729,7 @@ mod tests {

let normalized = normalize_string(
input,
0,
StringQuotes {
triple: false,
quote_char: QuoteChar::Double,
Expand Down

0 comments on commit 1c7e244

Please sign in to comment.