Implement UnnecessaryListAllocationForFirstElement (astral-sh#5549)

## Summary Fixes astral-sh#5503. Ready for final review as the `mkdocs` issue involving SSH keys is fixed. Note that this will only throw on a `Name` - it will be refactorable once we have a type-checker. This means that this is the only sort of input that will throw. ```python x = range(10) list(x)[0] ``` I thought it'd be confusing if we supported direct function results. Consider this example, assuming we support direct results: ```python # throws list(range(10))[0] def createRange(bound): return range(bound) # "why doesn't this throw, but a direct `range(10)` call does?" list(createRange(10))[0] ``` If it's necessary, I can go through the list of built-ins and find those which produce iterables, then add them to the throwing list. ## Test Plan Added a new fixture, then ran `cargo t`
tjkuson · Jul 10, 2023 · 28fe2d3 · 28fe2d3
1 parent 3562d80
commit 28fe2d3
Show file tree

Hide file tree

Showing 8 changed files with 630 additions and 1 deletion.
diff --git a/crates/ruff/resources/test/fixtures/ruff/RUF015.py b/crates/ruff/resources/test/fixtures/ruff/RUF015.py
@@ -0,0 +1,44 @@
+x = range(10)
+
+# RUF015
+list(x)[0]
+list(x)[:1]
+list(x)[:1:1]
+list(x)[:1:2]
+tuple(x)[0]
+tuple(x)[:1]
+tuple(x)[:1:1]
+tuple(x)[:1:2]
+list(i for i in x)[0]
+list(i for i in x)[:1]
+list(i for i in x)[:1:1]
+list(i for i in x)[:1:2]
+[i for i in x][0]
+[i for i in x][:1]
+[i for i in x][:1:1]
+[i for i in x][:1:2]
+
+# OK (not indexing (solely) the first element)
+list(x)
+list(x)[1]
+list(x)[-1]
+list(x)[1:]
+list(x)[:3:2]
+list(x)[::2]
+list(x)[::]
+[i for i in x]
+[i for i in x][1]
+[i for i in x][-1]
+[i for i in x][1:]
+[i for i in x][:3:2]
+[i for i in x][::2]
+[i for i in x][::]
+
+# OK (doesn't mirror the underlying list)
+[i + 1 for i in x][0]
+[i for i in x if i > 5][0]
+[(i, i + 1) for i in x][0]
+
+# OK (multiple generators)
+y = range(10)
+[i + j for i in x for j in y][0]
diff --git a/crates/ruff/src/checkers/ast/mod.rs b/crates/ruff/src/checkers/ast/mod.rs
@@ -2142,7 +2142,7 @@ where
 
         // Pre-visit.
         match expr {
-            Expr::Subscript(ast::ExprSubscript { value, slice, .. }) => {
+            subscript @ Expr::Subscript(ast::ExprSubscript { value, slice, .. }) => {
                 // Ex) Optional[...], Union[...]
                 if self.any_enabled(&[
                     Rule::FutureRewritableTypeAnnotation,
@@ -2225,6 +2225,9 @@ where
                 if self.enabled(Rule::UncapitalizedEnvironmentVariables) {
                     flake8_simplify::rules::use_capital_environment_variables(self, expr);
                 }
+                if self.enabled(Rule::UnnecessaryIterableAllocationForFirstElement) {
+                    ruff::rules::unnecessary_iterable_allocation_for_first_element(self, subscript);
+                }
 
                 pandas_vet::rules::subscript(self, value, expr);
             }

diff --git a/crates/ruff/src/codes.rs b/crates/ruff/src/codes.rs
@@ -779,6 +779,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
         (Ruff, "013") => (RuleGroup::Unspecified, rules::ruff::rules::ImplicitOptional),
         #[cfg(feature = "unreachable-code")]
         (Ruff, "014") => (RuleGroup::Nursery, rules::ruff::rules::UnreachableCode),
+        (Ruff, "015") => (RuleGroup::Unspecified, rules::ruff::rules::UnnecessaryIterableAllocationForFirstElement),
         (Ruff, "100") => (RuleGroup::Unspecified, rules::ruff::rules::UnusedNOQA),
         (Ruff, "200") => (RuleGroup::Unspecified, rules::ruff::rules::InvalidPyprojectToml),
 

diff --git a/crates/ruff/src/rules/ruff/mod.rs b/crates/ruff/src/rules/ruff/mod.rs
@@ -30,6 +30,10 @@ mod tests {
     #[test_case(Rule::MutableDataclassDefault, Path::new("RUF008.py"))]
     #[test_case(Rule::PairwiseOverZipped, Path::new("RUF007.py"))]
     #[test_case(Rule::StaticKeyDictComprehension, Path::new("RUF011.py"))]
+    #[test_case(
+        Rule::UnnecessaryIterableAllocationForFirstElement,
+        Path::new("RUF015.py")
+    )]
     #[cfg_attr(
         feature = "unreachable-code",
         test_case(Rule::UnreachableCode, Path::new("RUF014.py"))

diff --git a/crates/ruff/src/rules/ruff/rules/mod.rs b/crates/ruff/src/rules/ruff/rules/mod.rs
@@ -9,6 +9,7 @@ pub(crate) use mutable_class_default::*;
 pub(crate) use mutable_dataclass_default::*;
 pub(crate) use pairwise_over_zipped::*;
 pub(crate) use static_key_dict_comprehension::*;
+pub(crate) use unnecessary_iterable_allocation_for_first_element::*;
 #[cfg(feature = "unreachable-code")]
 pub(crate) use unreachable::*;
 pub(crate) use unused_noqa::*;
@@ -26,6 +27,7 @@ mod mutable_class_default;
 mod mutable_dataclass_default;
 mod pairwise_over_zipped;
 mod static_key_dict_comprehension;
+mod unnecessary_iterable_allocation_for_first_element;
 #[cfg(feature = "unreachable-code")]
 pub(crate) mod unreachable;
 mod unused_noqa;

diff --git a/crates/ruff/src/rules/ruff/rules/unnecessary_iterable_allocation_for_first_element.rs b/crates/ruff/src/rules/ruff/rules/unnecessary_iterable_allocation_for_first_element.rs
@@ -0,0 +1,236 @@
+use num_bigint::BigInt;
+use num_traits::{One, Zero};
+use rustpython_parser::ast::{self, Comprehension, Constant, Expr};
+
+use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
+use ruff_macros::{derive_message_formats, violation};
+use ruff_python_semantic::SemanticModel;
+
+use crate::checkers::ast::Checker;
+use crate::registry::AsRule;
+
+/// ## What it does
+/// Checks for uses of `list(...)[0]` that can be replaced with
+/// `next(iter(...))`.
+///
+/// ## Why is this bad?
+/// Calling `list(...)` will create a new list of the entire collection, which
+/// can be very expensive for large collections. If you only need the first
+/// element of the collection, you can use `next(iter(...))` to lazily fetch
+/// the first element without creating a new list.
+///
+/// Note that migrating from `list(...)[0]` to `next(iter(...))` can change
+/// the behavior of your program in two ways:
+///
+/// 1. First, `list(...)` will eagerly evaluate the entire collection, while
+///    `next(iter(...))` will only evaluate the first element. As such, any
+///    side effects that occur during iteration will be delayed.
+/// 2. Second, `list(...)[0]` will raise `IndexError` if the collection is
+///   empty, while `next(iter(...))` will raise `StopIteration`.
+///
+/// ## Example
+/// ```python
+/// head = list(range(1000000000000))[0]
+/// ```
+///
+/// Use instead:
+/// ```python
+/// head = next(iter(range(1000000000000)))
+/// ```
+///
+/// ## References
+/// - [Iterators and Iterables in Python: Run Efficient Iterations](https://realpython.com/python-iterators-iterables/#when-to-use-an-iterator-in-python)
+#[violation]
+pub(crate) struct UnnecessaryIterableAllocationForFirstElement {
+    iterable: String,
+    subscript_kind: HeadSubscriptKind,
+}
+
+impl AlwaysAutofixableViolation for UnnecessaryIterableAllocationForFirstElement {
+    #[derive_message_formats]
+    fn message(&self) -> String {
+        let UnnecessaryIterableAllocationForFirstElement {
+            iterable,
+            subscript_kind,
+        } = self;
+        match subscript_kind {
+            HeadSubscriptKind::Index => {
+                format!("Prefer `next(iter({iterable}))` over `list({iterable})[0]`")
+            }
+            HeadSubscriptKind::Slice => {
+                format!("Prefer `[next(iter({iterable}))]` over `list({iterable})[:1]`")
+            }
+        }
+    }
+
+    fn autofix_title(&self) -> String {
+        let UnnecessaryIterableAllocationForFirstElement {
+            iterable,
+            subscript_kind,
+        } = self;
+        match subscript_kind {
+            HeadSubscriptKind::Index => format!("Replace with `next(iter({iterable}))`"),
+            HeadSubscriptKind::Slice => format!("Replace with `[next(iter({iterable}))]"),
+        }
+    }
+}
+
+/// RUF015
+pub(crate) fn unnecessary_iterable_allocation_for_first_element(
+    checker: &mut Checker,
+    subscript: &Expr,
+) {
+    let Expr::Subscript(ast::ExprSubscript {
+        value,
+        slice,
+        range,
+        ..
+    }) = subscript
+    else {
+        return;
+    };
+
+    let Some(subscript_kind) = classify_subscript(slice) else {
+        return;
+    };
+
+    let Some(iterable) = iterable_name(value, checker.semantic()) else {
+        return;
+    };
+
+    let mut diagnostic = Diagnostic::new(
+        UnnecessaryIterableAllocationForFirstElement {
+            iterable: iterable.to_string(),
+            subscript_kind,
+        },
+        *range,
+    );
+
+    if checker.patch(diagnostic.kind.rule()) {
+        let replacement = match subscript_kind {
+            HeadSubscriptKind::Index => format!("next(iter({iterable}))"),
+            HeadSubscriptKind::Slice => format!("[next(iter({iterable}))]"),
+        };
+        diagnostic.set_fix(Fix::suggested(Edit::range_replacement(replacement, *range)));
+    }
+
+    checker.diagnostics.push(diagnostic);
+}
+
+/// A subscript slice that represents the first element of a list.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum HeadSubscriptKind {
+    /// The subscript is an index (e.g., `[0]`).
+    Index,
+    /// The subscript is a slice (e.g., `[:1]`).
+    Slice,
+}
+
+/// Check that the slice [`Expr`] is functionally equivalent to slicing into the first element. The
+/// first `bool` checks that the element is in fact first, the second checks if it's a slice or an
+/// index.
+fn classify_subscript(expr: &Expr) -> Option<HeadSubscriptKind> {
+    match expr {
+        Expr::Constant(ast::ExprConstant {
+            value: Constant::Int(value),
+            ..
+        }) if value.is_zero() => Some(HeadSubscriptKind::Index),
+        Expr::Slice(ast::ExprSlice {
+            step, lower, upper, ..
+        }) => {
+            // Avoid, e.g., `list(...)[:2]`
+            let upper = upper.as_ref()?;
+            let upper = as_int(upper)?;
+            if !upper.is_one() {
+                return None;
+            }
+
+            // Avoid, e.g., `list(...)[2:]`.
+            if let Some(lower) = lower.as_ref() {
+                let lower = as_int(lower)?;
+                if !lower.is_zero() {
+                    return None;
+                }
+            }
+
+            // Avoid, e.g., `list(...)[::-1]`
+            if let Some(step) = step.as_ref() {
+                let step = as_int(step)?;
+                if step < upper {
+                    return None;
+                }
+            }
+
+            Some(HeadSubscriptKind::Slice)
+        }
+        _ => None,
+    }
+}
+
+/// Fetch the name of the iterable from an expression if the expression returns an unmodified list
+/// which can be sliced into.
+fn iterable_name<'a>(expr: &'a Expr, model: &SemanticModel) -> Option<&'a str> {
+    match expr {
+        Expr::Call(ast::ExprCall { func, args, .. }) => {
+            let ast::ExprName { id, .. } = func.as_name_expr()?;
+
+            if !matches!(id.as_str(), "tuple" | "list") {
+                return None;
+            }
+
+            if !model.is_builtin(id.as_str()) {
+                return None;
+            }
+
+            match args.first() {
+                Some(Expr::Name(ast::ExprName { id: arg_name, .. })) => Some(arg_name.as_str()),
+                Some(Expr::GeneratorExp(ast::ExprGeneratorExp {
+                    elt, generators, ..
+                })) => generator_iterable(elt, generators),
+                _ => None,
+            }
+        }
+        Expr::ListComp(ast::ExprListComp {
+            elt, generators, ..
+        }) => generator_iterable(elt, generators),
+        _ => None,
+    }
+}
+
+/// Given a comprehension, returns the name of the iterable over which it iterates, if it's
+/// a simple comprehension (e.g., `x` for `[i for i in x]`).
+fn generator_iterable<'a>(elt: &'a Expr, generators: &'a Vec<Comprehension>) -> Option<&'a str> {
+    // If the `elt` field is anything other than a [`Expr::Name`], we can't be sure that it
+    // doesn't modify the elements of the underlying iterator (e.g., `[i + 1 for i in x][0]`).
+    if !elt.is_name_expr() {
+        return None;
+    }
+
+    // If there's more than 1 generator, we can't safely say that it fits the diagnostic conditions
+    // (e.g., `[(i, j) for i in x for j in y][0]`).
+    let [generator] = generators.as_slice() else {
+        return None;
+    };
+
+    // Ignore if there's an `if` statement in the comprehension, since it filters the list.
+    if !generator.ifs.is_empty() {
+        return None;
+    }
+
+    let ast::ExprName { id, .. } = generator.iter.as_name_expr()?;
+    Some(id.as_str())
+}
+
+/// If an expression is a constant integer, returns the value of that integer; otherwise,
+/// returns `None`.
+fn as_int(expr: &Expr) -> Option<&BigInt> {
+    if let Expr::Constant(ast::ExprConstant {
+        value: Constant::Int(value),
+        ..
+    }) = expr
+    {
+        Some(value)
+    } else {
+        None
+    }
+}