Use empty range when there's "gap" in token source (#11032)

## Summary This fixes a bug where the parser would panic when there is a "gap" in the token source. What's a gap? The reason it's `<=` instead of just `==` is because there could be whitespaces between the two tokens. For example: ```python # last token end # | current token (newline) start # v v def foo \n # ^ # assume there's trailing whitespace here ``` Or, there could tokens that are considered "trivia" and thus aren't emitted by the token source. These are comments and non-logical newlines. For example: ```python # last token end # v def foo # comment\n # ^ current token (newline) start ``` In either of the above cases, there's a "gap" between the end of the last token and start of the current token. ## Test Plan Add test cases and update the snapshots.
astral-sh · Apr 19, 2024 · d3cd61f · d3cd61f
1 parent 9b80cc0
commit d3cd61f
Show file tree

Hide file tree

Showing 5 changed files with 195 additions and 26 deletions.
diff --git a/crates/ruff_python_parser/resources/inline/err/node_range_with_gaps.py b/crates/ruff_python_parser/resources/inline/err/node_range_with_gaps.py
@@ -0,0 +1,3 @@
+def foo # comment
+def bar(): ...
+def baz
diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs
@@ -261,12 +261,59 @@ impl<'src> Parser<'src> {
     }
 
     fn node_range(&self, start: TextSize) -> TextRange {
-        // It's possible during error recovery that the parsing didn't consume any tokens. In that case,
-        // `last_token_end` still points to the end of the previous token but `start` is the start of the current token.
-        // Calling `TextRange::new(start, self.last_token_end)` would panic in that case because `start > end`.
-        // This path "detects" this case and creates an empty range instead.
-        if self.node_start() == start {
-            TextRange::empty(start)
+        // It's possible during error recovery that the parsing didn't consume any tokens. In that
+        // case, `last_token_end` still points to the end of the previous token but `start` is the
+        // start of the current token. Calling `TextRange::new(start, self.last_token_end)` would
+        // panic in that case because `start > end`. This path "detects" this case and creates an
+        // empty range instead.
+        //
+        // The reason it's `<=` instead of just `==` is because there could be whitespaces between
+        // the two tokens. For example:
+        //
+        // ```python
+        // #     last token end
+        // #     | current token (newline) start
+        // #     v v
+        // def foo \n
+        // #      ^
+        // #      assume there's trailing whitespace here
+        // ```
+        //
+        // Or, there could tokens that are considered "trivia" and thus aren't emitted by the token
+        // source. These are comments and non-logical newlines. For example:
+        //
+        // ```python
+        // #     last token end
+        // #     v
+        // def foo # comment\n
+        // #                ^ current token (newline) start
+        // ```
+        //
+        // In either of the above cases, there's a "gap" between the end of the last token and start
+        // of the current token.
+        if self.last_token_end <= start {
+            // We need to create an empty range at the last token end instead of the start because
+            // otherwise this node range will fall outside the range of it's parent node. Taking
+            // the above example:
+            //
+            // ```python
+            // if True:
+            // #   function start
+            // #   |     function end
+            // #   v     v
+            //     def foo # comment
+            // #                    ^ current token start
+            // ```
+            //
+            // Here, the current token start is the start of parameter range but the function ends
+            // at `foo`. Even if there's a function body, the range of parameters would still be
+            // before the comment.
+
+            // test_err node_range_with_gaps
+            // def foo # comment
+            // def bar(): ...
+            // def baz
+            TextRange::empty(self.last_token_end)
         } else {
             TextRange::new(start, self.last_token_end)
         }

diff --git a/crates/ruff_python_parser/src/parser/statement.rs b/crates/ruff_python_parser/src/parser/statement.rs
@@ -1663,23 +1663,19 @@ impl<'src> Parser<'src> {
         // x = 10
         let type_params = self.try_parse_type_params();
 
+        // test_ok function_def_parameter_range
+        // def foo(
+        //     first: int,
+        //     second: int,
+        // ) -> int: ...
+
         // test_err function_def_unclosed_parameter_list
         // def foo(a: int, b:
         // def foo():
         //     return 42
         // def foo(a: int, b: str
         // x = 10
-        let parameters_start = self.node_start();
-        self.expect(TokenKind::Lpar);
-        let mut parameters = self.parse_parameters(FunctionKind::FunctionDef);
-        self.expect(TokenKind::Rpar);
-
-        // test_ok function_def_parameter_range
-        // def foo(
-        //     first: int,
-        //     second: int,
-        // ) -> int: ...
-        parameters.range = self.node_range(parameters_start);
+        let parameters = self.parse_parameters(FunctionKind::FunctionDef);
 
         let returns = if self.eat(TokenKind::Rarrow) {
             if self.at_expr() {
@@ -2844,19 +2840,16 @@ impl<'src> Parser<'src> {
     pub(super) fn parse_parameters(&mut self, function_kind: FunctionKind) -> ast::Parameters {
         let start = self.node_start();
 
+        if matches!(function_kind, FunctionKind::FunctionDef) {
+            self.expect(TokenKind::Lpar);
+        }
+
         // TODO(dhruvmanila): This has the same problem as `parse_match_pattern_mapping`
         // has where if there are multiple kwarg or vararg, the last one will win and
         // the parser will drop the previous ones. Another thing is the vararg and kwarg
         // uses `Parameter` (not `ParameterWithDefault`) which means that the parser cannot
         // recover well from `*args=(1, 2)`.
-        let mut parameters = ast::Parameters {
-            range: TextRange::default(),
-            posonlyargs: vec![],
-            args: vec![],
-            kwonlyargs: vec![],
-            vararg: None,
-            kwarg: None,
-        };
+        let mut parameters = ast::Parameters::empty(TextRange::default());
 
         let mut seen_default_param = false; // `a=10`
         let mut seen_positional_only_separator = false; // `/`
@@ -3094,6 +3087,10 @@ impl<'src> Parser<'src> {
             self.add_error(ParseErrorType::ExpectedKeywordParam, star_range);
         }
 
+        if matches!(function_kind, FunctionKind::FunctionDef) {
+            self.expect(TokenKind::Rpar);
+        }
+
         parameters.range = self.node_range(start);
 
         // test_err params_duplicate_names

diff --git a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@node_range_with_gaps.py.snap b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@node_range_with_gaps.py.snap
@@ -0,0 +1,122 @@
+---
+source: crates/ruff_python_parser/tests/fixtures.rs
+input_file: crates/ruff_python_parser/resources/inline/err/node_range_with_gaps.py
+---
+## AST
+
+```
+Module(
+    ModModule {
+        range: 0..41,
+        body: [
+            FunctionDef(
+                StmtFunctionDef {
+                    range: 0..7,
+                    is_async: false,
+                    decorator_list: [],
+                    name: Identifier {
+                        id: "foo",
+                        range: 4..7,
+                    },
+                    type_params: None,
+                    parameters: Parameters {
+                        range: 7..7,
+                        posonlyargs: [],
+                        args: [],
+                        vararg: None,
+                        kwonlyargs: [],
+                        kwarg: None,
+                    },
+                    returns: None,
+                    body: [],
+                },
+            ),
+            FunctionDef(
+                StmtFunctionDef {
+                    range: 18..32,
+                    is_async: false,
+                    decorator_list: [],
+                    name: Identifier {
+                        id: "bar",
+                        range: 22..25,
+                    },
+                    type_params: None,
+                    parameters: Parameters {
+                        range: 25..27,
+                        posonlyargs: [],
+                        args: [],
+                        vararg: None,
+                        kwonlyargs: [],
+                        kwarg: None,
+                    },
+                    returns: None,
+                    body: [
+                        Expr(
+                            StmtExpr {
+                                range: 29..32,
+                                value: EllipsisLiteral(
+                                    ExprEllipsisLiteral {
+                                        range: 29..32,
+                                    },
+                                ),
+                            },
+                        ),
+                    ],
+                },
+            ),
+            FunctionDef(
+                StmtFunctionDef {
+                    range: 33..40,
+                    is_async: false,
+                    decorator_list: [],
+                    name: Identifier {
+                        id: "baz",
+                        range: 37..40,
+                    },
+                    type_params: None,
+                    parameters: Parameters {
+                        range: 40..40,
+                        posonlyargs: [],
+                        args: [],
+                        vararg: None,
+                        kwonlyargs: [],
+                        kwarg: None,
+                    },
+                    returns: None,
+                    body: [],
+                },
+            ),
+        ],
+    },
+)
+```
+## Errors
+
+  |
+1 | def foo # comment
+  |                  ^ Syntax Error: Expected '(', found newline
+2 | def bar(): ...
+3 | def baz
+  |
+
+
+  |
+1 | def foo # comment
+2 | def bar(): ...
+  | ^^^ Syntax Error: Expected ')', found 'def'
+3 | def baz
+  |
+
+
+  |
+1 | def foo # comment
+2 | def bar(): ...
+3 | def baz
+  |        ^ Syntax Error: Expected '(', found newline
+  |
+
+
+  |
+2 | def bar(): ...
+3 | def baz
+  |
diff --git a/...ython_parser/tests/snapshots/invalid_syntax@unterminated_fstring_newline_recovery.py.snap b/...ython_parser/tests/snapshots/invalid_syntax@unterminated_fstring_newline_recovery.py.snap
@@ -167,7 +167,7 @@ Module(
                                                         conversion: None,
                                                         format_spec: Some(
                                                             FStringFormatSpec {
-                                                                range: 43..43,
+                                                                range: 42..42,
                                                                 elements: [],
                                                             },
                                                         ),