
Update individual rules to take advantage of core rule processing changes #3041

Merged

Changes from all commits (48 commits)
283649e
Refactor rule segment iteration for flexibility and speed
Apr 7, 2022
3aa5abd
Fix a bug that was clearing memory when _eval() returned None
Apr 8, 2022
ea80c56
Create CrawlBehavior class, fix broken test
Apr 8, 2022
d5fe681
Update rule L009 to set recurse_into = False
Apr 8, 2022
1488c5b
After first linter loop pass, skip rules that don't do fixes
Apr 8, 2022
de85ca7
Reduce number of RuleContext objects, compute siblings_pre/post on de…
Apr 8, 2022
d138701
Add "raw_segment_pre" as an alternative to "raw_stack"
Apr 8, 2022
abcca8a
Coverage
Apr 8, 2022
78e42e8
Rules declare whether they need context.raw_stack
Apr 8, 2022
7e37250
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
Apr 8, 2022
b426845
Implement linter phases so post-processing rules only run once
Apr 8, 2022
4056496
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
barrywhart Apr 8, 2022
72e3bcd
Tidy, simplify
Apr 8, 2022
a723852
Update L050 to use recurse_into = False
Apr 8, 2022
5bc2b3d
Tidying
Apr 8, 2022
12141a9
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
Apr 8, 2022
2b11c08
Tune L003 a bit
Apr 8, 2022
3d11b6c
Refactor L003 for performance, add RuleContext.final_segment property
Apr 8, 2022
d3f342e
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
barrywhart Apr 8, 2022
6111998
Comments, coverage
Apr 8, 2022
b9f9e74
Merge branch 'bhart-issue_3037_rule_iteration_refactor' of https://gi…
Apr 8, 2022
765408e
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
barrywhart Apr 9, 2022
2408082
Refactor core rule processing for flexibility and speed
Apr 9, 2022
bf7b98f
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
Apr 9, 2022
c70059d
Merge branch 'bhart-issue_3037_rule_iteration_refactor' of https://gi…
Apr 9, 2022
8fc64c7
Only "main" phase when linting (as opposed to fixing)
Apr 9, 2022
2b2cc1f
Merge branch 'main' into bhart-issues_3035_3037_core_changes
barrywhart Apr 9, 2022
d636e10
Move test fixes from big PR
Apr 9, 2022
168022c
Merge branch 'bhart-issues_3035_3037_core_changes' of https://github.…
Apr 9, 2022
7c32cd4
PR review
Apr 9, 2022
56102c8
Simplify how we detect first linter pass and track lint violations
Apr 9, 2022
dc37633
Coverage
Apr 9, 2022
cedbbe1
Apply suggestions from code review
barrywhart Apr 9, 2022
52eeebc
PR review
Apr 9, 2022
9b7deb9
Merge branch 'main' into bhart-issues_3035_3037_core_changes
Apr 9, 2022
c4f5f8b
Merge branch 'bhart-issues_3035_3037_core_changes' of https://github.…
Apr 9, 2022
23780ad
Merge branch 'bhart-issues_3035_3037_core_changes' into bhart-issue_3…
Apr 9, 2022
d93f4f2
PR review
Apr 9, 2022
571b209
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
Apr 9, 2022
93efb1e
Comments, tidying
Apr 9, 2022
5f11322
Update rule development docs with some of the new (and old) options
Apr 9, 2022
a220053
Add more comments about needs_raw_stack
Apr 9, 2022
579ac9b
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
barrywhart Apr 9, 2022
bfef962
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
barrywhart Apr 10, 2022
66a0507
Merge branch 'main' into bhart-issue_3037_rule_iteration_refactor
Apr 11, 2022
e0d3af6
PR review
Apr 11, 2022
a51ff1b
Apply suggestions from code review
barrywhart Apr 11, 2022
e28b033
Merge branch 'bhart-issue_3037_rule_iteration_refactor' of https://gi…
Apr 11, 2022
69 changes: 64 additions & 5 deletions docs/source/developingrules.rst
@@ -3,11 +3,70 @@
Developing Rules
================

`Rules` in `SQLFluff` are implemented as `crawlers`. These are entities
which work their way through the parsed structure of a query to evaluate
a particular rule or set of rules. The intent is that the definition of
each specific rule should be really streamlined and only contain the logic
for the rule itself, with all the other mechanics abstracted away.
`Rules` in `SQLFluff` are implemented as classes inheriting from ``BaseRule``.
SQLFluff crawls through the parse tree of a SQL file, calling the rule's
``_eval()`` function for each segment in the tree. For many rules, this allows
the rule code to be really streamlined and only contain the logic for the rule
itself, with all the other mechanics abstracted away.

Traversal Options
-----------------

``recurse_into``
^^^^^^^^^^^^^^^^
Some rules are a poor fit for the simple traversal pattern described above.
Typical reasons include:

* The rule only looks at a small portion of the file (e.g. the beginning or
  end).
* The rule needs to traverse the parse tree in a non-standard way.

These rules can override ``BaseRule``'s ``recurse_into`` field, setting it to
``False``. For these rules, ``_eval()`` is only called *once*, with the root
segment of the tree. This can be much more efficient, especially on large
files. For example, see rules ``L050`` and ``L009``, which only look at the
beginning or end of the file, respectively.
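To illustrate the traversal contract, here is a minimal, self-contained sketch. This is plain Python for illustration only, not SQLFluff's actual classes (the real crawl logic lives in the core rule-processing code and differs in detail):

```python
# Sketch: the two traversal modes controlled by recurse_into.

class Segment:
    """Toy stand-in for a parse-tree segment."""
    def __init__(self, type_, children=()):
        self.type = type_
        self.segments = list(children)

def crawl(rule, root):
    """Call rule._eval() per segment, or once with the root."""
    if not rule.recurse_into:
        rule._eval(root)          # one call; the rule walks the tree itself
        return
    stack = [root]
    while stack:                  # default: every segment gets a call
        seg = stack.pop()
        rule._eval(seg)
        stack.extend(reversed(seg.segments))

class CountingRule:
    """Toy rule that just counts how often _eval() is invoked."""
    recurse_into = True
    def __init__(self):
        self.calls = 0
    def _eval(self, segment):
        self.calls += 1

tree = Segment("file", [Segment("statement", [Segment("keyword")])])

default_rule = CountingRule()
crawl(default_rule, tree)         # 3 segments -> 3 calls

root_only = CountingRule()
root_only.recurse_into = False
crawl(root_only, tree)            # -> 1 call
```

For a rule like ``L009`` that only cares about the end of the file, the single root-segment call avoids visiting thousands of irrelevant segments.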

``_works_on_unparsable``
^^^^^^^^^^^^^^^^^^^^^^^^
By default, `SQLFluff` calls ``_eval()`` for all segments, even "unparsable"
segments, i.e. segments that didn't match the parsing rules in the dialect.
This causes issues for some rules. For those rules, setting
``_works_on_unparsable`` to ``False`` tells SQLFluff not to call ``_eval()``
for unparsable segments and their descendants.
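The skipping behaviour can be sketched as follows (again a toy illustration, not SQLFluff's real crawler):

```python
# Sketch: skipping "unparsable" subtrees when _works_on_unparsable is False.

class Segment:
    """Toy stand-in for a parse-tree segment."""
    def __init__(self, type_, children=()):
        self.type = type_
        self.segments = list(children)

def crawl(root, works_on_unparsable):
    """Return the segment types visited during a depth-first walk."""
    visited = []
    def _walk(seg):
        if not works_on_unparsable and seg.type == "unparsable":
            return                  # skip the segment AND its descendants
        visited.append(seg.type)
        for child in seg.segments:
            _walk(child)
    _walk(root)
    return visited

tree = Segment("file", [
    Segment("statement", [Segment("keyword")]),
    Segment("unparsable", [Segment("raw")]),
])

full = crawl(tree, works_on_unparsable=True)
skipped = crawl(tree, works_on_unparsable=False)
```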
Comment on lines +30 to +36

Member:

Is this right? I thought default for this was false and you had to explicitly set it to true?

Member Author:

That's what I thought, too, but I checked. I think maybe I (we) were confusing this with the --fix-even-unparsable thing added recently. We've got so many settings. 🤯

IIUC, the setting documented here controls whether the rule is called for unparsable segments, and --fix-even-unparsable controls whether any fixes would be applied. (Note that you can lint without fixing!)


Performance-related Options
---------------------------
These are other fields on ``BaseRule``. Rules can override them.

``needs_raw_stack``
^^^^^^^^^^^^^^^^^^^
``needs_raw_stack`` defaults to ``False``. Some rules use the
``RuleContext.raw_stack`` property to access earlier segments in the
traversal. This can be useful, but it adds significant overhead to the
linting process. For this reason, it is disabled by default.
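The cost difference is easy to see in a sketch. The segment strings below are hypothetical; the point is that maintaining a full ``raw_stack`` copies a growing tuple on every step (quadratic total work), while ``raw_segment_pre`` only remembers the single prior segment:

```python
# Sketch: full raw_stack vs. the lightweight raw_segment_pre alternative.

raw_segments = ["SELECT", " ", "a", "\n", "FROM", " ", "t"]

# Heavyweight: a fresh, growing tuple per step -> O(n^2) total copying.
raw_stack = ()
stacks = []
for seg in raw_segments:
    stacks.append(raw_stack)      # what a rule sees as context.raw_stack
    raw_stack += (seg,)

# Lightweight: remember only the single prior segment -> O(n) total.
raw_segment_pre = None
pres = []
for seg in raw_segments:
    pres.append(raw_segment_pre)  # what a rule sees as context.raw_segment_pre
    raw_segment_pre = seg

# At every step, the one-segment view matches the top of the full stack.
```

Rules that only ever inspect ``raw_stack[-1]`` (e.g. "is there whitespace before this comma?") can usually switch to ``raw_segment_pre`` with no behaviour change, which is exactly what several rules in this PR do.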

``lint_phase``
^^^^^^^^^^^^^^
There are two phases of rule running.

1. The ``main`` phase is appropriate for most rules. These rules are assumed
   to interact and potentially cause a cascade of fixes requiring multiple
   passes. These rules run up to ``runaway_limit`` times (default: 10).

2. The ``post`` phase is for post-processing rules, not expected to trigger
   any downstream rules, e.g. capitalization fixes. They run in a
   post-processing loop at the end. This loop is identical to the ``main``
   loop, but runs only twice (once to fix, and once more to confirm no
   remaining issues).

The two phases add complexity, but they also improve performance by allowing
SQLFluff to run fewer rules during the ``main`` phase, which often runs several
times.

NOTE: ``post`` rules also run on the *first* pass of the ``main`` phase so that
any issues they find will be presented in the list of issues output by
``sqlfluff fix`` and ``sqlfluff lint``.
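The overall loop structure described above can be sketched like this. This is a simplified model, not the linter's actual implementation; rules are modelled as dicts with a hypothetical ``fixes_left`` counter standing in for "this rule still found something to fix on this pass":

```python
# Sketch: the main/post two-phase lint loop.

def lint(rules, runaway_limit=10):
    """Return (main_passes, post_passes) for a simulated fix run."""
    main_rules = [r for r in rules if r["lint_phase"] == "main"]
    post_rules = [r for r in rules if r["lint_phase"] == "post"]
    main_passes = post_passes = 0

    # Main phase: keep looping while any main rule still fixed something,
    # up to runaway_limit passes. Post rules join only the FIRST pass, so
    # their issues still show up in `sqlfluff lint` output.
    for i in range(runaway_limit):
        main_passes += 1
        active = main_rules + (post_rules if i == 0 else [])
        fixed = False
        for r in active:
            if r["lint_phase"] == "main" and r["fixes_left"] > 0:
                r["fixes_left"] -= 1
                fixed = True
        if not fixed:
            break

    # Post phase: one pass to fix, one more to confirm nothing remains.
    for _ in range(2):
        post_passes += 1
        for r in post_rules:
            r["fixes_left"] = 0
    return main_passes, post_passes

rules = [
    {"lint_phase": "main", "fixes_left": 3},   # cascading fixes
    {"lint_phase": "post", "fixes_left": 1},   # e.g. capitalization
]
result = lint(rules)   # three fixing passes, one clean pass, two post passes
```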

Base Rules
----------
10 changes: 7 additions & 3 deletions src/sqlfluff/core/parser/segments/base.py
@@ -885,11 +885,15 @@ def select_children(
buff.append(seg)
return buff

def recursive_crawl_all(self):
def recursive_crawl_all(self, reverse: bool = False):
"""Recursively crawl all descendant segments."""
if reverse:
for seg in reversed(self.segments):
yield from seg.recursive_crawl_all(reverse=reverse)
yield self
for seg in self.segments:
yield from seg.recursive_crawl_all()
if not reverse:
for seg in self.segments:
yield from seg.recursive_crawl_all(reverse=reverse)
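To see what the new ``reverse`` flag yields, here is a standalone re-implementation of the generator body above on a toy ``Node`` class (illustration only): reverse traversal emits exactly the forward pre-order sequence, reversed.

```python
# Sketch: forward vs. reverse recursive crawl over a toy tree.

class Node:
    def __init__(self, name, children=()):
        self.name = name
        self.segments = list(children)

    def recursive_crawl_all(self, reverse=False):
        # Mirrors BaseSegment.recursive_crawl_all in this PR.
        if reverse:
            for seg in reversed(self.segments):
                yield from seg.recursive_crawl_all(reverse=True)
        yield self
        if not reverse:
            for seg in self.segments:
                yield from seg.recursive_crawl_all()

tree = Node("file", [
    Node("stmt", [Node("kw"), Node("id")]),
    Node("newline"),
])

forward = [n.name for n in tree.recursive_crawl_all()]
backward = [n.name for n in tree.recursive_crawl_all(reverse=True)]
```

This is what lets L009's ``get_trailing_newlines()`` walk the tree from the end of the file without materialising the whole segment sequence first.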

def recursive_crawl(self, *seg_type: str, recurse_into: bool = True):
"""Recursively crawl for segments of a given type.
39 changes: 36 additions & 3 deletions src/sqlfluff/core/rules/base.py
@@ -379,6 +379,23 @@ def siblings_post(self) -> Tuple[BaseSegment, ...]:
else:
return tuple()

@cached_property
def final_segment(self) -> BaseSegment:
Member Author: New property

"""Returns rightmost & lowest descendant.

Similar in spirit to BaseRule.is_final_segment(), but:
- Much faster
- Does not allow filtering out meta segments
"""
last_segment: BaseSegment = (
self.parent_stack[0] if self.parent_stack else self.segment
)
while True:
try:
last_segment = last_segment.segments[-1]
except IndexError:
return last_segment

@property
def functional(self):
"""Returns a Surrogates object that simplifies writing rules."""
@@ -503,12 +520,28 @@ class BaseRule:
# Lint loop / crawl behavior. When appropriate, rules can (and should)
# override these values to make linting faster.
recurse_into = True
needs_raw_stack = True
# "needs_raw_stack" defaults to False because rules run faster that way, and
# most rules don't need it. Rules that use it are usually those that look
# at the surroundings of a segment, e.g. "is there whitespace preceding this
# segment?" In the long run, it would be good to review rules that use
# raw_stack to try and eliminate its use. These rules will often be good
# candidates for one of the following:
# - Rewriting to use "RuleContext.raw_segment_pre", which is similar to
# "raw_stack", but it's only the ONE raw segment prior to the current
# one.
# - Rewriting to use "BaseRule.recurse_into = False" and traversing the
# parse tree directly.
# - Using "RuleContext.memory" to implement custom, lighter weight tracking
# of just the MINIMUM required state across calls to _eval(). Reason:
# "raw_stack" becomes very large for large files (thousands or more
# segments!). In practice, most rules only need to look at a few adjacent
# segments, e.g. others on the same line or in the same statement.
needs_raw_stack = False
# Rules can override this to specify "post". "Post" rules are those that are
# not expected to trigger any downstream rules, e.g. capitalization fixes.
# They run on two occasions:
# - On the first loop of the main phase
# - In a second linter loop after the main rules run
# - On the first pass of the main phase
# - In a second linter pass after the main phase
lint_phase = "main"

def __init__(self, code, description, **kwargs):
2 changes: 2 additions & 0 deletions src/sqlfluff/rules/L001.py
@@ -30,6 +30,8 @@ class Rule_L001(BaseRule):
FROM foo
"""

needs_raw_stack = True

def _eval(self, context: RuleContext) -> LintResult:
"""Unnecessary trailing whitespace.

2 changes: 1 addition & 1 deletion src/sqlfluff/rules/L002.py
@@ -54,7 +54,7 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:

if context.segment.is_type("whitespace"):
if " " in context.segment.raw and "\t" in context.segment.raw:
if len(context.raw_stack) == 0 or context.raw_stack[-1].is_type(
if context.raw_segment_pre is None or context.raw_segment_pre.is_type(
"newline"
):
# We've got a single whitespace at the beginning of a line.
3 changes: 2 additions & 1 deletion src/sqlfluff/rules/L003.py
@@ -193,6 +193,7 @@ class Rule_L003(BaseRule):

targets_templated = True
_works_on_unparsable = False
needs_raw_stack = True
_adjust_anchors = True
_ignore_types: List[str] = ["script_content"]
config_keywords = ["tab_space_size", "indent_unit"]
@@ -413,7 +414,7 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
# First non-whitespace element is our trigger
memory.trigger = segment

is_last = self.is_final_segment(context)
is_last = context.segment is context.final_segment
Member Author:

is_final_segment() is very complex and slow. Switching to the similar, much faster RuleContext.final_segment.

Contributor:

Legend

Member:

How can we avoid rule writers using the wrong call in future? These seem very similarly named and therefore not crazy to think they are interchangeable.

Member Author:

Maybe rename the function? The property (RuleContext.final_segment) is the actual, final segment. The function supports optional filtering of meta segments. Currently, only one rule (L052) uses the function, so we could potentially move it into L052.py. Feel free to create an issue or PR for this. I'll probably go ahead and merge, since this doesn't seem like a blocker.

if not segment.is_type("newline") and not is_last:
# Process on line ends or file end
return LintResult(memory=memory)
7 changes: 4 additions & 3 deletions src/sqlfluff/rules/L004.py
@@ -77,8 +77,8 @@ def _eval(self, context: RuleContext) -> LintResult:
)
# Only attempt a fix at the start of a newline for now
and (
len(context.raw_stack) == 0
or context.raw_stack[-1].is_type("newline")
context.raw_segment_pre is None
or context.raw_segment_pre.is_type("newline")
Member Author:

Switching from raw_stack to raw_segment_pre so this rule doesn't require the heavyweight raw_stack anymore.

)
):
fixes = [
@@ -90,7 +90,8 @@
)
]
elif not (
len(context.raw_stack) == 0 or context.raw_stack[-1].is_type("newline")
context.raw_segment_pre is None
or context.raw_segment_pre.is_type("newline")
):
# give a helpful message if the wrong indent has been found and is not
# at the start of a newline
27 changes: 12 additions & 15 deletions src/sqlfluff/rules/L005.py
@@ -1,6 +1,7 @@
"""Implementation of Rule L005."""
from typing import Optional

from sqlfluff.core.parser import RawSegment
from sqlfluff.core.rules.base import BaseRule, LintResult, LintFix, RuleContext
from sqlfluff.core.rules.doc_decorators import document_fix_compatible

@@ -38,19 +39,15 @@ class Rule_L005(BaseRule):
"""

def _eval(self, context: RuleContext) -> Optional[LintResult]:
"""Commas should not have whitespace directly before them.

We need at least one segment behind us for this to work.

"""
if len(context.raw_stack) >= 1:
cm1 = context.raw_stack[-1]
if (
context.segment.is_type("comma")
and cm1.is_type("whitespace")
and cm1.pos_marker.line_pos > 1
):
anchor = cm1
return LintResult(anchor=anchor, fixes=[LintFix.delete(cm1)])
# Otherwise fine
"""Commas should not have whitespace directly before them."""
Member Author:
The primary change to this file is switching from raw_stack to raw_segment_pre, which is faster. Also did some tidying, but no other meaningful changes. @OTooleMichael

anchor: Optional[RawSegment] = context.raw_segment_pre
if (
# We need at least one previous segment for this to work.
anchor is not None
and context.segment.is_type("comma")
and anchor.is_type("whitespace")
and anchor.pos_marker.line_pos > 1
):
return LintResult(anchor=anchor, fixes=[LintFix.delete(anchor)])
# Otherwise fine.
return None
53 changes: 33 additions & 20 deletions src/sqlfluff/rules/L009.py
@@ -1,13 +1,35 @@
"""Implementation of Rule L009."""
from typing import Optional

from sqlfluff.core.parser import NewlineSegment
from typing import List, Optional, Tuple

from sqlfluff.core.parser import BaseSegment, NewlineSegment
from sqlfluff.core.rules.base import BaseRule, LintResult, LintFix, RuleContext
from sqlfluff.core.rules.doc_decorators import document_fix_compatible
from sqlfluff.core.rules.functional import Segments, sp, tsp


def get_trailing_newlines(segment: BaseSegment) -> List[BaseSegment]:
Member Author (barrywhart, Apr 8, 2022):
Reworked this rule so it is only called once per parse tree, with the root segment.

"""Returns list of trailing newlines in the tree."""
result = []
for seg in segment.recursive_crawl_all(reverse=True):
if seg.is_type("newline"):
result.append(seg)
if not seg.is_whitespace and not seg.is_type("dedent"):
break
return result


def get_last_segment(segment: Segments) -> Tuple[List[BaseSegment], Segments]:
"""Returns rightmost & lowest descendant and its "parent stack"."""
parent_stack: List[BaseSegment] = []
while True:
children = segment.children()
if children:
parent_stack.append(segment[0])
segment = children.last()
else:
return parent_stack, segment
Comment on lines +21 to +30
Member:

Why doesn't it use the new final_segment function?

Member Author:

Because it also needs the parent stack, which is not provided by RuleContext.final_segment. I could've had RuleContext return both, but that'd be harder to document clearly, and AFAIK, this is the only rule that needs it.



@document_fix_compatible
class Rule_L009(BaseRule):
"""Files must end with a single trailing newline.
@@ -82,6 +104,9 @@ class Rule_L009(BaseRule):
"""

targets_templated = True
# TRICKY: Tells linter to only call _eval() ONCE, with the root segment
recurse_into = False
lint_phase = "post"

def _eval(self, context: RuleContext) -> Optional[LintResult]:
"""Files must end with a single trailing newline.
@@ -91,21 +116,9 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:

"""
# We only care about the final segment of the parse tree.
if not self.is_final_segment(context, filter_meta=False):
return None
parent_stack, segment = get_last_segment(context.functional.segment)

# Include current segment for complete stack and reverse.
parent_stack: Segments = context.functional.parent_stack
complete_stack: Segments = (
context.functional.raw_stack + context.functional.segment
)
reversed_complete_stack = complete_stack.reversed()

# Find the trailing newline segments.
trailing_newlines = reversed_complete_stack.select(
select_if=sp.is_type("newline"),
loop_while=sp.or_(sp.is_whitespace(), sp.is_type("dedent")),
)
trailing_newlines = Segments(*get_trailing_newlines(context.segment))
trailing_literal_newlines = trailing_newlines
if context.templated_file:
trailing_literal_newlines = trailing_newlines.select(
@@ -116,12 +129,12 @@
if not trailing_literal_newlines:
# We make an edit to create this segment after the child of the FileSegment.
if len(parent_stack) == 1:
fix_anchor_segment = context.segment
fix_anchor_segment = segment[0]
else:
fix_anchor_segment = parent_stack[1]

return LintResult(
anchor=context.segment,
anchor=segment[0],
fixes=[
LintFix.create_after(
fix_anchor_segment,
Expand All @@ -132,7 +145,7 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
elif len(trailing_literal_newlines) > 1:
# Delete extra newlines.
return LintResult(
anchor=context.segment,
anchor=segment[0],
fixes=[LintFix.delete(d) for d in trailing_literal_newlines[1:]],
)
else:
1 change: 1 addition & 0 deletions src/sqlfluff/rules/L010.py
@@ -43,6 +43,7 @@ class Rule_L010(BaseRule):
from foo
"""

lint_phase = "post"
# Binary operators behave like keywords too.
_target_elems: List[Tuple[str, str]] = [
("type", "keyword"),
14 changes: 6 additions & 8 deletions src/sqlfluff/rules/L011.py
@@ -52,9 +52,6 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:

We look for the alias segment, and then evaluate its parent and whether
it contains an AS keyword. This is the _eval function for both L011 and L012.

The use of `raw_stack` is just for working out how much whitespace to add.

"""
# Config type hints
self.aliasing: str
@@ -68,14 +65,14 @@

# Remove the AS as we're using implicit aliasing
fixes.append(LintFix.delete(context.segment.segments[0]))
anchor = context.raw_stack[-1]
Member Author:
Switching from raw_stack to raw_segment_pre so this rule doesn't require the heavyweight raw_stack anymore.

anchor = context.raw_segment_pre

# Remove whitespace before (if exists) or after (if not)
if (
len(context.raw_stack) > 0
and context.raw_stack[-1].type == "whitespace"
context.raw_segment_pre is not None
and context.raw_segment_pre.type == "whitespace"
):
fixes.append(LintFix.delete(context.raw_stack[-1]))
fixes.append(LintFix.delete(context.raw_segment_pre))
elif (
len(context.segment.segments) > 0
and context.segment.segments[1].type == "whitespace"
@@ -90,7 +87,8 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
insert_buff: List[Union[WhitespaceSegment, KeywordSegment]] = []

# Add initial whitespace if we need to...
if context.raw_stack[-1].name not in ["whitespace", "newline"]:
assert context.raw_segment_pre
if context.raw_segment_pre.name not in ["whitespace", "newline"]:
insert_buff.append(WhitespaceSegment())

# Add an AS (Uppercase for now, but could be corrected later)
1 change: 1 addition & 0 deletions src/sqlfluff/rules/L014.py
@@ -67,6 +67,7 @@ class Rule_L014(Rule_L010):

"""

lint_phase = "post"
_target_elems: List[Tuple[str, str]] = [
("name", "naked_identifier"),
("name", "properties_naked_identifier"),