Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deduplicate violations in the source space #4041

Merged
merged 11 commits into from
Nov 10, 2022
34 changes: 27 additions & 7 deletions src/sqlfluff/core/errors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Errors - these are closely linked to what used to be called violations."""
from typing import Optional, Tuple
from typing import Optional, Tuple, Any

CheckTuple = Tuple[str, int, int]

Expand Down Expand Up @@ -83,6 +83,18 @@ def get_info_dict(self):
"description": self.desc(),
}

def check_tuple(self) -> CheckTuple:
"""Get a tuple representing this error. Mostly for testing."""
return (
self.rule_code(),
self.line_no,
self.line_pos,
)

def source_signature(self) -> Tuple[Any, ...]:
"""Return hashable source signature for deduplication."""
return (self.check_tuple(), self.desc())

def ignore_if_in(self, ignore_iterable):
"""Ignore this violation if it matches the iterable."""
# Type conversion
Expand Down Expand Up @@ -182,13 +194,21 @@ def fixable(self):
return True
return False

def check_tuple(self) -> CheckTuple:
"""Get a tuple representing this error. Mostly for testing."""
return (
self.rule.code,
self.line_no,
self.line_pos,
def source_signature(self) -> Tuple[Any, ...]:
"""Return hashable source signature for deduplication.

For linting errors we need to dedupe on more than just location and
description, we also need to check the edits potentially made, both
in the templated file but also in the source.
"""
fix_raws = tuple(
tuple(e.raw for e in f.edit) if f.edit else None for f in self.fixes
)
source_fixes = tuple(
tuple(tuple(e.source_fixes) for e in f.edit) if f.edit else None
for f in self.fixes
)
return (self.check_tuple(), self.description, fix_raws, source_fixes)

def __repr__(self):
return "<SQLLintError: rule {} pos:{!r}, #fixes: {}, description: {}>".format(
Expand Down
31 changes: 28 additions & 3 deletions src/sqlfluff/core/linter/linted_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,43 @@ def check_tuples(self, raise_on_non_linting_violations=True) -> List[CheckTuple]
"""Make a list of check_tuples.

This assumes that all the violations found are
linting violations (and therefore implement `check_tuple()`).
If they don't then this function raises that error.
linting violations. If they don't then this function
raises that error.
"""
vs: List[CheckTuple] = []
v: SQLLintError
for v in self.get_violations():
if hasattr(v, "check_tuple"):
if isinstance(v, SQLLintError):
vs.append(v.check_tuple())
elif raise_on_non_linting_violations:
raise v
return vs

@staticmethod
def deduplicate_in_source_space(
violations: List[SQLBaseError],
) -> List[SQLBaseError]:
"""Removes duplicates in the source space.

This is useful for templated files with loops, where we'll
get a violation for each pass around the loop, but the user
only cares about it once and we're only going to fix it once.

By filtering them early we get a more a more helpful CLI
output *and* and more efficient fixing routine (by handling
fewer fixes).
"""
new_violations = []
dedupe_buffer = set()
for v in violations:
signature = v.source_signature()
if signature not in dedupe_buffer:
new_violations.append(v)
dedupe_buffer.add(signature)
else:
linter_logger.debug("Removing duplicate source violation: %s", v)
return new_violations
Comment on lines +83 to +92
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👨🏻‍🍳 👌🏻


def get_violations(
self,
rules: Optional[Union[str, Tuple[str, ...]]] = None,
Expand Down
3 changes: 2 additions & 1 deletion src/sqlfluff/core/linter/linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,7 +735,8 @@ def lint_parsed(

linted_file = LintedFile(
parsed.fname,
violations,
# Deduplicate violations
LintedFile.deduplicate_in_source_space(violations),
time_dict,
tree,
ignore_mask=ignore_buff,
Expand Down
11 changes: 8 additions & 3 deletions test/core/linter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,9 @@ def test__linter__empty_file():
(
False,
[
("L006", 3, 16),
("L006", 3, 16),
# there are still two of each because L006 checks
# for both *before* and *after* the operator.
# The deduplication filter makes sure there aren't 4.
("L006", 3, 16),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even after deduplication, it looks like there are still 2 occurrences of:

("L006", 3, 16),

Any idea why?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - yeah I can see this is confusing. The rules we're checking is L006, which checks for whitespace before and after the operator. I've added a comment to explain this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deduplication means that we only get 2 rather than 4.

("L006", 3, 16),
("L006", 3, 39),
Expand All @@ -418,7 +419,11 @@ def test__linter__empty_file():
],
)
def test__linter__mask_templated_violations(ignore_templated_areas, check_tuples):
"""Test linter masks files properly around templated content."""
"""Test linter masks files properly around templated content.

NOTE: this also tests deduplication of fixes which have the same
source position. i.e. `LintedFile.deduplicate_in_source_space()`.
"""
lntr = Linter(
config=FluffConfig(
overrides={
Expand Down