Skip to content

Commit

Permalink
Fixed SystemError: AST constructor recursion depth mismatch failing…
Browse files Browse the repository at this point in the history
… the entire job (#3000)

This PR adds more deterministic, Go-style, error handling for parsing
Python code

Fix #2976
  • Loading branch information
nfx authored Oct 17, 2024
1 parent 519df78 commit 8f6400f
Show file tree
Hide file tree
Showing 21 changed files with 510 additions and 267 deletions.
121 changes: 5 additions & 116 deletions src/databricks/labs/ucx/source_code/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
from pathlib import Path
from typing import Any, BinaryIO, TextIO

from astroid import AstroidSyntaxError, NodeNG # type: ignore
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError
from astroid import NodeNG # type: ignore
from sqlglot import Expression, parse as parse_sql
from sqlglot.errors import SqlglotError

from databricks.sdk.service import compute
from databricks.sdk.service.workspace import Language

from databricks.labs.blueprint.paths import WorkspacePath

from databricks.labs.ucx.source_code.python.python_ast import Tree

if sys.version_info >= (3, 11):
from typing import Self
Expand Down Expand Up @@ -137,12 +137,13 @@ class SqlLinter(Linter):

def lint(self, code: str) -> Iterable[Advice]:
try:
# TODO: unify with SqlParser.walk_expressions(...)
expressions = parse_sql(code, read='databricks')
for expression in expressions:
if not expression:
continue
yield from self.lint_expression(expression)
except SqlParseError as e:
except SqlglotError as e:
logger.debug(f"Failed to parse SQL: {code}", exc_info=e)
yield self.sql_parse_failure(code)

Expand All @@ -162,16 +163,6 @@ def sql_parse_failure(code: str) -> Failure:
def lint_expression(self, expression: Expression) -> Iterable[Advice]: ...


class PythonLinter(Linter):

def lint(self, code: str) -> Iterable[Advice]:
tree = Tree.normalize_and_parse(code)
yield from self.lint_tree(tree)

@abstractmethod
def lint_tree(self, tree: Tree) -> Iterable[Advice]: ...


class Fixer(ABC):

@property
Expand Down Expand Up @@ -271,20 +262,6 @@ class UsedTableNode:
node: NodeNG


class TablePyCollector(TableCollector, ABC):

def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
try:
tree = Tree.normalize_and_parse(source_code)
for table_node in self.collect_tables_from_tree(tree):
yield table_node.table
except AstroidSyntaxError as e:
logger.warning('syntax-error', exc_info=e)

@abstractmethod
def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: ...


class TableSqlCollector(TableCollector, ABC): ...


Expand All @@ -309,17 +286,6 @@ class DfsaCollector(ABC):
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: ...


class DfsaPyCollector(DfsaCollector, ABC):

def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
tree = Tree.normalize_and_parse(source_code)
for dfsa_node in self.collect_dfsas_from_tree(tree):
yield dfsa_node.dfsa

@abstractmethod
def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]: ...


class DfsaSqlCollector(DfsaCollector, ABC): ...


Expand Down Expand Up @@ -395,83 +361,6 @@ def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
yield from collector.collect_tables(source_code)


class PythonSequentialLinter(Linter, DfsaCollector, TableCollector):

def __init__(
self,
linters: list[PythonLinter],
dfsa_collectors: list[DfsaPyCollector],
table_collectors: list[TablePyCollector],
):
self._linters = linters
self._dfsa_collectors = dfsa_collectors
self._table_collectors = table_collectors
self._tree: Tree | None = None

def lint(self, code: str) -> Iterable[Advice]:
try:
tree = self._parse_and_append(code)
yield from self.lint_tree(tree)
except AstroidSyntaxError as e:
yield Failure('syntax-error', str(e), 0, 0, 0, 0)

def lint_tree(self, tree: Tree) -> Iterable[Advice]:
for linter in self._linters:
yield from linter.lint_tree(tree)

def _parse_and_append(self, code: str) -> Tree:
tree = Tree.normalize_and_parse(code)
self.append_tree(tree)
return tree

def append_tree(self, tree: Tree) -> None:
self._make_tree().append_tree(tree)

def append_nodes(self, nodes: list[NodeNG]) -> None:
self._make_tree().append_nodes(nodes)

def append_globals(self, globs: dict) -> None:
self._make_tree().append_globals(globs)

def process_child_cell(self, code: str) -> None:
try:
this_tree = self._make_tree()
tree = Tree.normalize_and_parse(code)
this_tree.append_tree(tree)
except AstroidSyntaxError as e:
# error already reported when linting enclosing notebook
logger.warning(f"Failed to parse Python cell: {code}", exc_info=e)

def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
try:
tree = self._parse_and_append(source_code)
for dfsa_node in self.collect_dfsas_from_tree(tree):
yield dfsa_node.dfsa
except AstroidSyntaxError as e:
logger.warning('syntax-error', exc_info=e)

def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]:
for collector in self._dfsa_collectors:
yield from collector.collect_dfsas_from_tree(tree)

def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
try:
tree = self._parse_and_append(source_code)
for table_node in self.collect_tables_from_tree(tree):
yield table_node.table
except AstroidSyntaxError as e:
logger.warning('syntax-error', exc_info=e)

def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]:
for collector in self._table_collectors:
yield from collector.collect_tables_from_tree(tree)

def _make_tree(self) -> Tree:
if self._tree is None:
self._tree = Tree.new_module()
return self._tree


SUPPORTED_EXTENSION_LANGUAGES = {
'.py': Language.PYTHON,
'.sql': Language.SQL,
Expand Down
11 changes: 7 additions & 4 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
SourceInfo,
UsedTable,
LineageAtom,
PythonSequentialLinter,
read_text,
)
from databricks.labs.ucx.source_code.directfs_access import (
Expand All @@ -52,7 +51,7 @@
)
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
from databricks.labs.ucx.source_code.python.python_ast import Tree
from databricks.labs.ucx.source_code.python.python_ast import Tree, PythonSequentialLinter
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
Expand Down Expand Up @@ -638,8 +637,12 @@ def _collect_from_notebook(
if cell.language is CellLanguage.PYTHON:
if inherited_tree is None:
inherited_tree = Tree.new_module()
tree = Tree.normalize_and_parse(cell.original_code)
inherited_tree.append_tree(tree)
maybe_tree = Tree.maybe_normalized_parse(cell.original_code)
if maybe_tree.failure:
logger.warning(maybe_tree.failure.message)
continue
assert maybe_tree.tree is not None
inherited_tree.append_tree(maybe_tree.tree)

def _collect_from_source(
self,
Expand Down
10 changes: 6 additions & 4 deletions src/databricks/labs/ucx/source_code/linters/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,18 @@
Linter,
SqlSequentialLinter,
CurrentSessionState,
PythonSequentialLinter,
PythonLinter,
SqlLinter,
TablePyCollector,
TableSqlCollector,
TableCollector,
DfsaCollector,
DfsaPyCollector,
DfsaSqlCollector,
)
from databricks.labs.ucx.source_code.python.python_ast import (
PythonLinter,
TablePyCollector,
DfsaPyCollector,
PythonSequentialLinter,
)
from databricks.labs.ucx.source_code.linters.directfs import DirectFsAccessPyLinter, DirectFsAccessSqlLinter
from databricks.labs.ucx.source_code.linters.imports import DbutilsPyLinter

Expand Down
10 changes: 7 additions & 3 deletions src/databricks/labs/ucx/source_code/linters/directfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@
Advice,
Deprecation,
CurrentSessionState,
PythonLinter,
SqlLinter,
DfsaPyCollector,
DirectFsAccessNode,
DfsaSqlCollector,
DirectFsAccess,
)
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor, TreeHelper
from databricks.labs.ucx.source_code.python.python_ast import (
Tree,
TreeVisitor,
TreeHelper,
PythonLinter,
DfsaPyCollector,
)
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
from databricks.labs.ucx.source_code.sql.sql_parser import SqlParser, SqlExpression

Expand Down
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/linters/imports.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
NodeNG,
)

from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState, PythonLinter
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor
from databricks.labs.ucx.source_code.base import Advice, Advisory, CurrentSessionState
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, TreeVisitor, PythonLinter
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
from databricks.labs.ucx.source_code.path_lookup import PathLookup

Expand Down
26 changes: 20 additions & 6 deletions src/databricks/labs/ucx/source_code/linters/pyspark.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,24 @@
Advisory,
Deprecation,
CurrentSessionState,
PythonLinter,
SqlLinter,
Fixer,
UsedTable,
UsedTableNode,
TablePyCollector,
TableSqlCollector,
DfsaPyCollector,
DfsaSqlCollector,
)
from databricks.labs.ucx.source_code.linters.directfs import DIRECT_FS_ACCESS_PATTERNS, DirectFsAccessNode
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper, MatchingVisitor
from databricks.labs.ucx.source_code.python.python_ast import (
Tree,
TreeHelper,
MatchingVisitor,
PythonLinter,
TablePyCollector,
DfsaPyCollector,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -408,7 +412,12 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
yield from matcher.lint(self._from_table, self._index, self._session_state, node)

def apply(self, code: str) -> str:
tree = Tree.parse(code)
maybe_tree = Tree.maybe_parse(code)
if not maybe_tree.tree:
assert maybe_tree.failure is not None
logger.warning(maybe_tree.failure.message)
return code
tree = maybe_tree.tree
# we won't be doing it like this in production, but for the sake of the example
for node in tree.walk():
matcher = self._find_matcher(node)
Expand Down Expand Up @@ -477,7 +486,12 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
def apply(self, code: str) -> str:
if not self._sql_fixer:
return code
tree = Tree.normalize_and_parse(code)
maybe_tree = Tree.maybe_normalized_parse(code)
if maybe_tree.failure:
logger.warning(maybe_tree.failure.message)
return code
assert maybe_tree.tree is not None
tree = maybe_tree.tree
for _call_node, query in self._visit_call_nodes(tree):
if not isinstance(query, Const) or not isinstance(query.value, str):
continue
Expand Down
3 changes: 1 addition & 2 deletions src/databricks/labs/ucx/source_code/linters/spark_connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
from databricks.labs.ucx.source_code.base import (
Advice,
Failure,
PythonLinter,
CurrentSessionState,
)
from databricks.sdk.service.compute import DataSecurityMode

from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper, PythonLinter


@dataclass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@

from databricks.labs.ucx.source_code.base import (
Advice,
PythonLinter,
)
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeHelper, PythonLinter


@dataclass
Expand Down
20 changes: 10 additions & 10 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pathlib import Path
from typing import cast

from astroid import AstroidSyntaxError, Module, NodeNG # type: ignore
from astroid import Module, NodeNG # type: ignore

from databricks.sdk.service.workspace import Language

Expand All @@ -17,7 +17,6 @@
Advice,
Failure,
Linter,
PythonSequentialLinter,
CurrentSessionState,
Advisory,
file_language,
Expand All @@ -37,7 +36,7 @@
UnresolvedPath,
)
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, PythonSequentialLinter
from databricks.labs.ucx.source_code.notebooks.cells import (
CellLanguage,
Cell,
Expand Down Expand Up @@ -196,13 +195,14 @@ def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) ->
continue

def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Iterable[Advice]:
try:
tree = Tree.normalize_and_parse(python_cell.original_code)
if register_trees:
self._python_trees[python_cell] = tree
yield from self._load_children_from_tree(tree)
except AstroidSyntaxError as e:
yield Failure('syntax-error', str(e), 0, 0, 0, 0)
maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code)
if maybe_tree.failure:
yield maybe_tree.failure
assert maybe_tree.tree is not None
tree = maybe_tree.tree
if register_trees:
self._python_trees[python_cell] = tree
yield from self._load_children_from_tree(tree)

def _load_children_from_tree(self, tree: Tree) -> Iterable[Advice]:
assert isinstance(tree.node, Module)
Expand Down
Loading

0 comments on commit 8f6400f

Please sign in to comment.