-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Change logic of direct filesystem access linting #2766
Changes from 2 commits
6b51cb1
fe0f942
51afb4d
e1eac0d
d6b3d6e
6bc6f7e
843b2a0
97d62fa
9a3beb6
d1f5cde
2c0aafd
ac586a5
7e50936
573bf4f
ca3c6ce
74cb5f9
4ccbae6
9bec2ad
21f83e7
56b2ce0
c7b7a0d
02fa0a6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -3,7 +3,7 @@ | |||||||||||||||||||||||
from abc import ABC | ||||||||||||||||||||||||
from collections.abc import Iterable | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
from astroid import Attribute, Call, Const, InferenceError, JoinedStr, Name, NodeNG # type: ignore | ||||||||||||||||||||||||
from astroid import Call, InferenceError, NodeNG # type: ignore | ||||||||||||||||||||||||
from sqlglot import Expression as SqlExpression, parse as parse_sql, ParseError as SqlParseError | ||||||||||||||||||||||||
from sqlglot.expressions import Alter, Create, Delete, Drop, Identifier, Insert, Literal, Select | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
|
@@ -72,43 +72,37 @@ class _DetectDirectFsAccessVisitor(TreeVisitor): | |||||||||||||||||||||||
def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates: bool) -> None: | ||||||||||||||||||||||||
self._session_state = session_state | ||||||||||||||||||||||||
self._directfs_nodes: list[DirectFsAccessNode] = [] | ||||||||||||||||||||||||
self._reported_locations: set[tuple[int, int]] = set() | ||||||||||||||||||||||||
self._prevent_spark_duplicates = prevent_spark_duplicates | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
def visit_call(self, node: Call): | ||||||||||||||||||||||||
for arg in node.args: | ||||||||||||||||||||||||
self._visit_arg(arg) | ||||||||||||||||||||||||
self._visit_arg(node, arg) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
def _visit_arg(self, arg: NodeNG): | ||||||||||||||||||||||||
def _visit_arg(self, call: Call, arg: NodeNG): | ||||||||||||||||||||||||
try: | ||||||||||||||||||||||||
for inferred in InferredValue.infer_from_node(arg, self._session_state): | ||||||||||||||||||||||||
if not inferred.is_inferred(): | ||||||||||||||||||||||||
logger.debug(f"Could not infer value of {arg.as_string()}") | ||||||||||||||||||||||||
continue | ||||||||||||||||||||||||
self._check_str_constant(arg, inferred) | ||||||||||||||||||||||||
self._check_str_arg(call, arg, inferred) | ||||||||||||||||||||||||
except InferenceError as e: | ||||||||||||||||||||||||
logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
def visit_const(self, node: Const): | ||||||||||||||||||||||||
# Constant strings yield Advisories | ||||||||||||||||||||||||
if isinstance(node.value, str): | ||||||||||||||||||||||||
self._check_str_constant(node, InferredValue([node])) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
def _check_str_constant(self, source_node: NodeNG, inferred: InferredValue): | ||||||||||||||||||||||||
if self._already_reported(source_node, inferred): | ||||||||||||||||||||||||
return | ||||||||||||||||||||||||
# don't report on JoinedStr fragments | ||||||||||||||||||||||||
if isinstance(source_node.parent, JoinedStr): | ||||||||||||||||||||||||
return | ||||||||||||||||||||||||
def _check_str_arg(self, call_node: Call, arg_node: NodeNG, inferred: InferredValue): | ||||||||||||||||||||||||
value = inferred.as_string() | ||||||||||||||||||||||||
for pattern in DIRECT_FS_ACCESS_PATTERNS: | ||||||||||||||||||||||||
if not pattern.matches(value): | ||||||||||||||||||||||||
continue | ||||||||||||||||||||||||
# avoid false positives with relative URLs | ||||||||||||||||||||||||
if self._is_http_call_parameter(source_node): | ||||||||||||||||||||||||
# only capture calls originating from spark or dbutils | ||||||||||||||||||||||||
# because there is no other known way to manipulate data directly from file system | ||||||||||||||||||||||||
is_from_spark = False | ||||||||||||||||||||||||
is_from_db_utils = Tree(call_node).is_from_module("dbutils") | ||||||||||||||||||||||||
if not is_from_db_utils: | ||||||||||||||||||||||||
is_from_spark = Tree(call_node).is_from_module("spark") | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why can There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They are independent, but if either is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you mean:
That is what I read: the "and" in the code with negations on both sides There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup sorry There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The purpose here is to minimize calls to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Could you add that as a comment? It is fine with me, just want to be sure the logic is correct though There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||||||||||||||||||||
if not is_from_db_utils and not is_from_spark: | ||||||||||||||||||||||||
return | ||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
i wonder if this logic is more readable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is but it requires making 2 calls to |
||||||||||||||||||||||||
# avoid duplicate advices that are reported by SparkSqlPyLinter | ||||||||||||||||||||||||
if self._prevent_spark_duplicates and Tree(source_node).is_from_module("spark"): | ||||||||||||||||||||||||
if self._prevent_spark_duplicates and is_from_spark: | ||||||||||||||||||||||||
return | ||||||||||||||||||||||||
# since we're normally filtering out spark calls, we're dealing with dfsas we know little about | ||||||||||||||||||||||||
# notably we don't know is_read or is_write | ||||||||||||||||||||||||
|
@@ -117,39 +111,8 @@ def _check_str_constant(self, source_node: NodeNG, inferred: InferredValue): | |||||||||||||||||||||||
is_read=True, | ||||||||||||||||||||||||
is_write=False, | ||||||||||||||||||||||||
) | ||||||||||||||||||||||||
self._directfs_nodes.append(DirectFsAccessNode(dfsa, source_node)) | ||||||||||||||||||||||||
self._reported_locations.add((source_node.lineno, source_node.col_offset)) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
@classmethod | ||||||||||||||||||||||||
def _is_http_call_parameter(cls, source_node: NodeNG): | ||||||||||||||||||||||||
if not isinstance(source_node.parent, Call): | ||||||||||||||||||||||||
return False | ||||||||||||||||||||||||
# for now we only cater for ws.api_client.do | ||||||||||||||||||||||||
return cls._is_ws_api_client_do_call(source_node) | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
@classmethod | ||||||||||||||||||||||||
def _is_ws_api_client_do_call(cls, source_node: NodeNG): | ||||||||||||||||||||||||
assert isinstance(source_node.parent, Call) | ||||||||||||||||||||||||
func = source_node.parent.func | ||||||||||||||||||||||||
if not isinstance(func, Attribute) or func.attrname != "do": | ||||||||||||||||||||||||
return False | ||||||||||||||||||||||||
expr = func.expr | ||||||||||||||||||||||||
if not isinstance(expr, Attribute) or expr.attrname != "api_client": | ||||||||||||||||||||||||
return False | ||||||||||||||||||||||||
expr = expr.expr | ||||||||||||||||||||||||
if not isinstance(expr, Name): | ||||||||||||||||||||||||
return False | ||||||||||||||||||||||||
for value in InferredValue.infer_from_node(expr): | ||||||||||||||||||||||||
if not value.is_inferred(): | ||||||||||||||||||||||||
continue | ||||||||||||||||||||||||
for node in value.nodes: | ||||||||||||||||||||||||
return Tree(node).is_instance_of("WorkspaceClient") | ||||||||||||||||||||||||
# at this point is seems safer to assume that expr.expr is a workspace than the opposite | ||||||||||||||||||||||||
return True | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
def _already_reported(self, source_node: NodeNG, inferred: InferredValue): | ||||||||||||||||||||||||
all_nodes = [source_node] + inferred.nodes | ||||||||||||||||||||||||
return any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes) | ||||||||||||||||||||||||
self._directfs_nodes.append(DirectFsAccessNode(dfsa, arg_node)) | ||||||||||||||||||||||||
return | ||||||||||||||||||||||||
|
||||||||||||||||||||||||
@property | ||||||||||||||||||||||||
def directfs_nodes(self): | ||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there's also
with open('/dbfs/mnt/x', 'r') as f: ...
, but we're not handling it here yet. make sure to add thatThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done