Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement check for groupby slicing and aggregation patterns. #54

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions pandas_vet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def visit_Call(self, node):
self.errors.extend(check_for_arithmetic_methods(node))
self.errors.extend(check_for_comparison_methods(node))
self.errors.extend(check_for_read_table(node))
self.errors.extend(check_for_groupby_slicing_with_method(node))

def visit_Subscript(self, node):
"""
Expand All @@ -50,6 +51,7 @@ def visit_Subscript(self, node):
self.errors.extend(check_for_ix(node))
self.errors.extend(check_for_at(node))
self.errors.extend(check_for_iat(node))
self.errors.extend(check_for_groupby_slicing(node))

def visit_Attribute(self, node):
"""
Expand Down Expand Up @@ -228,6 +230,53 @@ def check_for_read_table(node: ast.Call) -> List:
return []


def check_for_groupby_slicing(node: ast.Subscript) -> List:
"""
Check for slicing operations when using of the `.groupby()` method.

This function will only be called when visiting a `ast.Subscript` node,
which indicates use of slicing syntax, and checks for use of slicing
syntax directly on a `.groupby()` method, e.g.,

df.groupby(A)[B]

Error/warning message to recommend use of the standard pattern below,
which can handle more generalized cases and returns consistent dataframe.

.groupby('group_cols').agg({'agg_cols': 'agg_funcs'})

"""
if isinstance(node.value, ast.Call):
if not hasattr(node.value.func, 'attr'): return []
if node.value.func.attr == 'groupby':
return[PD014(node.lineno, node.col_offset)]

return []


def check_for_groupby_slicing_with_method(node: ast.Call) -> List:
"""
Check for `.groupby()[].agg()` pattern.

This function will only be called when visiting a `ast.Call` node, and
checks for slicing syntax directly on a preceding `.groupby()` method, e.g.,

df.groupby(A)[B].agg(C)

Error/warning message to recommend use of the standard pattern below,
which can handle more generalized cases and returns consistent dataframe.

.groupby('group_cols').agg({'agg_cols': 'agg_funcs'})

"""
if not hasattr(node.func, 'value'): return []
if isinstance(node.func.value, ast.Subscript):
if node.func.value.value.func.attr == 'groupby':
return[PD014(node.lineno, node.col_offset)]

return []


error = namedtuple("Error", ["lineno", "col", "message", "type"])
VetError = partial(partial, error, type=VetPlugin)

Expand Down Expand Up @@ -270,3 +319,6 @@ def check_for_read_table(node: ast.Call) -> List:
PD013 = VetError(
message="PD013 '.melt' is preferred to '.stack'; provides same functionality"
)
PD014 = VetError(
message="PDO14 Use standard syntax '.groupby().agg({agg_col: agg_func})' instead of slicing"
)
207 changes: 207 additions & 0 deletions tests/test_PD014.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
"""
Test to check for use of the slicing syntax with `.groupby()` method

### 2019.03.07 TESTS DON'T YET EXIST FOR ALL OF THESE PATTERNS

Valid (?) patterns for `.groupby()` method:
groupby('group_col')
groupby('group_col').agg('agg_col')
groupby('group_col').agg('agg_func')
groupby('group_col').agg({'agg_col': 'agg_func'})
df.groupby('group_col')
df.groupby('group_col').agg('agg_col')
df.groupby('group_col').agg('agg_func')
df.groupby('group_col').agg({'agg_col': 'agg_func'})

Invalid patterns for `.groupby()` method:
groupby('group_col')['agg_col']
groupby('group_col')['agg_col'].agg('agg_func')
groupby('group_col')['agg_col'].agg_func()
df.groupby('group_col')['agg_col']
df.groupby('group_col')['agg_col'].agg('agg_func')
df.groupby('group_col')['agg_col'].agg_func()

NOTE:
For method calls, function name is in node.value.func.attr

### 2019.03.07 IS THERE VALID SYNTAX FOR A `groupby()` FUNCTION CALL ?

Valid (?) patterns for `groupby()` function:
groupby('group_col')
groupby('group_col').agg('agg_col')
groupby('group_col').agg('agg_func')
groupby('group_col').agg({'agg_col': 'agg_func'})

Invalid patterns for `groupby()` function:
groupby('group_col')['agg_col']
groupby('group_col')['agg_col'].agg('agg_func')
groupby('group_col')['agg_col'].agg_func()

NOTE:
For function calls, function name is in node.value.func.id
"""
import ast

from pandas_vet import VetPlugin
from pandas_vet import PD014


def test_PD014_pass_groupby_method_with_no_slicing():
"""
Test that using .groupby() without slicing does not result in an error.
"""
statement = "df.groupby('group_col')"
tree = ast.parse(statement)
actual = list(VetPlugin(tree).run())
expected = []
assert actual == expected


def test_pd014_pass_groupby_method_with_no_slicing_with_agg_method():
"""
test that .groupby().agg() without slicing does not result in an error.
"""
statement = "df.groupby('group_col').agg('agg_func')"
tree = ast.parse(statement)
actual = list(VetPlugin(tree).run())
expected = []
assert actual == expected


def test_PD014_pass_groupby_method_with_no_slicing_with_agg_columns():
"""
Test that .groupby().agg() without slicing does not result in an error.
"""
statement = "df.groupby('group_col').agg({'agg_col': 'agg_func'})"
tree = ast.parse(statement)
actual = list(VetPlugin(tree).run())
expected = []
assert actual == expected


def test_PD014_fail_groupby_method_with_slicing():
"""
Test that using .groupby()[] syntax results in an error.
"""
statement = "df.groupby('group_col')['agg_col']"
tree = ast.parse(statement)
actual = list(VetPlugin(tree).run())
expected = [PD014(1, 0)]
assert actual == expected


def test_PD014_fail_groupby_method_with_slicing_and_agg_method():
"""
Test that using .groupby()[].agg() syntax results in an error.
"""
statement = "df.groupby('group_col')['agg_col'].agg('agg_func')"
tree = ast.parse(statement)
actual = list(VetPlugin(tree).run())
expected = [PD014(1, 0)]
assert actual == expected


def test_PD014_fail_groupby_method_with_slicing_and_agg_func():
"""
test that using .groupby()[].agg_func() syntax results in an error.
"""
statement = "df.groupby('group_col')['agg_col'].agg_func()"
tree = ast.parse(statement)
actual = list(VetPlugin(tree).run())
expected = [PD014(1, 0)]
assert actual == expected


def test_PD014_pass_groupby_with_slicing_on_chained_method():
"""
test that using .groupby().somefunc[] syntax does not produce an error.
"""
statement = "df.groupby('group_col').somefunc(A)[B]"
tree = ast.parse(statement)
expected = []

actual = list(VetPlugin(tree).run())

assert actual == expected


def test_PD014_pass_groupby_on_sliced_object():
"""
test that using .somefunc()[].groupby() syntax does not produce an error.
"""
statement = "df.somefunc(A)[B].groupby('group_col')"
tree = ast.parse(statement)
expected = []

actual = list(VetPlugin(tree).run())

assert actual == expected


# Below functions test for use of `groupby() function, independent of dataframe
# DOES THIS MAKE ANY SENSE?
#
#def test_PD014_pass_groupby_function_with_no_slicing():
# """
# Test that using groupby() without slicing does not result in an error.
# """
# statement = "groupby('group_col')"
# tree = ast.parse(statement)
# actual = list(VetPlugin(tree).run())
# expected = []
# assert actual == expected
#
#
#def test_PD014_pass_groupby_function_with_no_slicing_with_agg_method():
# """
# Test that groupby().agg() without slicing does not result in an error.
# """
# statement = "groupby('group_col').agg('agg_func')"
# tree = ast.parse(statement)
# actual = list(VetPlugin(tree).run())
# expected = []
# assert actual == expected
#
#
#def test_PD014_pass_groupby_function_with_no_slicing_with_agg_columns():
# """
# Test that groupby().agg() without slicing does not result in an error.
# """
# statement = "groupby('group_col').agg({'agg_col': 'agg_func'})"
# tree = ast.parse(statement)
# actual = list(VetPlugin(tree).run())
# expected = []
# assert actual == expected
#
#
#def test_PD014_fail_groupby_function_with_slicing():
# """
# Test that using groupby()[] syntax results in an error.
# """
# statement = "groupby('group_col')['agg_col']"
# tree = ast.parse(statement)
# actual = list(VetPlugin(tree).run())
# expected = [PD014(1, 0)]
# assert actual == expected
#
#
#def test_PD014_fail_groupby_function_with_slicing_and_agg_method():
# """
# Test that using groupby()[].agg() syntax results in an error.
# """
# statement = "groupby('group_col')['agg_col'].agg('agg_func')"
# tree = ast.parse(statement)
# actual = list(VetPlugin(tree).run())
# expected = [PD014(1, 0)]
# assert actual == expected
#
#
#def test_PD014_fail_groupby_function_with_slicing_and_agg_func():
# """
# test that using groupby()[].agg_func() syntax results in an error.
# """
# statement = "groupby('group_col')['agg_col'].agg_func()"
# tree = ast.parse(statement)
# actual = list(VetPlugin(tree).run())
# expected = [PD014(1, 0)]
# assert actual == expected