Skip to content

Commit

Permalink
STYLE use pd_array in core (pandas-dev#40319)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli authored and vladu committed Apr 5, 2021
1 parent 1c5c42e commit 2f4755e
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 2 deletions.
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,10 @@ repos:
|\#\ type:\s?ignore(?!\[)
language: pygrep
types: [python]
# Local hook: runs scripts/use_pd_array_in_core.py on Python files under
# pandas/core/ (pandas/core/api.py is excluded) to enforce that pandas.array
# is imported under the alias pd_array.
- id: use-pd_array-in-core
name: Import pandas.array as pd_array in core
language: python
entry: python scripts/use_pd_array_in_core.py
files: ^pandas/core/
exclude: ^pandas/core/api\.py$
types: [python]
4 changes: 2 additions & 2 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3023,7 +3023,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
"""
from pandas import (
DataFrame,
array,
array as pd_array,
)

regex = re.compile(pat, flags=flags)
Expand All @@ -3034,7 +3034,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
name = _get_single_group_name(regex)
# not dispatching, so we have to reconstruct here.
result = array(result, dtype=result_dtype)
result = pd_array(result, dtype=result_dtype)
else:
if isinstance(arr, ABCIndex):
raise ValueError("only one regex group is supported with Index")
Expand Down
26 changes: 26 additions & 0 deletions scripts/tests/test_use_pd_array_in_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pytest

from scripts.use_pd_array_in_core import use_pd_array

# Sources that must be rejected. In both, the offending code sits on line 2 at
# column 0, so a single expected message ("t.py:2:0: ...") covers both cases.
# BAD_FILE_0: attribute access ``pd.array``.
BAD_FILE_0 = "import pandas as pd\npd.array"
# BAD_FILE_1: ``array`` imported from pandas without the ``pd_array`` alias.
BAD_FILE_1 = "\nfrom pandas import array"
# Sources that must be accepted: ``array`` is aliased as ``pd_array``, whether
# it is imported from the top-level package or from a pandas submodule.
GOOD_FILE_0 = "from pandas import array as pd_array"
GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
# Path handed to the checker; it is only used as the prefix of the message.
PATH = "t.py"


@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
def test_inconsistent_usage(content, capsys):
    """Un-aliased use of pandas.array is reported on stdout and exits with 1."""
    expected_msg = (
        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
    )
    with pytest.raises(SystemExit) as exc_info:
        use_pd_array(content, PATH)
    # A non-zero status is what makes pre-commit treat the hook as failed,
    # so pin the exit code and not just the exception type.
    assert exc_info.value.code == 1
    captured_out, _ = capsys.readouterr()
    assert captured_out == expected_msg


@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1])
def test_consistent_usage(content):
    """Sources that alias ``array`` as ``pd_array`` pass the check."""
    # Returning normally (no SystemExit) is the whole assertion here.
    use_pd_array(content=content, path=PATH)
77 changes: 77 additions & 0 deletions scripts/use_pd_array_in_core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Check that pandas/core imports pandas.array as pd_array.
This makes it easier to grep for usage of pandas array.
This is meant to be run as a pre-commit hook - to run it manually, you can do:
pre-commit run use-pd_array-in-core --all-files
"""

import argparse
import ast
import sys
from typing import (
Optional,
Sequence,
)

ERROR_MESSAGE = (
    "{path}:{lineno}:{col_offset}: "
    "Don't use pd.array in core, import array as pd_array instead\n"
)


class Visitor(ast.NodeVisitor):
    """
    AST visitor that rejects un-aliased uses of ``pandas.array``.

    Two patterns are flagged:

    * ``from pandas[.submodule] import array`` without ``as pd_array``
    * attribute access spelled ``pd.array``

    On the first offending node, a ``path:lineno:col_offset`` message is
    written to stdout and the process exits with status 1.

    Parameters
    ----------
    path : str
        File name used as the prefix of the reported message.
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def _report_and_exit(self, node: ast.AST) -> None:
        """Write the error message for ``node`` to stdout and exit with 1."""
        msg = ERROR_MESSAGE.format(
            path=self.path, lineno=node.lineno, col_offset=node.col_offset
        )
        sys.stdout.write(msg)
        sys.exit(1)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Flag ``from pandas... import array`` unless aliased as pd_array."""
        module = node.module
        # Match the pandas package and its submodules only. A bare
        # ``startswith("pandas")`` would also catch unrelated distributions
        # whose names merely begin with "pandas" (e.g. ``pandasql``).
        # ``module`` is None for ``from . import x`` style imports.
        from_pandas = module is not None and (
            module == "pandas" or module.startswith("pandas.")
        )
        if from_pandas and any(
            alias.name == "array" and alias.asname != "pd_array"
            for alias in node.names
        ):
            self._report_and_exit(node)
        self.generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """Flag attribute access spelled exactly ``pd.array``."""
        owner = node.value
        if (
            isinstance(owner, ast.Name)
            and owner.id == "pd"
            and node.attr == "array"
        ):
            self._report_and_exit(node)
        self.generic_visit(node)


def use_pd_array(content: str, path: str) -> None:
    """
    Check one file's source for un-aliased use of ``pandas.array``.

    Parameters
    ----------
    content : str
        Python source code to parse.
    path : str
        File name, passed to the visitor for use in its error message.

    Raises
    ------
    SystemExit
        Raised by ``Visitor`` when it finds an offending node.
    """
    Visitor(path).visit(ast.parse(content))


def main(argv: Optional[Sequence[str]] = None) -> None:
    """
    Run the pd_array check over every file named on the command line.

    Parameters
    ----------
    argv : sequence of str, optional
        Arguments to parse; ``None`` lets argparse fall back to ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("paths", nargs="*")
    filenames = cli.parse_args(argv).paths

    for filename in filenames:
        with open(filename, encoding="utf-8") as handle:
            source = handle.read()
        use_pd_array(source, filename)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ omit =
pandas/_typing.py
pandas/_version.py
plugins = Cython.Coverage
source = pandas

[coverage:report]
ignore_errors = False
Expand Down

0 comments on commit 2f4755e

Please sign in to comment.