STYLE use pd_array in core (pandas-dev#40319)

vladu · Apr 5, 2021 · 2f4755e · 2f4755e
1 parent 1c5c42e
commit 2f4755e
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 2 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -212,3 +212,10 @@ repos:
             |\#\ type:\s?ignore(?!\[)
         language: pygrep
         types: [python]
+    -   id: use-pd_array-in-core
+        name: Import pandas.array as pd_array in core
+        language: python
+        entry: python scripts/use_pd_array_in_core.py
+        files: ^pandas/core/
+        exclude: ^pandas/core/api\.py$
+        types: [python]
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -3023,7 +3023,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
     """
     from pandas import (
         DataFrame,
-        array,
+        array as pd_array,
     )
 
     regex = re.compile(pat, flags=flags)
@@ -3034,7 +3034,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
         result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
         name = _get_single_group_name(regex)
         # not dispatching, so we have to reconstruct here.
-        result = array(result, dtype=result_dtype)
+        result = pd_array(result, dtype=result_dtype)
     else:
         if isinstance(arr, ABCIndex):
             raise ValueError("only one regex group is supported with Index")

diff --git a/scripts/tests/test_use_pd_array_in_core.py b/scripts/tests/test_use_pd_array_in_core.py
@@ -0,0 +1,26 @@
+import pytest
+
+from scripts.use_pd_array_in_core import use_pd_array
+
+# Both BAD_* snippets place their violation on line 2, column 0, so they
+# produce the same "t.py:2:0" message checked in test_inconsistent_usage.
+BAD_FILE_0 = "import pandas as pd\npd.array"
+BAD_FILE_1 = "\nfrom pandas import array"
+# The GOOD_* snippets import array under the required pd_array alias.
+GOOD_FILE_0 = "from pandas import array as pd_array"
+GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
+# Path handed to use_pd_array; it shows up as the prefix of the error message.
+PATH = "t.py"
+
+
+@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
+def test_inconsistent_usage(content, capsys):
+    """Each bad snippet makes the check exit after printing the error to stdout."""
+    with pytest.raises(SystemExit):
+        use_pd_array(content, PATH)
+    # Read what the check wrote before it exited; only stdout is compared.
+    captured = capsys.readouterr()
+    expected = (
+        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
+    )
+    assert captured.out == expected
+
+
+@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1])
+def test_consistent_usage(content):
+    """Snippets that already use the pd_array alias pass without raising."""
+    use_pd_array(content=content, path=PATH)
diff --git a/scripts/use_pd_array_in_core.py b/scripts/use_pd_array_in_core.py
@@ -0,0 +1,77 @@
+"""
+Check that pandas/core imports pandas.array as pd_array.
+
+This makes it easier to grep for usage of pandas array.
+
+This is meant to be run as a pre-commit hook - to run it manually, you can do:
+
+    pre-commit run use-pd_array-in-core --all-files
+
+"""
+
+import argparse
+import ast
+import sys
+from typing import (
+    Optional,
+    Sequence,
+)
+
+# Template for the one line reported per violation. It is formatted with the
+# file path and the offending node's lineno / col_offset, and carries its own
+# trailing newline because it is emitted with sys.stdout.write.
+ERROR_MESSAGE = (
+    "{path}:{lineno}:{col_offset}: "
+    "Don't use pd.array in core, import array as pd_array instead\n"
+)
+
+
+class Visitor(ast.NodeVisitor):
+    """
+    Abort on the first use of ``pandas.array`` that is not spelled ``pd_array``.
+
+    Two spellings are rejected: ``from pandas... import array`` without the
+    ``pd_array`` alias, and attribute access written as ``pd.array``.
+
+    Parameters
+    ----------
+    path : str
+        File name used to prefix the reported error.
+    """
+
+    def __init__(self, path: str) -> None:
+        self.path = path
+
+    def _fail(self, node: ast.AST) -> None:
+        """Write the error for ``node``'s position to stdout, then exit with 1."""
+        msg = ERROR_MESSAGE.format(
+            path=self.path, lineno=node.lineno, col_offset=node.col_offset
+        )
+        sys.stdout.write(msg)
+        sys.exit(1)
+
+    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
+        # If array has been imported from somewhere in pandas,
+        # check it's aliased as pd_array.
+        # Match ``pandas`` itself or a ``pandas.`` submodule: a plain
+        # ``startswith("pandas")`` would also flag unrelated top-level
+        # packages whose name merely begins with "pandas".
+        # ``node.module`` is None for ``from . import x``.
+        module = node.module or ""
+        from_pandas = module == "pandas" or module.startswith("pandas.")
+        if from_pandas and any(
+            alias.name == "array" and alias.asname != "pd_array"
+            for alias in node.names
+        ):
+            self._fail(node)
+        super().generic_visit(node)
+
+    def visit_Attribute(self, node: ast.Attribute) -> None:
+        # Only the literal spelling ``pd.array`` is matched: attribute
+        # ``array`` on a bare name ``pd``.
+        value = node.value
+        if isinstance(value, ast.Name) and value.id == "pd" and node.attr == "array":
+            self._fail(node)
+        super().generic_visit(node)
+
+
+def use_pd_array(content: str, path: str) -> None:
+    """
+    Check one file's source for uses of ``pandas.array`` not named ``pd_array``.
+
+    Parameters
+    ----------
+    content : str
+        Python source to parse and inspect.
+    path : str
+        File name passed to the visitor for use in the error message.
+    """
+    module_ast = ast.parse(content)
+    Visitor(path).visit(module_ast)
+
+
+def main(argv: Optional[Sequence[str]] = None) -> None:
+    """
+    Run the check on every path given on the command line.
+
+    Parameters
+    ----------
+    argv : sequence of str, optional
+        Arguments handed to ``ArgumentParser.parse_args``.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("paths", nargs="*")
+
+    for filename in cli.parse_args(argv).paths:
+        with open(filename, encoding="utf-8") as handle:
+            source = handle.read()
+        use_pd_array(source, filename)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.cfg b/setup.cfg
@@ -140,6 +140,7 @@ omit =
     pandas/_typing.py
     pandas/_version.py
 plugins = Cython.Coverage
+source = pandas
 
 [coverage:report]
 ignore_errors = False