From da4df4fe65273eaec1c21af76eb951c29e7f6dd1 Mon Sep 17 00:00:00 2001
From: J van Zundert
Date: Sun, 16 Jul 2023 07:50:40 +0100
Subject: [PATCH] chore(python): Use Pathlib everywhere (#9914)

---
 py-polars/docs/source/conf.py              |  7 ++++---
 py-polars/polars/config.py                 |  8 ++++----
 py-polars/polars/io/_utils.py              |  2 +-
 py-polars/polars/lazyframe/frame.py        |  5 ++---
 py-polars/polars/utils/various.py          | 15 +++++++--------
 py-polars/pyproject.toml                   |  1 +
 py-polars/scripts/check_stacklevels.py     |  6 +++---
 py-polars/tests/benchmark/test_release.py  |  5 ++---
 py-polars/tests/unit/io/conftest.py        |  4 +---
 py-polars/tests/unit/io/test_csv.py        |  8 +++-----
 py-polars/tests/unit/io/test_database.py   |  7 ++-----
 py-polars/tests/unit/io/test_lazy_csv.py   | 12 +++++-------
 py-polars/tests/unit/io/test_lazy_json.py  |  4 ++--
 py-polars/tests/unit/io/test_other.py      | 12 ++++++------
 py-polars/tests/unit/io/test_parquet.py    |  6 ++----
 py-polars/tests/unit/streaming/conftest.py |  4 +---
 py-polars/tests/unit/test_cfg.py           |  8 +++-----
 py-polars/tests/unit/test_sql.py           |  5 ++---
 18 files changed, 51 insertions(+), 68 deletions(-)

diff --git a/py-polars/docs/source/conf.py b/py-polars/docs/source/conf.py
index 31916f3c6106..6c4f0891e9a6 100644
--- a/py-polars/docs/source/conf.py
+++ b/py-polars/docs/source/conf.py
@@ -16,11 +16,12 @@
 import re
 import sys
 import warnings
+from pathlib import Path

 import sphinx_autosummary_accessors

 # add polars directory
-sys.path.insert(0, os.path.abspath("../.."))
+sys.path.insert(0, str(Path("../..").resolve()))

 # -- Project information -----------------------------------------------------
@@ -200,8 +201,8 @@ def linkcode_resolve(domain, info):

     linespec = f"#L{lineno}-L{lineno + len(source) - 1}" if lineno else ""

-    conf_dir_path = os.path.dirname(os.path.realpath(__file__))
-    polars_root = os.path.abspath(f"{conf_dir_path}/../../polars")
+    conf_dir_path = Path(__file__).absolute().parent
+    polars_root = (conf_dir_path.parent.parent / "polars").absolute()

     fn = os.path.relpath(fn, start=polars_root)
     return f"{github_root}/blob/main/py-polars/polars/{fn}{linespec}"
diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py
index fc4e49f86929..2ef42ad14af3 100644
--- a/py-polars/polars/config.py
+++ b/py-polars/polars/config.py
@@ -156,7 +156,7 @@ def load(cls, cfg: Path | str) -> type[Config]:
         """
         options = json.loads(
             Path(normalise_filepath(cfg)).read_text()
-            if isinstance(cfg, Path) or os.path.exists(cfg)
+            if isinstance(cfg, Path) or Path(cfg).exists()
             else cfg
         )
         os.environ.update(options.get("environment", {}))
@@ -221,9 +221,9 @@ def save(cls, file: Path | str | None = None) -> str:
             separators=(",", ":"),
         )
         if isinstance(file, (str, Path)):
-            file = os.path.abspath(normalise_filepath(file))
-            Path(file).write_text(options)
-            return file
+            file = Path(normalise_filepath(file)).resolve()
+            file.write_text(options)
+            return str(file)

         return options
diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index a0bdceb7f20e..46d55343ec27 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -166,7 +166,7 @@ def managed_file(file: Any) -> Iterator[Any]:
     if isinstance(file, str):
         file = normalise_filepath(file, check_not_dir)
         if has_non_utf8_non_utf8_lossy_encoding:
-            with open(file, encoding=encoding_str) as f:
+            with Path(file).open(encoding=encoding_str) as f:
                 return _check_empty(
                     BytesIO(f.read().encode("utf8")), context=f"{file!r}"
                 )
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py
index 66d91f95fa87..f4ac1d5dced5 100644
--- a/py-polars/polars/lazyframe/frame.py
+++ b/py-polars/polars/lazyframe/frame.py
@@ -894,7 +894,7 @@ def show_graph(
         *,
         optimized: bool = True,
         show: bool = True,
-        output_path: str | None = None,
+        output_path: str | Path | None = None,
         raw_output: bool = False,
         figsize: tuple[float, float] = (16.0, 12.0),
         type_coercion: bool = True,
@@ -975,8 +975,7 @@ def show_graph(
             raise ImportError("Graphviz dot binary should be on your PATH") from None

         if output_path:
-            with Path(output_path).open(mode="wb") as file:
-                file.write(graph)
+            Path(output_path).write_bytes(graph)

         if not show:
             return None
diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py
index 69d3cb502fe6..d6820aaf3b23 100644
--- a/py-polars/polars/utils/various.py
+++ b/py-polars/polars/utils/various.py
@@ -1,12 +1,12 @@
 from __future__ import annotations

 import inspect
-import os
 import re
 import sys
 import warnings
 from collections.abc import MappingView, Sized
 from enum import Enum
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Iterable, Literal, Sequence, TypeVar

 import polars as pl
@@ -25,7 +25,6 @@

 if TYPE_CHECKING:
     from collections.abc import Reversible
-    from pathlib import Path

     from polars import DataFrame, Series
     from polars.type_aliases import PolarsDataType, PolarsIntegerType, SizeUnit
@@ -183,10 +182,10 @@ def can_create_dicts_with_pyarrow(dtypes: Sequence[PolarsDataType]) -> bool:

 def normalise_filepath(path: str | Path, check_not_directory: bool = True) -> str:
     """Create a string path, expanding the home directory if present."""
-    path = os.path.expanduser(path)
-    if check_not_directory and os.path.exists(path) and os.path.isdir(path):
+    path = Path(path).expanduser()
+    if check_not_directory and path.exists() and path.is_dir():
         raise IsADirectoryError(f"Expected a file path; {path!r} is a directory")
-    return path
+    return str(path)
@@ -358,15 +357,15 @@ def find_stacklevel() -> int:
     Taken from:
     https://github.com/pandas-dev/pandas/blob/ab89c53f48df67709a533b6a95ce3d911871a0a8/pandas/util/_exceptions.py#L30-L51
     """
-    pkg_dir = os.path.dirname(pl.__file__)
-    test_dir = os.path.join(pkg_dir, "tests")
+    pkg_dir = Path(pl.__file__).parent
+    test_dir = pkg_dir / "tests"

     # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
     frame = inspect.currentframe()
     n = 0
     while frame:
         fname = inspect.getfile(frame)
-        if fname.startswith(pkg_dir) and not fname.startswith(test_dir):
+        if fname.startswith(str(pkg_dir)) and not fname.startswith(str(test_dir)):
             frame = frame.f_back
             n += 1
         else:
diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
index ca01851a144f..724f6638ce6e 100644
--- a/py-polars/pyproject.toml
+++ b/py-polars/pyproject.toml
@@ -119,6 +119,7 @@ select = [
   "UP", # pyupgrade
   "PT", # flake8-pytest-style
   "RUF", # Ruff-specific rules
+  "PTH", # flake8-use-pathlib
 ]

 ignore = [
diff --git a/py-polars/scripts/check_stacklevels.py b/py-polars/scripts/check_stacklevels.py
index 93805063acad..2ff14283ea01 100644
--- a/py-polars/scripts/check_stacklevels.py
+++ b/py-polars/scripts/check_stacklevels.py
@@ -7,6 +7,7 @@
 import subprocess
 import sys
 from ast import NodeVisitor
+from pathlib import Path

 # Files in which it's OK to set the stacklevel manually.
 # `git ls-files` lists files with forwards-slashes
@@ -38,10 +39,9 @@ def visit_Call(self, node: ast.Call) -> None:
 for file in files:
     if file in EXCLUDE:
         continue
-    if not file.endswith(".py"):
+    if Path(file).suffix != ".py":
         continue
-    with open(file) as fd:
-        content = fd.read()
+    content = Path(file).read_text()
     tree = ast.parse(content)
     stacklevel_checker = StackLevelChecker(file)
     stacklevel_checker.visit(tree)
diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py
index bbea4df6b928..e6ebdac7dcbe 100644
--- a/py-polars/tests/benchmark/test_release.py
+++ b/py-polars/tests/benchmark/test_release.py
@@ -5,7 +5,6 @@
 To run these tests: pytest -m benchmark
 """

-import os
 import time
 from pathlib import Path
 from typing import cast
@@ -21,12 +20,12 @@


 @pytest.mark.skipif(
-    not (Path(os.path.dirname(__file__)) / "G1_1e7_1e2_5_0.csv").is_file(),
+    not (Path(__file__).parent / "G1_1e7_1e2_5_0.csv").is_file(),
     reason="Dataset must be generated before running this test.",
 )
 def test_read_scan_large_csv() -> None:
     filename = "G1_1e7_1e2_5_0.csv"
-    path = Path(os.path.dirname(__file__)) / filename
+    path = Path(__file__).parent / filename

     predicate = pl.col("v2") < 5
diff --git a/py-polars/tests/unit/io/conftest.py b/py-polars/tests/unit/io/conftest.py
index b488a9d29c44..fd174486b25f 100644
--- a/py-polars/tests/unit/io/conftest.py
+++ b/py-polars/tests/unit/io/conftest.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import os
 from pathlib import Path

 import pytest
@@ -8,5 +7,4 @@

 @pytest.fixture()
 def io_files_path() -> Path:
-    current_dir = os.path.dirname(__file__)
-    return Path(current_dir) / "files"
+    return Path(__file__).parent / "files"
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index f4a18436e751..8f065bdff851 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -379,8 +379,7 @@ def test_read_csv_encoding(tmp_path: Path) -> None:
     )

     file_path = tmp_path / "encoding.csv"
-    with open(file_path, "wb") as f:
-        f.write(bts)
+    file_path.write_bytes(bts)

     file_str = str(file_path)
     bytesio = io.BytesIO(bts)
@@ -487,9 +486,8 @@ def test_compressed_csv(io_files_path: Path) -> None:

 def test_partial_decompression(foods_file_path: Path) -> None:
     f_out = io.BytesIO()
-    with open(foods_file_path, "rb") as f_read:  # noqa: SIM117
-        with gzip.GzipFile(fileobj=f_out, mode="w") as f:
-            f.write(f_read.read())
+    with gzip.GzipFile(fileobj=f_out, mode="w") as f:
+        f.write(foods_file_path.read_bytes())

     csv_bytes = f_out.getvalue()
     for n_rows in [1, 5, 26]:
diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py
index a292af1217ca..4466a57761c6 100644
--- a/py-polars/tests/unit/io/test_database.py
+++ b/py-polars/tests/unit/io/test_database.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

-import os
 import sys
 from datetime import date
+from pathlib import Path
 from typing import TYPE_CHECKING

 import pytest
@@ -11,8 +11,6 @@
 from polars.testing import assert_frame_equal

 if TYPE_CHECKING:
-    from pathlib import Path
-
     from polars.type_aliases import (
         DbReadEngine,
         DbWriteEngine,
@@ -35,8 +33,7 @@ def sample_df() -> pl.DataFrame:
 def create_temp_sqlite_db(test_db: str) -> None:
     import sqlite3

-    if os.path.exists(test_db):
-        os.unlink(test_db)
+    Path(test_db).unlink(missing_ok=True)

     # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or
     # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that
diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py
index cd5aea1a1e05..2eaa730b0bc8 100644
--- a/py-polars/tests/unit/io/test_lazy_csv.py
+++ b/py-polars/tests/unit/io/test_lazy_csv.py
@@ -42,8 +42,7 @@ def test_invalid_utf8(tmp_path: Path) -> None:
     bts = bytes(np.random.randint(0, 255, 200))

     file_path = tmp_path / "nonutf8.csv"
-    with open(file_path, "wb") as f:
-        f.write(bts)
+    file_path.write_bytes(bts)

     a = pl.read_csv(file_path, has_header=False, encoding="utf8-lossy")
     b = pl.scan_csv(file_path, has_header=False, encoding="utf8-lossy").collect()
@@ -192,9 +191,8 @@ def test_glob_skip_rows(tmp_path: Path) -> None:
     for i in range(2):
         file_path = tmp_path / f"test_{i}.csv"
-        with open(file_path, "w") as f:
-            f.write(
-                f"""
+        file_path.write_text(
+            f"""
 metadata goes here
 file number {i}
 foo,bar,baz
@@ -202,7 +200,7 @@ def test_glob_skip_rows(tmp_path: Path) -> None:
 4,5,6
 7,8,9
 """
-            )
+        )
     file_path = tmp_path / "*.csv"
     assert pl.read_csv(file_path, skip_rows=2).to_dict(False) == {
         "foo": [1, 4, 7, 1, 4, 7],
@@ -227,7 +225,7 @@ def test_glob_n_rows(io_files_path: Path) -> None:
     }


-def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: str) -> None:
+def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: Path) -> None:
     df = (
         pl.scan_csv(
             foods_file_path,
diff --git a/py-polars/tests/unit/io/test_lazy_json.py b/py-polars/tests/unit/io/test_lazy_json.py
index 924d59aba717..8c16e9039e2c 100644
--- a/py-polars/tests/unit/io/test_lazy_json.py
+++ b/py-polars/tests/unit/io/test_lazy_json.py
@@ -57,8 +57,8 @@ def test_scan_with_projection(tmp_path: Path) -> None:
     json_bytes = bytes(json, "utf-8")

     file_path = tmp_path / "escape_chars.json"
-    with open(file_path, "wb") as f:
-        f.write(json_bytes)
+    file_path.write_bytes(json_bytes)
+
     actual = pl.scan_ndjson(file_path).select(["id", "text"]).collect()
     expected = pl.DataFrame(
diff --git a/py-polars/tests/unit/io/test_other.py b/py-polars/tests/unit/io/test_other.py
index 7e7746b9ef98..8b068708bdc4 100644
--- a/py-polars/tests/unit/io/test_other.py
+++ b/py-polars/tests/unit/io/test_other.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import copy
-import os.path
+from pathlib import Path
 from typing import cast

 import polars as pl
@@ -51,8 +51,8 @@ def test_unit_io_subdir_has_no_init() -> None:
     # --------------------------------------------------------------------------------
     # TLDR: it can mask the builtin 'io' module, causing a fatal python error.
     # --------------------------------------------------------------------------------
-    io_dir = os.path.dirname(__file__)
-    assert io_dir.endswith(f"unit{os.path.sep}io")
-    assert not os.path.exists(
-        f"{io_dir}{os.path.sep}__init__.py"
-    ), "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory"
+    io_dir = Path(__file__).parent
+    assert io_dir.parts[-2:] == ("unit", "io")
+    assert not (
+        io_dir / "__init__.py"
+    ).exists(), "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory"
diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py
index 556a3efbbf39..b57765242a24 100644
--- a/py-polars/tests/unit/io/test_parquet.py
+++ b/py-polars/tests/unit/io/test_parquet.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

 import io
-import os
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import TYPE_CHECKING

 import numpy as np
@@ -19,8 +19,6 @@
 )

 if TYPE_CHECKING:
-    from pathlib import Path
-
     from polars.type_aliases import ParquetCompression
@@ -513,7 +511,7 @@ def test_parquet_string_cache() -> None:

 def test_tz_aware_parquet_9586() -> None:
     result = pl.read_parquet(
-        os.path.join("tests", "unit", "io", "files", "tz_aware.parquet")
+        Path("tests") / "unit" / "io" / "files" / "tz_aware.parquet"
     )
     expected = pl.DataFrame(
         {"UTC_DATETIME_ID": [datetime(2023, 6, 26, 14, 15, 0, tzinfo=timezone.utc)]}
diff --git a/py-polars/tests/unit/streaming/conftest.py b/py-polars/tests/unit/streaming/conftest.py
index 31e98521a2a2..b7b476474316 100644
--- a/py-polars/tests/unit/streaming/conftest.py
+++ b/py-polars/tests/unit/streaming/conftest.py
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path

 import pytest
@@ -6,5 +5,4 @@

 @pytest.fixture()
 def io_files_path() -> Path:
-    current_dir = os.path.dirname(__file__)
-    return Path(current_dir) / ".." / "io" / "files"
+    return Path(__file__).parent.parent / "io" / "files"
diff --git a/py-polars/tests/unit/test_cfg.py b/py-polars/tests/unit/test_cfg.py
index 1027a739fd51..1d1be35d45dc 100644
--- a/py-polars/tests/unit/test_cfg.py
+++ b/py-polars/tests/unit/test_cfg.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 import os
-from typing import TYPE_CHECKING, Iterator
+from pathlib import Path
+from typing import Iterator

 import pytest
@@ -10,9 +11,6 @@
 from polars.exceptions import StringCacheMismatchError
 from polars.testing import assert_frame_equal

-if TYPE_CHECKING:
-    from pathlib import Path
-

 @pytest.fixture(autouse=True)
 def _environ() -> Iterator[None]:
@@ -531,7 +529,7 @@ def test_config_load_save(tmp_path: Path) -> None:

         # ...load back from config...
         if file is not None:
-            assert os.path.isfile(cfg)
+            assert Path(cfg).is_file()
         pl.Config.load(cfg)

         # ...and confirm the saved options were set.
diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py
index 86a78ddddfd2..5f328390ac1b 100644
--- a/py-polars/tests/unit/test_sql.py
+++ b/py-polars/tests/unit/test_sql.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import os
 import warnings
 from pathlib import Path

@@ -13,8 +12,8 @@

 # TODO: Do not rely on I/O for these tests
 @pytest.fixture()
-def foods_ipc_path() -> str:
-    return str(Path(os.path.dirname(__file__)) / "io" / "files" / "foods1.ipc")
+def foods_ipc_path() -> Path:
+    return Path(__file__).parent / "io" / "files" / "foods1.ipc"


 def test_sql_cast() -> None:
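The substitutions above follow a small, recurring set of os.path-to-pathlib equivalences. Below is a minimal, self-contained sketch of those equivalences, not part of the patch itself; the `example_file` path and the printed checks are hypothetical and only illustrate the pattern.

```python
from pathlib import Path

# Hypothetical path, used only to illustrate the substitutions made in this patch.
example_file = Path("~/data/example.csv").expanduser()  # was: os.path.expanduser("~/data/example.csv")

# Equivalences applied repeatedly in the diff:
#   os.path.dirname(__file__)             -> Path(__file__).parent
#   os.path.join(a, "b", "c")             -> Path(a) / "b" / "c"
#   os.path.abspath(p)                    -> Path(p).resolve() (or .absolute(), as in conf.py)
#   os.path.exists(p) / os.path.isdir(p)  -> Path(p).exists() / Path(p).is_dir()
#   open(p, "wb") ... f.write(data)       -> Path(p).write_bytes(data)
#   open(p) ... f.read()                  -> Path(p).read_text()

pkg_dir = Path(__file__).parent           # directory containing this script
data_dir = pkg_dir / "files"              # joins path segments with "/" instead of os.path.join

print(example_file.suffix == ".csv")      # extension check, like the .endswith(".py") replacement
if example_file.exists() and not example_file.is_dir():
    print(example_file.read_text()[:100])  # read contents without an explicit open()
```

One behavioural note worth keeping in mind when reviewing such conversions: `Path.resolve()` also resolves symlinks, whereas `os.path.abspath` does not, which is why some call sites in the patch use `.absolute()` instead.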