From da4df4fe65273eaec1c21af76eb951c29e7f6dd1 Mon Sep 17 00:00:00 2001
From: J van Zundert
Date: Sun, 16 Jul 2023 07:50:40 +0100
Subject: [PATCH] chore(python): Use Pathlib everywhere (#9914)

---
 py-polars/docs/source/conf.py              |  7 ++++---
 py-polars/polars/config.py                 |  8 ++++----
 py-polars/polars/io/_utils.py              |  2 +-
 py-polars/polars/lazyframe/frame.py        |  5 ++---
 py-polars/polars/utils/various.py          | 15 +++++++--------
 py-polars/pyproject.toml                   |  1 +
 py-polars/scripts/check_stacklevels.py     |  6 +++---
 py-polars/tests/benchmark/test_release.py  |  5 ++---
 py-polars/tests/unit/io/conftest.py        |  4 +---
 py-polars/tests/unit/io/test_csv.py        |  8 +++-----
 py-polars/tests/unit/io/test_database.py   |  7 ++-----
 py-polars/tests/unit/io/test_lazy_csv.py   | 12 +++++-------
 py-polars/tests/unit/io/test_lazy_json.py  |  4 ++--
 py-polars/tests/unit/io/test_other.py      | 12 ++++++------
 py-polars/tests/unit/io/test_parquet.py    |  6 ++----
 py-polars/tests/unit/streaming/conftest.py |  4 +---
 py-polars/tests/unit/test_cfg.py           |  8 +++-----
 py-polars/tests/unit/test_sql.py           |  5 ++---
 18 files changed, 51 insertions(+), 68 deletions(-)

diff --git a/py-polars/docs/source/conf.py b/py-polars/docs/source/conf.py
index 31916f3c6106..6c4f0891e9a6 100644
--- a/py-polars/docs/source/conf.py
+++ b/py-polars/docs/source/conf.py
@@ -16,11 +16,12 @@
 import re
 import sys
 import warnings
+from pathlib import Path

 import sphinx_autosummary_accessors

 # add polars directory
-sys.path.insert(0, os.path.abspath("../.."))
+sys.path.insert(0, str(Path("../..").resolve()))

 # -- Project information -----------------------------------------------------
@@ -200,8 +201,8 @@ def linkcode_resolve(domain, info):

     linespec = f"#L{lineno}-L{lineno + len(source) - 1}" if lineno else ""

-    conf_dir_path = os.path.dirname(os.path.realpath(__file__))
-    polars_root = os.path.abspath(f"{conf_dir_path}/../../polars")
+    conf_dir_path = Path(__file__).absolute().parent
+    polars_root = (conf_dir_path.parent.parent / "polars").absolute()

     fn = os.path.relpath(fn, start=polars_root)
     return f"{github_root}/blob/main/py-polars/polars/{fn}{linespec}"
diff --git a/py-polars/polars/config.py b/py-polars/polars/config.py
index fc4e49f86929..2ef42ad14af3 100644
--- a/py-polars/polars/config.py
+++ b/py-polars/polars/config.py
@@ -156,7 +156,7 @@ def load(cls, cfg: Path | str) -> type[Config]:
         """
         options = json.loads(
             Path(normalise_filepath(cfg)).read_text()
-            if isinstance(cfg, Path) or os.path.exists(cfg)
+            if isinstance(cfg, Path) or Path(cfg).exists()
             else cfg
         )
         os.environ.update(options.get("environment", {}))
@@ -221,9 +221,9 @@ def save(cls, file: Path | str | None = None) -> str:
             separators=(",", ":"),
         )
         if isinstance(file, (str, Path)):
-            file = os.path.abspath(normalise_filepath(file))
-            Path(file).write_text(options)
-            return file
+            file = Path(normalise_filepath(file)).resolve()
+            file.write_text(options)
+            return str(file)

         return options
diff --git a/py-polars/polars/io/_utils.py b/py-polars/polars/io/_utils.py
index a0bdceb7f20e..46d55343ec27 100644
--- a/py-polars/polars/io/_utils.py
+++ b/py-polars/polars/io/_utils.py
@@ -166,7 +166,7 @@ def managed_file(file: Any) -> Iterator[Any]:
     if isinstance(file, str):
         file = normalise_filepath(file, check_not_dir)
         if has_non_utf8_non_utf8_lossy_encoding:
-            with open(file, encoding=encoding_str) as f:
+            with Path(file).open(encoding=encoding_str) as f:
                 return _check_empty(
                     BytesIO(f.read().encode("utf8")), context=f"{file!r}"
                 )
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py
index 66d91f95fa87..f4ac1d5dced5 100644
--- a/py-polars/polars/lazyframe/frame.py
+++ b/py-polars/polars/lazyframe/frame.py
@@ -894,7 +894,7 @@ def show_graph(
         *,
         optimized: bool = True,
         show: bool = True,
-        output_path: str | None = None,
+        output_path: str | Path | None = None,
         raw_output: bool = False,
         figsize: tuple[float, float] = (16.0, 12.0),
         type_coercion: bool = True,
@@ -975,8 +975,7 @@ def show_graph(
             raise ImportError("Graphviz dot binary should be on your PATH") from None

         if output_path:
-            with Path(output_path).open(mode="wb") as file:
-                file.write(graph)
+            Path(output_path).write_bytes(graph)

         if not show:
             return None
diff --git a/py-polars/polars/utils/various.py b/py-polars/polars/utils/various.py
index 69d3cb502fe6..d6820aaf3b23 100644
--- a/py-polars/polars/utils/various.py
+++ b/py-polars/polars/utils/various.py
@@ -1,12 +1,12 @@
 from __future__ import annotations

 import inspect
-import os
 import re
 import sys
 import warnings
 from collections.abc import MappingView, Sized
 from enum import Enum
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Iterable, Literal, Sequence, TypeVar

 import polars as pl
@@ -25,7 +25,6 @@

 if TYPE_CHECKING:
     from collections.abc import Reversible
-    from pathlib import Path

     from polars import DataFrame, Series
     from polars.type_aliases import PolarsDataType, PolarsIntegerType, SizeUnit
@@ -183,10 +182,10 @@ def can_create_dicts_with_pyarrow(dtypes: Sequence[PolarsDataType]) -> bool:

 def normalise_filepath(path: str | Path, check_not_directory: bool = True) -> str:
     """Create a string path, expanding the home directory if present."""
-    path = os.path.expanduser(path)
-    if check_not_directory and os.path.exists(path) and os.path.isdir(path):
+    path = Path(path).expanduser()
+    if check_not_directory and path.exists() and path.is_dir():
         raise IsADirectoryError(f"Expected a file path; {path!r} is a directory")
-    return path
+    return str(path)
@@ -358,15 +357,15 @@ def find_stacklevel() -> int:
     Taken from:
     https://github.com/pandas-dev/pandas/blob/ab89c53f48df67709a533b6a95ce3d911871a0a8/pandas/util/_exceptions.py#L30-L51
     """
-    pkg_dir = os.path.dirname(pl.__file__)
-    test_dir = os.path.join(pkg_dir, "tests")
+    pkg_dir = Path(pl.__file__).parent
+    test_dir = pkg_dir / "tests"

     # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
     frame = inspect.currentframe()
     n = 0
     while frame:
         fname = inspect.getfile(frame)
-        if fname.startswith(pkg_dir) and not fname.startswith(test_dir):
+        if fname.startswith(str(pkg_dir)) and not fname.startswith(str(test_dir)):
             frame = frame.f_back
             n += 1
         else:
diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
index ca01851a144f..724f6638ce6e 100644
--- a/py-polars/pyproject.toml
+++ b/py-polars/pyproject.toml
@@ -119,6 +119,7 @@ select = [
   "UP", # pyupgrade
   "PT", # flake8-pytest-style
   "RUF", # Ruff-specific rules
+  "PTH", # flake8-use-pathlib
 ]

 ignore = [
diff --git a/py-polars/scripts/check_stacklevels.py b/py-polars/scripts/check_stacklevels.py
index 93805063acad..2ff14283ea01 100644
--- a/py-polars/scripts/check_stacklevels.py
+++ b/py-polars/scripts/check_stacklevels.py
@@ -7,6 +7,7 @@
 import subprocess
 import sys
 from ast import NodeVisitor
+from pathlib import Path

 # Files in which it's OK to set the stacklevel manually.
 # `git ls-files` lists files with forwards-slashes
@@ -38,10 +39,9 @@ def visit_Call(self, node: ast.Call) -> None:
 for file in files:
     if file in EXCLUDE:
         continue
-    if not file.endswith(".py"):
+    if Path(file).suffix != ".py":
         continue
-    with open(file) as fd:
-        content = fd.read()
+    content = Path(file).read_text()
     tree = ast.parse(content)
     stacklevel_checker = StackLevelChecker(file)
     stacklevel_checker.visit(tree)
diff --git a/py-polars/tests/benchmark/test_release.py b/py-polars/tests/benchmark/test_release.py
index bbea4df6b928..e6ebdac7dcbe 100644
--- a/py-polars/tests/benchmark/test_release.py
+++ b/py-polars/tests/benchmark/test_release.py
@@ -5,7 +5,6 @@
 To run these tests: pytest -m benchmark
 """

-import os
 import time
 from pathlib import Path
 from typing import cast
@@ -21,12 +20,12 @@


 @pytest.mark.skipif(
-    not (Path(os.path.dirname(__file__)) / "G1_1e7_1e2_5_0.csv").is_file(),
+    not (Path(__file__).parent / "G1_1e7_1e2_5_0.csv").is_file(),
     reason="Dataset must be generated before running this test.",
 )
 def test_read_scan_large_csv() -> None:
     filename = "G1_1e7_1e2_5_0.csv"
-    path = Path(os.path.dirname(__file__)) / filename
+    path = Path(__file__).parent / filename

     predicate = pl.col("v2") < 5
diff --git a/py-polars/tests/unit/io/conftest.py b/py-polars/tests/unit/io/conftest.py
index b488a9d29c44..fd174486b25f 100644
--- a/py-polars/tests/unit/io/conftest.py
+++ b/py-polars/tests/unit/io/conftest.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import os
 from pathlib import Path

 import pytest
@@ -8,5 +7,4 @@

 @pytest.fixture()
 def io_files_path() -> Path:
-    current_dir = os.path.dirname(__file__)
-    return Path(current_dir) / "files"
+    return Path(__file__).parent / "files"
diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py
index f4a18436e751..8f065bdff851 100644
--- a/py-polars/tests/unit/io/test_csv.py
+++ b/py-polars/tests/unit/io/test_csv.py
@@ -379,8 +379,7 @@ def test_read_csv_encoding(tmp_path: Path) -> None:
     )

     file_path = tmp_path / "encoding.csv"
-    with open(file_path, "wb") as f:
-        f.write(bts)
+    file_path.write_bytes(bts)

     file_str = str(file_path)
     bytesio = io.BytesIO(bts)
@@ -487,9 +486,8 @@ def test_compressed_csv(io_files_path: Path) -> None:

 def test_partial_decompression(foods_file_path: Path) -> None:
     f_out = io.BytesIO()
-    with open(foods_file_path, "rb") as f_read:  # noqa: SIM117
-        with gzip.GzipFile(fileobj=f_out, mode="w") as f:
-            f.write(f_read.read())
+    with gzip.GzipFile(fileobj=f_out, mode="w") as f:
+        f.write(foods_file_path.read_bytes())

     csv_bytes = f_out.getvalue()
     for n_rows in [1, 5, 26]:
diff --git a/py-polars/tests/unit/io/test_database.py b/py-polars/tests/unit/io/test_database.py
index a292af1217ca..4466a57761c6 100644
--- a/py-polars/tests/unit/io/test_database.py
+++ b/py-polars/tests/unit/io/test_database.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

-import os
 import sys
 from datetime import date
+from pathlib import Path
 from typing import TYPE_CHECKING

 import pytest
@@ -11,8 +11,6 @@
 from polars.testing import assert_frame_equal

 if TYPE_CHECKING:
-    from pathlib import Path
-
     from polars.type_aliases import (
         DbReadEngine,
         DbWriteEngine,
@@ -35,8 +33,7 @@ def sample_df() -> pl.DataFrame:
 def create_temp_sqlite_db(test_db: str) -> None:
     import sqlite3

-    if os.path.exists(test_db):
-        os.unlink(test_db)
+    Path(test_db).unlink(missing_ok=True)

     # NOTE: at the time of writing adcb/connectorx have weak SQLite support (poor or
     # no bool/date/datetime dtypes, for example) and there is a bug in connectorx that
diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py
index cd5aea1a1e05..2eaa730b0bc8 100644
--- a/py-polars/tests/unit/io/test_lazy_csv.py
+++ b/py-polars/tests/unit/io/test_lazy_csv.py
@@ -42,8 +42,7 @@ def test_invalid_utf8(tmp_path: Path) -> None:
     bts = bytes(np.random.randint(0, 255, 200))

     file_path = tmp_path / "nonutf8.csv"
-    with open(file_path, "wb") as f:
-        f.write(bts)
+    file_path.write_bytes(bts)

     a = pl.read_csv(file_path, has_header=False, encoding="utf8-lossy")
     b = pl.scan_csv(file_path, has_header=False, encoding="utf8-lossy").collect()
@@ -192,9 +191,8 @@ def test_glob_skip_rows(tmp_path: Path) -> None:
     for i in range(2):
         file_path = tmp_path / f"test_{i}.csv"
-        with open(file_path, "w") as f:
-            f.write(
-                f"""
+        file_path.write_text(
+            f"""
 metadata goes here
 file number {i}
 foo,bar,baz
@@ -202,7 +200,7 @@ def test_glob_skip_rows(tmp_path: Path) -> None:
 4,5,6
 7,8,9
 """
-            )
+        )
     file_path = tmp_path / "*.csv"
     assert pl.read_csv(file_path, skip_rows=2).to_dict(False) == {
         "foo": [1, 4, 7, 1, 4, 7],
@@ -227,7 +225,7 @@ def test_glob_n_rows(io_files_path: Path) -> None:
     }


-def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: str) -> None:
+def test_scan_csv_schema_overwrite_not_projected_8483(foods_file_path: Path) -> None:
     df = (
         pl.scan_csv(
             foods_file_path,
diff --git a/py-polars/tests/unit/io/test_lazy_json.py b/py-polars/tests/unit/io/test_lazy_json.py
index 924d59aba717..8c16e9039e2c 100644
--- a/py-polars/tests/unit/io/test_lazy_json.py
+++ b/py-polars/tests/unit/io/test_lazy_json.py
@@ -57,8 +57,8 @@ def test_scan_with_projection(tmp_path: Path) -> None:
     json_bytes = bytes(json, "utf-8")

     file_path = tmp_path / "escape_chars.json"
-    with open(file_path, "wb") as f:
-        f.write(json_bytes)
+    file_path.write_bytes(json_bytes)
+
     actual = pl.scan_ndjson(file_path).select(["id", "text"]).collect()
     expected = pl.DataFrame(
diff --git a/py-polars/tests/unit/io/test_other.py b/py-polars/tests/unit/io/test_other.py
index 7e7746b9ef98..8b068708bdc4 100644
--- a/py-polars/tests/unit/io/test_other.py
+++ b/py-polars/tests/unit/io/test_other.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import copy
-import os.path
+from pathlib import Path
 from typing import cast

 import polars as pl
@@ -51,8 +51,8 @@ def test_unit_io_subdir_has_no_init() -> None:
     # --------------------------------------------------------------------------------
     # TLDR: it can mask the builtin 'io' module, causing a fatal python error.
     # --------------------------------------------------------------------------------
-    io_dir = os.path.dirname(__file__)
-    assert io_dir.endswith(f"unit{os.path.sep}io")
-    assert not os.path.exists(
-        f"{io_dir}{os.path.sep}__init__.py"
-    ), "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory"
+    io_dir = Path(__file__).parent
+    assert io_dir.parts[-2:] == ("unit", "io")
+    assert not (
+        io_dir / "__init__.py"
+    ).exists(), "Found undesirable '__init__.py' in the 'unit.io' tests subdirectory"
diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py
index 556a3efbbf39..b57765242a24 100644
--- a/py-polars/tests/unit/io/test_parquet.py
+++ b/py-polars/tests/unit/io/test_parquet.py
@@ -1,8 +1,8 @@
 from __future__ import annotations

 import io
-import os
 from datetime import datetime, timezone
+from pathlib import Path
 from typing import TYPE_CHECKING

 import numpy as np
@@ -19,8 +19,6 @@
 )

 if TYPE_CHECKING:
-    from pathlib import Path
-
     from polars.type_aliases import ParquetCompression
@@ -513,7 +511,7 @@ def test_parquet_string_cache() -> None:

 def test_tz_aware_parquet_9586() -> None:
     result = pl.read_parquet(
-        os.path.join("tests", "unit", "io", "files", "tz_aware.parquet")
+        Path("tests") / "unit" / "io" / "files" / "tz_aware.parquet"
     )
     expected = pl.DataFrame(
         {"UTC_DATETIME_ID": [datetime(2023, 6, 26, 14, 15, 0, tzinfo=timezone.utc)]}
diff --git a/py-polars/tests/unit/streaming/conftest.py b/py-polars/tests/unit/streaming/conftest.py
index 31e98521a2a2..b7b476474316 100644
--- a/py-polars/tests/unit/streaming/conftest.py
+++ b/py-polars/tests/unit/streaming/conftest.py
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path

 import pytest
@@ -6,5 +5,4 @@

 @pytest.fixture()
 def io_files_path() -> Path:
-    current_dir = os.path.dirname(__file__)
-    return Path(current_dir) / ".." / "io" / "files"
+    return Path(__file__).parent.parent / "io" / "files"
diff --git a/py-polars/tests/unit/test_cfg.py b/py-polars/tests/unit/test_cfg.py
index 1027a739fd51..1d1be35d45dc 100644
--- a/py-polars/tests/unit/test_cfg.py
+++ b/py-polars/tests/unit/test_cfg.py
@@ -1,7 +1,8 @@
 from __future__ import annotations

 import os
-from typing import TYPE_CHECKING, Iterator
+from pathlib import Path
+from typing import Iterator

 import pytest
@@ -10,9 +11,6 @@
 from polars.exceptions import StringCacheMismatchError
 from polars.testing import assert_frame_equal

-if TYPE_CHECKING:
-    from pathlib import Path
-

 @pytest.fixture(autouse=True)
 def _environ() -> Iterator[None]:
@@ -531,7 +529,7 @@ def test_config_load_save(tmp_path: Path) -> None:

         # ...load back from config...
         if file is not None:
-            assert os.path.isfile(cfg)
+            assert Path(cfg).is_file()
         pl.Config.load(cfg)

         # ...and confirm the saved options were set.
diff --git a/py-polars/tests/unit/test_sql.py b/py-polars/tests/unit/test_sql.py
index 86a78ddddfd2..5f328390ac1b 100644
--- a/py-polars/tests/unit/test_sql.py
+++ b/py-polars/tests/unit/test_sql.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import os
 import warnings
 from pathlib import Path

@@ -13,8 +12,8 @@

 # TODO: Do not rely on I/O for these tests
 @pytest.fixture()
-def foods_ipc_path() -> str:
-    return str(Path(os.path.dirname(__file__)) / "io" / "files" / "foods1.ipc")
+def foods_ipc_path() -> Path:
+    return Path(__file__).parent / "io" / "files" / "foods1.ipc"


 def test_sql_cast() -> None:
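The substitutions above follow a small, recurring set of os.path-to-pathlib equivalences. Below is a minimal, self-contained sketch of those equivalences, not part of the patch itself; the `example_file` path and the printed checks are hypothetical and only illustrate the pattern.

```python
from pathlib import Path

# Hypothetical path, used only to illustrate the substitutions made in this patch.
example_file = Path("~/data/example.csv").expanduser()  # was: os.path.expanduser("~/data/example.csv")

# Equivalences applied repeatedly in the diff:
#   os.path.dirname(__file__)             -> Path(__file__).parent
#   os.path.join(a, "b", "c")             -> Path(a) / "b" / "c"
#   os.path.abspath(p)                    -> Path(p).resolve() (or .absolute(), as in conf.py)
#   os.path.exists(p) / os.path.isdir(p)  -> Path(p).exists() / Path(p).is_dir()
#   open(p, "wb") ... f.write(data)       -> Path(p).write_bytes(data)
#   open(p) ... f.read()                  -> Path(p).read_text()

pkg_dir = Path(__file__).parent           # directory containing this script
data_dir = pkg_dir / "files"              # joins path segments with "/" instead of os.path.join

print(example_file.suffix == ".csv")      # extension check, like the .endswith(".py") replacement
if example_file.exists() and not example_file.is_dir():
    print(example_file.read_text()[:100])  # read contents without an explicit open()
```

One behavioural note worth keeping in mind when reviewing such conversions: `Path.resolve()` also resolves symlinks, whereas `os.path.abspath` does not, which is why some call sites in the patch use `.absolute()` instead.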