tests(python): Make torch install CI-only by default (pola-rs#16058)

nameexhaustion · May 6, 2024 · commit ae66acd · 1 parent 7e000cf

Showing 9 changed files with 66 additions and 55 deletions.
diff --git a/.github/workflows/test-python.yml b/.github/workflows/test-python.yml
@@ -65,7 +65,7 @@ jobs:
       - name: Install Python dependencies
         run: |
           pip install uv
-          uv pip install --compile-bytecode -r requirements-dev.txt
+          uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt
 
       - name: Set up Rust
         run: rustup show

diff --git a/Makefile b/Makefile
@@ -26,6 +26,11 @@ requirements: .venv  ## Install/refresh Python project requirements
 	&& $(VENV_BIN)/uv pip install --upgrade -r py-polars/docs/requirements-docs.txt \
 	&& $(VENV_BIN)/uv pip install --upgrade -r docs/requirements.txt
 
+.PHONY: requirements-all
+requirements-all: .venv  ## Install/refresh all Python requirements (including those needed for CI tests)
+	$(MAKE) requirements
+	$(VENV_BIN)/uv pip install --upgrade --compile-bytecode -r py-polars/requirements-ci.txt
+
 .PHONY: build
 build: .venv  ## Compile and install Python Polars for development
 	@unset CONDA_PREFIX \
@@ -80,7 +85,6 @@ build-release-native: .venv  ## Same as build-release, except with native CPU op
 	$(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --release \
 	$(FILTER_PIP_WARNINGS)
 
-
 .PHONY: check
 check:  ## Run cargo check with all features
 	cargo check --workspace --all-targets --all-features
@@ -108,6 +112,7 @@ pre-commit: fmt clippy clippy-default  ## Run all code quality checks
 clean:  ## Clean up caches and build artifacts
 	@$(MAKE) -s -C py-polars/ $@
 	@rm -rf .ruff_cache/
+	@rm -rf .hypothesis/
 	@rm -rf .venv/
 	@cargo clean
 

diff --git a/py-polars/Makefile b/py-polars/Makefile
@@ -15,7 +15,11 @@ endif
 	@$(MAKE) -s -C .. $@
 
 .PHONY: requirements
-requirements: .venv  ## Install/refresh all project requirements
+requirements: .venv  ## Install/refresh Python project requirements
+	@$(MAKE) -s -C .. $@
+
+.PHONY: requirements-all
+requirements-all: .venv  ## Install/refresh all Python requirements (including those needed for CI tests)
 	@$(MAKE) -s -C .. $@
 
 .PHONY: build
@@ -79,6 +83,11 @@ pre-commit: fmt clippy  ## Run all code formatting and lint/quality checks
 test: .venv build  ## Run fast unittests
 	$(VENV_BIN)/pytest -n auto --dist loadgroup $(PYTEST_ARGS)
 
+.PHONY: test-all
+test-all: .venv build  ## Run all tests
+	$(VENV_BIN)/pytest -n auto --dist loadgroup -m "slow or not slow"
+	$(VENV_BIN)/python tests/docs/run_doctest.py
+
 .PHONY: doctest
 doctest: .venv build  ## Run doctests
 	$(VENV_BIN)/python tests/docs/run_doctest.py
@@ -93,11 +102,6 @@ docs-clean: .venv  ## Build Python docs (full rebuild)
 	@$(MAKE) -s -C docs clean
 	@$(MAKE) docs
 
-.PHONY: test-all
-test-all: .venv build  ## Run all tests
-	$(VENV_BIN)/pytest -n auto --dist loadgroup -m "slow or not slow"
-	$(VENV_BIN)/python tests/docs/run_doctest.py
-
 .PHONY: coverage
 coverage: .venv build  ## Run tests and report coverage
 	$(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not release and not benchmark"

diff --git a/py-polars/polars/dependencies.py b/py-polars/polars/dependencies.py
@@ -305,7 +305,6 @@ def import_optional(
     "_check_for_pandas",
     "_check_for_pyarrow",
     "_check_for_pydantic",
-    "_LazyModule",
     # exported flags/guards
     "_DELTALAKE_AVAILABLE",
     "_PYICEBERG_AVAILABLE",

diff --git a/py-polars/polars/ml/torch.py b/py-polars/polars/ml/torch.py
@@ -1,3 +1,4 @@
+# mypy: disable-error-code="unused-ignore"
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Sequence
@@ -26,7 +27,7 @@
     raise ImportError(msg) from None
 
 
-class PolarsDataset(TensorDataset):
+class PolarsDataset(TensorDataset):  # type: ignore[misc]
     """Specialized TensorDataset for Polars DataFrames."""
 
     tensors: tuple[Tensor, ...]

diff --git a/py-polars/pyproject.toml b/py-polars/pyproject.toml
@@ -102,6 +102,7 @@ module = [
   "pydantic",
   "pyxlsb",
   "sqlalchemy.*",
+  "torch.*",
   "xlsx2csv",
   "xlsxwriter.*",
   "zoneinfo",
@@ -207,14 +208,14 @@ addopts = [
   "--strict-markers",
   "--import-mode=importlib",
   # Default to running fast tests only. To run ALL tests, run: pytest -m ""
-  "-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not third_party_integration",
+  "-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not ci_only",
 ]
 markers = [
+  "ci_only: Tests that should only run on CI by default.",
   "debug: Tests that should be run on a Polars debug build.",
   "docs: Documentation code snippets",
   "release: Tests that should be run on a Polars release build.",
   "slow: Tests with a longer than average runtime.",
-  "third_party_integration: Tests with larger non-core/third party dependencies.",
   "write_disk: Tests that write to disk",
 ]
 filterwarnings = [

diff --git a/py-polars/requirements-ci.txt b/py-polars/requirements-ci.txt
@@ -0,0 +1,6 @@
+# -------------------------------------------------------
+# Packages that we require for unit tests that run on CI
+# (installable via `make requirements-all`)
+# -------------------------------------------------------
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch
diff --git a/py-polars/requirements-dev.txt b/py-polars/requirements-dev.txt
@@ -20,7 +20,6 @@ numpy
 pandas
 pyarrow
 pydantic>=2.0.0
-torch
 # Datetime / time zones
 backports.zoneinfo; python_version < '3.9'
 tzdata; platform_system == 'Windows'

diff --git a/py-polars/tests/unit/dataframe/test_to_torch.py b/py-polars/tests/unit/dataframe/test_to_torch.py
@@ -3,14 +3,14 @@
 from typing import Any
 
 import pytest
-import torch
-from torch import Tensor
-from torch.testing import assert_close as assert_tensor
-from torch.utils.data import DataLoader, Dataset
 
 import polars as pl
 import polars.selectors as cs
-from polars.ml.torch import PolarsDataset
+from polars.dependencies import _lazy_import
+
+# defer the torch import until a test actually runs; the `ci_only` marker already
+# deselects these tests locally by default, and this avoids a premature import
+torch, _ = _lazy_import("torch")
 
 
 @pytest.fixture()
@@ -25,36 +25,36 @@ def df() -> pl.DataFrame:
     )
 
 
-@pytest.mark.third_party_integration()
+@pytest.mark.ci_only()
 class TestTorchIntegration:
     """Test coverage for `to_torch` conversions and `polars.ml.torch` classes."""
 
+    def assert_tensor(self, actual: Any, expected: Any) -> None:
+        torch.testing.assert_close(actual, expected)
+
     def test_to_torch_series(
         self,
     ) -> None:
         s = pl.Series("x", [1, 2, 3, 4], dtype=pl.Int8)
         t = s.to_torch()
 
-        assert isinstance(t, Tensor)
         assert list(t.shape) == [4]
-        assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int8))
+        self.assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int8))
 
         # note: torch doesn't natively support uint16/32/64.
         # confirm that we export to a suitable signed integer type
         s = s.cast(pl.UInt16)
         t = s.to_torch()
-        assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int32))
+        self.assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int32))
 
         for dtype in (pl.UInt32, pl.UInt64):
             t = s.cast(dtype).to_torch()
-            assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int64))
+            self.assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int64))
 
     def test_to_torch_tensor(self, df: pl.DataFrame) -> None:
         t1 = df.to_torch()
         t2 = df.to_torch("tensor")
 
-        assert isinstance(t1, Tensor)
-        assert isinstance(t2, Tensor)
         assert list(t1.shape) == [4, 3]
         assert (t1 == t2).all().item() is True
 
@@ -63,30 +63,29 @@ def test_to_torch_dict(self, df: pl.DataFrame) -> None:
 
         assert list(td.keys()) == ["x", "y", "z"]
 
-        assert_tensor(td["x"], torch.tensor([1, 2, 2, 3], dtype=torch.int8))
-        assert_tensor(
+        self.assert_tensor(td["x"], torch.tensor([1, 2, 2, 3], dtype=torch.int8))
+        self.assert_tensor(
             td["y"], torch.tensor([True, False, True, False], dtype=torch.bool)
         )
-        assert_tensor(
+        self.assert_tensor(
             td["z"], torch.tensor([1.5, -0.5, 0.0, -2.0], dtype=torch.float32)
         )
 
     def test_to_torch_dataset(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset", dtype=pl.Float64)
 
         assert len(ds) == 4
-        assert isinstance(ds, Dataset)
-        assert isinstance(ds, PolarsDataset)
+        assert isinstance(ds, torch.utils.data.Dataset)
         assert repr(ds).startswith("<PolarsDataset [len:4, features:3, labels:0] at 0x")
 
         ts = ds[0]
         assert isinstance(ts, tuple)
         assert len(ts) == 1
-        assert_tensor(ts[0], torch.tensor([1.0, 1.0, 1.5], dtype=torch.float64))
+        self.assert_tensor(ts[0], torch.tensor([1.0, 1.0, 1.5], dtype=torch.float64))
 
     def test_to_torch_dataset_feature_reorder(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset", label="x", features=["z", "y"])
-        assert_tensor(
+        self.assert_tensor(
             torch.tensor(
                 [
                     [1.5000, 1.0000],
@@ -97,15 +96,15 @@ def test_to_torch_dataset_feature_reorder(self, df: pl.DataFrame) -> None:
             ),
             ds.features,
         )
-        assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)
+        self.assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)
 
     def test_to_torch_dataset_feature_subset(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset", label="x", features=["z"])
-        assert_tensor(
+        self.assert_tensor(
             torch.tensor([[1.5000], [-0.5000], [0.0000], [-2.0000]]),
             ds.features,
         )
-        assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)
+        self.assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)
 
     def test_to_torch_dataset_index_slice(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset")
@@ -114,27 +113,26 @@ def test_to_torch_dataset_index_slice(self, df: pl.DataFrame) -> None:
         expected = (
             torch.tensor([[2.0000, 0.0000, -0.5000], [2.0000, 1.0000, 0.0000]]),
         )
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
         ts = ds[::2]
         expected = (torch.tensor([[1.0000, 1.0000, 1.5000], [2.0, 1.0, 0.0]]),)
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
     @pytest.mark.parametrize(
         "index",
         [
             [0, 3],
             range(0, 4, 3),
             slice(0, 4, 3),
-            torch.tensor([0, 3]),
         ],
     )
     def test_to_torch_dataset_index_multi(self, index: Any, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset")
         ts = ds[index]
 
         expected = (torch.tensor([[1.0, 1.0, 1.5], [3.0, 0.0, -2.0]]),)
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
         assert ds.schema == {"features": torch.float32, "labels": None}
 
     def test_to_torch_dataset_index_range(self, df: pl.DataFrame) -> None:
@@ -144,7 +142,7 @@ def test_to_torch_dataset_index_range(self, df: pl.DataFrame) -> None:
         expected = (
             torch.tensor([[3.0, 0.0, -2.0], [2.0, 1.0, 0.0], [2.0, 0.0, -0.5]]),
         )
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
     def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset", label="x")
@@ -155,11 +153,11 @@ def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
 
         # half precision across all data
         ts = dsf16[:3:2]
-        expected: tuple[Tensor, ...] = (
+        expected = (
             torch.tensor([[1.0000, 1.5000], [1.0000, 0.0000]], dtype=torch.float16),
             torch.tensor([1.0, 2.0], dtype=torch.float16),
         )
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
         # only apply half precision to the feature data
         dsf16 = ds.half(labels=False)
@@ -170,7 +168,7 @@ def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
             torch.tensor([[1.0000, 1.5000], [1.0000, 0.0000]], dtype=torch.float16),
             torch.tensor([1, 2], dtype=torch.int8),
         )
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
         # only apply half precision to the label data
         dsf16 = ds.half(features=False)
@@ -181,20 +179,20 @@ def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
             torch.tensor([[1.0000, 1.5000], [1.0000, 0.0000]], dtype=torch.float32),
             torch.tensor([1.0, 2.0], dtype=torch.float16),
         )
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
         # no labels
         dsf16 = df.to_torch("dataset").half()
         assert dsf16.schema == {"features": torch.float16, "labels": None}
 
         ts = dsf16[:3:2]
-        expected = (
+        expected = (  # type: ignore[assignment]
             torch.tensor(
                 data=[[1.0000, 1.0000, 1.5000], [2.0000, 1.0000, 0.0000]],
                 dtype=torch.float16,
             ),
         )
-        assert_tensor(expected, ts)
+        self.assert_tensor(expected, ts)
 
     @pytest.mark.parametrize(
         ("label", "features"),
@@ -208,26 +206,24 @@ def test_to_torch_labelled_dataset(
         self, label: Any, features: Any, df: pl.DataFrame
     ) -> None:
         ds = df.to_torch("dataset", label=label, features=features)
-        ts = next(iter(DataLoader(ds, batch_size=2, shuffle=False)))
+        ts = next(iter(torch.utils.data.DataLoader(ds, batch_size=2, shuffle=False)))
 
         expected = [
             torch.tensor([[1.0, 1.5], [0.0, -0.5]]),
             torch.tensor([1, 2], dtype=torch.int8),
         ]
         assert len(ts) == len(expected)
         for actual, exp in zip(ts, expected):
-            assert_tensor(exp, actual)
+            self.assert_tensor(exp, actual)
 
     def test_to_torch_labelled_dataset_expr(self, df: pl.DataFrame) -> None:
         ds = df.to_torch(
             "dataset",
             dtype=pl.Float64,
             label=(pl.col("x") * 8).cast(pl.Int16),
         )
-        for data in (
-            tuple(ds[:2]),
-            tuple(next(iter(DataLoader(ds, batch_size=2, shuffle=False)))),
-        ):
+        dl = torch.utils.data.DataLoader(ds, batch_size=2, shuffle=False)
+        for data in (tuple(ds[:2]), tuple(next(iter(dl)))):
             expected = (
                 torch.tensor(
                     [[1.0000, 1.5000], [0.0000, -0.5000]], dtype=torch.float64
@@ -236,11 +232,11 @@ def test_to_torch_labelled_dataset_expr(self, df: pl.DataFrame) -> None:
             )
             assert len(data) == len(expected)
             for actual, exp in zip(data, expected):
-                assert_tensor(exp, actual)
+                self.assert_tensor(exp, actual)
 
     def test_to_torch_labelled_dataset_multi(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset", label=["x", "y"])
-        dl = DataLoader(ds, batch_size=3, shuffle=False)
+        dl = torch.utils.data.DataLoader(ds, batch_size=3, shuffle=False)
         ts = list(dl)
 
         expected = [
@@ -258,7 +254,7 @@ def test_to_torch_labelled_dataset_multi(self, df: pl.DataFrame) -> None:
         for actual, exp in zip(ts, expected):
             assert len(actual) == len(exp)
             for a, e in zip(actual, exp):
-                assert_tensor(e, a)
+                self.assert_tensor(e, a)
 
     def test_misc_errors(self, df: pl.DataFrame) -> None:
         ds = df.to_torch("dataset")