Skip to content

Commit

Permalink
tests(python): Make torch install CI-only by default (pola-rs#16058)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored May 6, 2024
1 parent 7e000cf commit ae66acd
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
- name: Install Python dependencies
run: |
pip install uv
uv pip install --compile-bytecode -r requirements-dev.txt
uv pip install --compile-bytecode -r requirements-dev.txt -r requirements-ci.txt
- name: Set up Rust
run: rustup show
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ requirements: .venv ## Install/refresh Python project requirements
&& $(VENV_BIN)/uv pip install --upgrade -r py-polars/docs/requirements-docs.txt \
&& $(VENV_BIN)/uv pip install --upgrade -r docs/requirements.txt

.PHONY: requirements-all
requirements-all: .venv ## Install/refresh all Python requirements (including those needed for CI tests)
$(MAKE) requirements
$(VENV_BIN)/uv pip install --upgrade --compile-bytecode -r py-polars/requirements-ci.txt

.PHONY: build
build: .venv ## Compile and install Python Polars for development
@unset CONDA_PREFIX \
Expand Down Expand Up @@ -80,7 +85,6 @@ build-release-native: .venv ## Same as build-release, except with native CPU op
$(VENV_BIN)/maturin develop -m py-polars/Cargo.toml --release \
$(FILTER_PIP_WARNINGS)


.PHONY: check
check: ## Run cargo check with all features
cargo check --workspace --all-targets --all-features
Expand Down Expand Up @@ -108,6 +112,7 @@ pre-commit: fmt clippy clippy-default ## Run all code quality checks
clean: ## Clean up caches and build artifacts
@$(MAKE) -s -C py-polars/ $@
@rm -rf .ruff_cache/
@rm -rf .hypothesis/
@rm -rf .venv/
@cargo clean

Expand Down
16 changes: 10 additions & 6 deletions py-polars/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ endif
@$(MAKE) -s -C .. $@

.PHONY: requirements
requirements: .venv ## Install/refresh all project requirements
requirements: .venv ## Install/refresh Python project requirements
@$(MAKE) -s -C .. $@

.PHONY: requirements-all
requirements-all: .venv ## Install/refresh all Python requirements (including those needed for CI tests)
@$(MAKE) -s -C .. $@

.PHONY: build
Expand Down Expand Up @@ -79,6 +83,11 @@ pre-commit: fmt clippy ## Run all code formatting and lint/quality checks
test: .venv build ## Run fast unittests
$(VENV_BIN)/pytest -n auto --dist loadgroup $(PYTEST_ARGS)

.PHONY: test-all
test-all: .venv build ## Run all tests
$(VENV_BIN)/pytest -n auto --dist loadgroup -m "slow or not slow"
$(VENV_BIN)/python tests/docs/run_doctest.py

.PHONY: doctest
doctest: .venv build ## Run doctests
$(VENV_BIN)/python tests/docs/run_doctest.py
Expand All @@ -93,11 +102,6 @@ docs-clean: .venv ## Build Python docs (full rebuild)
@$(MAKE) -s -C docs clean
@$(MAKE) docs

.PHONY: test-all
test-all: .venv build ## Run all tests
$(VENV_BIN)/pytest -n auto --dist loadgroup -m "slow or not slow"
$(VENV_BIN)/python tests/docs/run_doctest.py

.PHONY: coverage
coverage: .venv build ## Run tests and report coverage
$(VENV_BIN)/pytest --cov -n auto --dist loadgroup -m "not release and not benchmark"
Expand Down
1 change: 0 additions & 1 deletion py-polars/polars/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,6 @@ def import_optional(
"_check_for_pandas",
"_check_for_pyarrow",
"_check_for_pydantic",
"_LazyModule",
# exported flags/guards
"_DELTALAKE_AVAILABLE",
"_PYICEBERG_AVAILABLE",
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/ml/torch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# mypy: disable-error-code="unused-ignore"
from __future__ import annotations

from typing import TYPE_CHECKING, Sequence
Expand Down Expand Up @@ -26,7 +27,7 @@
raise ImportError(msg) from None


class PolarsDataset(TensorDataset):
class PolarsDataset(TensorDataset): # type: ignore[misc]
"""Specialized TensorDataset for Polars DataFrames."""

tensors: tuple[Tensor, ...]
Expand Down
5 changes: 3 additions & 2 deletions py-polars/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ module = [
"pydantic",
"pyxlsb",
"sqlalchemy.*",
"torch.*",
"xlsx2csv",
"xlsxwriter.*",
"zoneinfo",
Expand Down Expand Up @@ -207,14 +208,14 @@ addopts = [
"--strict-markers",
"--import-mode=importlib",
# Default to running fast tests only. To run ALL tests, run: pytest -m ""
"-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not third_party_integration",
"-m not slow and not write_disk and not release and not docs and not hypothesis and not benchmark and not ci_only",
]
markers = [
"ci_only: Tests that should only run on CI by default.",
"debug: Tests that should be run on a Polars debug build.",
"docs: Documentation code snippets",
"release: Tests that should be run on a Polars release build.",
"slow: Tests with a longer than average runtime.",
"third_party_integration: Tests with larger non-core/third party dependencies.",
"write_disk: Tests that write to disk",
]
filterwarnings = [
Expand Down
6 changes: 6 additions & 0 deletions py-polars/requirements-ci.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# -------------------------------------------------------
# Packages that we require for unit tests that run on CI
# (installable via `make requirements-all`)
# -------------------------------------------------------
--extra-index-url https://download.pytorch.org/whl/cpu
torch
1 change: 0 additions & 1 deletion py-polars/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ numpy
pandas
pyarrow
pydantic>=2.0.0
torch
# Datetime / time zones
backports.zoneinfo; python_version < '3.9'
tzdata; platform_system == 'Windows'
Expand Down
80 changes: 38 additions & 42 deletions py-polars/tests/unit/dataframe/test_to_torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
from typing import Any

import pytest
import torch
from torch import Tensor
from torch.testing import assert_close as assert_tensor
from torch.utils.data import DataLoader, Dataset

import polars as pl
import polars.selectors as cs
from polars.ml.torch import PolarsDataset
from polars.dependencies import _lazy_import

# Don't import torch until a test actually uses it. The `ci_only` marker already
# keeps these tests from running locally by default; the lazy import additionally
# avoids importing torch prematurely in local runs.
torch, _ = _lazy_import("torch")


@pytest.fixture()
Expand All @@ -25,36 +25,36 @@ def df() -> pl.DataFrame:
)


@pytest.mark.third_party_integration()
@pytest.mark.ci_only()
class TestTorchIntegration:
"""Test coverage for `to_torch` conversions and `polars.ml.torch` classes."""

def assert_tensor(self, actual: Any, expected: Any) -> None:
torch.testing.assert_close(actual, expected)

def test_to_torch_series(
self,
) -> None:
s = pl.Series("x", [1, 2, 3, 4], dtype=pl.Int8)
t = s.to_torch()

assert isinstance(t, Tensor)
assert list(t.shape) == [4]
assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int8))
self.assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int8))

# note: torch doesn't natively support uint16/32/64.
# confirm that we export to a suitable signed integer type
s = s.cast(pl.UInt16)
t = s.to_torch()
assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int32))
self.assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int32))

for dtype in (pl.UInt32, pl.UInt64):
t = s.cast(dtype).to_torch()
assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int64))
self.assert_tensor(t, torch.tensor([1, 2, 3, 4], dtype=torch.int64))

def test_to_torch_tensor(self, df: pl.DataFrame) -> None:
t1 = df.to_torch()
t2 = df.to_torch("tensor")

assert isinstance(t1, Tensor)
assert isinstance(t2, Tensor)
assert list(t1.shape) == [4, 3]
assert (t1 == t2).all().item() is True

Expand All @@ -63,30 +63,29 @@ def test_to_torch_dict(self, df: pl.DataFrame) -> None:

assert list(td.keys()) == ["x", "y", "z"]

assert_tensor(td["x"], torch.tensor([1, 2, 2, 3], dtype=torch.int8))
assert_tensor(
self.assert_tensor(td["x"], torch.tensor([1, 2, 2, 3], dtype=torch.int8))
self.assert_tensor(
td["y"], torch.tensor([True, False, True, False], dtype=torch.bool)
)
assert_tensor(
self.assert_tensor(
td["z"], torch.tensor([1.5, -0.5, 0.0, -2.0], dtype=torch.float32)
)

def test_to_torch_dataset(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset", dtype=pl.Float64)

assert len(ds) == 4
assert isinstance(ds, Dataset)
assert isinstance(ds, PolarsDataset)
assert isinstance(ds, torch.utils.data.Dataset)
assert repr(ds).startswith("<PolarsDataset [len:4, features:3, labels:0] at 0x")

ts = ds[0]
assert isinstance(ts, tuple)
assert len(ts) == 1
assert_tensor(ts[0], torch.tensor([1.0, 1.0, 1.5], dtype=torch.float64))
self.assert_tensor(ts[0], torch.tensor([1.0, 1.0, 1.5], dtype=torch.float64))

def test_to_torch_dataset_feature_reorder(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset", label="x", features=["z", "y"])
assert_tensor(
self.assert_tensor(
torch.tensor(
[
[1.5000, 1.0000],
Expand All @@ -97,15 +96,15 @@ def test_to_torch_dataset_feature_reorder(self, df: pl.DataFrame) -> None:
),
ds.features,
)
assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)
self.assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)

def test_to_torch_dataset_feature_subset(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset", label="x", features=["z"])
assert_tensor(
self.assert_tensor(
torch.tensor([[1.5000], [-0.5000], [0.0000], [-2.0000]]),
ds.features,
)
assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)
self.assert_tensor(torch.tensor([1, 2, 2, 3], dtype=torch.int8), ds.labels)

def test_to_torch_dataset_index_slice(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset")
Expand All @@ -114,27 +113,26 @@ def test_to_torch_dataset_index_slice(self, df: pl.DataFrame) -> None:
expected = (
torch.tensor([[2.0000, 0.0000, -0.5000], [2.0000, 1.0000, 0.0000]]),
)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

ts = ds[::2]
expected = (torch.tensor([[1.0000, 1.0000, 1.5000], [2.0, 1.0, 0.0]]),)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

@pytest.mark.parametrize(
"index",
[
[0, 3],
range(0, 4, 3),
slice(0, 4, 3),
torch.tensor([0, 3]),
],
)
def test_to_torch_dataset_index_multi(self, index: Any, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset")
ts = ds[index]

expected = (torch.tensor([[1.0, 1.0, 1.5], [3.0, 0.0, -2.0]]),)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)
assert ds.schema == {"features": torch.float32, "labels": None}

def test_to_torch_dataset_index_range(self, df: pl.DataFrame) -> None:
Expand All @@ -144,7 +142,7 @@ def test_to_torch_dataset_index_range(self, df: pl.DataFrame) -> None:
expected = (
torch.tensor([[3.0, 0.0, -2.0], [2.0, 1.0, 0.0], [2.0, 0.0, -0.5]]),
)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset", label="x")
Expand All @@ -155,11 +153,11 @@ def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:

# half precision across all data
ts = dsf16[:3:2]
expected: tuple[Tensor, ...] = (
expected = (
torch.tensor([[1.0000, 1.5000], [1.0000, 0.0000]], dtype=torch.float16),
torch.tensor([1.0, 2.0], dtype=torch.float16),
)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

# only apply half precision to the feature data
dsf16 = ds.half(labels=False)
Expand All @@ -170,7 +168,7 @@ def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
torch.tensor([[1.0000, 1.5000], [1.0000, 0.0000]], dtype=torch.float16),
torch.tensor([1, 2], dtype=torch.int8),
)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

# only apply half precision to the label data
dsf16 = ds.half(features=False)
Expand All @@ -181,20 +179,20 @@ def test_to_dataset_half_precision(self, df: pl.DataFrame) -> None:
torch.tensor([[1.0000, 1.5000], [1.0000, 0.0000]], dtype=torch.float32),
torch.tensor([1.0, 2.0], dtype=torch.float16),
)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

# no labels
dsf16 = df.to_torch("dataset").half()
assert dsf16.schema == {"features": torch.float16, "labels": None}

ts = dsf16[:3:2]
expected = (
expected = ( # type: ignore[assignment]
torch.tensor(
data=[[1.0000, 1.0000, 1.5000], [2.0000, 1.0000, 0.0000]],
dtype=torch.float16,
),
)
assert_tensor(expected, ts)
self.assert_tensor(expected, ts)

@pytest.mark.parametrize(
("label", "features"),
Expand All @@ -208,26 +206,24 @@ def test_to_torch_labelled_dataset(
self, label: Any, features: Any, df: pl.DataFrame
) -> None:
ds = df.to_torch("dataset", label=label, features=features)
ts = next(iter(DataLoader(ds, batch_size=2, shuffle=False)))
ts = next(iter(torch.utils.data.DataLoader(ds, batch_size=2, shuffle=False)))

expected = [
torch.tensor([[1.0, 1.5], [0.0, -0.5]]),
torch.tensor([1, 2], dtype=torch.int8),
]
assert len(ts) == len(expected)
for actual, exp in zip(ts, expected):
assert_tensor(exp, actual)
self.assert_tensor(exp, actual)

def test_to_torch_labelled_dataset_expr(self, df: pl.DataFrame) -> None:
ds = df.to_torch(
"dataset",
dtype=pl.Float64,
label=(pl.col("x") * 8).cast(pl.Int16),
)
for data in (
tuple(ds[:2]),
tuple(next(iter(DataLoader(ds, batch_size=2, shuffle=False)))),
):
dl = torch.utils.data.DataLoader(ds, batch_size=2, shuffle=False)
for data in (tuple(ds[:2]), tuple(next(iter(dl)))):
expected = (
torch.tensor(
[[1.0000, 1.5000], [0.0000, -0.5000]], dtype=torch.float64
Expand All @@ -236,11 +232,11 @@ def test_to_torch_labelled_dataset_expr(self, df: pl.DataFrame) -> None:
)
assert len(data) == len(expected)
for actual, exp in zip(data, expected):
assert_tensor(exp, actual)
self.assert_tensor(exp, actual)

def test_to_torch_labelled_dataset_multi(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset", label=["x", "y"])
dl = DataLoader(ds, batch_size=3, shuffle=False)
dl = torch.utils.data.DataLoader(ds, batch_size=3, shuffle=False)
ts = list(dl)

expected = [
Expand All @@ -258,7 +254,7 @@ def test_to_torch_labelled_dataset_multi(self, df: pl.DataFrame) -> None:
for actual, exp in zip(ts, expected):
assert len(actual) == len(exp)
for a, e in zip(actual, exp):
assert_tensor(e, a)
self.assert_tensor(e, a)

def test_misc_errors(self, df: pl.DataFrame) -> None:
ds = df.to_torch("dataset")
Expand Down

0 comments on commit ae66acd

Please sign in to comment.