Add separate CI job for slow tests #10830

Merged (13 commits) on Dec 1, 2021
83 changes: 83 additions & 0 deletions .github/workflows/ci_test-slow.yml
@@ -0,0 +1,83 @@
name: Test

# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
on: # Trigger the workflow on push or pull request, but only for the master and release branches
push:
branches: [master, "release/*"]
pull_request:
branches: [master, "release/*"]

jobs:
slow:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macOS-latest]
Member:

Just noticed that here we skip running on Windows and all minimal requirements?

Contributor Author:

Yes, I wanted to keep this light at first, as we don't run many tests at the moment. These can be added later.

# same config as '.azure-pipelines/gpu-tests.yml'
python-version: ["3.7"]
pytorch-version: ["1.8"]

timeout-minutes: 20
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

- name: Weekly reset caching
run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"
id: times

- name: Get pip cache
id: pip-cache
run: |
python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)"

- name: Cache pip
uses: actions/cache@v2
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-

- name: Install dependencies
run: |
# adjust versions according to the installed Torch version
python ./requirements/adjust_versions.py requirements.txt ${{ matrix.pytorch-version }}
pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
pip install --requirement requirements/test.txt
pip list
shell: bash

- name: Tests
run: |
coverage run --source pytorch_lightning -m pytest tests -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml
env:
PL_RUN_SLOW_TESTS: 1

- name: Upload pytest test results
uses: actions/upload-artifact@v2
with:
name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}
path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml
if: failure()

- name: Statistics
if: success()
run: |
coverage report
coverage xml

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
if: success()
# see: https://github.com/actions/toolkit/issues/399
continue-on-error: true
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: coverage.xml
flags: cpu,pytest,torch${{ matrix.pytorch-version }}
name: CPU-coverage
fail_ci_if_error: false
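
Note on the cache key: the "Weekly reset caching" step emits the number of whole weeks since the Unix epoch, and the key embeds it as `td<period>`, so the pip cache is effectively rebuilt once a week. A standalone sketch of the same computation the workflow runs inline:

    import time

    # seconds since the epoch -> days -> whole weeks; the integer changes
    # every 7 days, so any cache key that embeds it is invalidated weekly
    days = time.time() / 60 / 60 / 24
    period = int(days / 7)
    print(period)  # e.g. 2708 in the week of Dec 1, 2021
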
6 changes: 3 additions & 3 deletions tests/benchmarks/generate_comparison.py
@@ -13,9 +13,6 @@
# limitations under the License.
import os

import matplotlib.pylab as plt
import pandas as pd

from tests.benchmarks.test_basic_parity import measure_loops
from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN

@@ -27,6 +24,9 @@


def _main():
import matplotlib.pylab as plt
import pandas as pd

fig, axarr = plt.subplots(nrows=len(MODEL_CLASSES))

for i, cls_model in enumerate(MODEL_CLASSES):
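
Side note: moving the `matplotlib`/`pandas` imports into `_main()` means merely importing `tests.benchmarks.generate_comparison` (for example while pytest collects tests for the new job) no longer requires the plotting stack. The same lazy-import pattern in isolation, with hypothetical names:

    def generate_report():
        # heavy, optional dependencies are imported only when a report is
        # actually produced, not when this module is imported
        import matplotlib.pylab as plt
        import pandas as pd

        df = pd.DataFrame({"epoch": [1, 2, 3], "loss": [0.9, 0.5, 0.3]})
        df.plot(x="epoch", y="loss")
        plt.savefig("report.png")
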
27 changes: 17 additions & 10 deletions tests/conftest.py
@@ -172,13 +172,20 @@ def single_process_pg():


def pytest_collection_modifyitems(items):
if os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1":
return
# filter out non-standalone tests
items[:] = [
item
for item in items
for marker in item.own_markers
# has `@RunIf(standalone=True)`
if marker.name == "skipif" and marker.kwargs.get("standalone")
]
# filter out special tests
if os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1":
items[:] = [
item
for item in items
for marker in item.own_markers
# has `@RunIf(standalone=True)`
if marker.name == "skipif" and marker.kwargs.get("standalone")
]
elif os.getenv("PL_RUN_SLOW_TESTS", "0") == "1":
items[:] = [
item
for item in items
for marker in item.own_markers
# has `@RunIf(slow=True)`
if marker.name == "skipif" and marker.kwargs.get("slow")
]
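
How the gating fits together: `RunIf(slow=True)` (see `tests/helpers/runif.py` below) stores an extra `slow=True` keyword on its `skipif` mark, and the hook above reads it back through `marker.kwargs.get("slow")`. A rough sketch of the resulting behaviour, with hypothetical test names:

    from tests.helpers.runif import RunIf


    def test_fast():
        ...


    @RunIf(slow=True)
    def test_slow():
        ...

    # Default run: both are collected, but test_slow is skipped because
    # PL_RUN_SLOW_TESTS is unset and the skipif condition is True.
    # With PL_RUN_SLOW_TESTS=1: the hook narrows collection to test_slow,
    # its skipif condition is False, and only the slow test executes.
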
9 changes: 9 additions & 0 deletions tests/helpers/runif.py
@@ -73,6 +73,7 @@ def __new__(
skip_49370: bool = False,
skip_hanging_spawn: bool = False,
omegaconf: bool = False,
slow: bool = False,
**kwargs,
):
"""
@@ -97,6 +98,7 @@ def __new__(
skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370.
skip_hanging_spawn: Skip the test as it's impacted by hanging loggers on spawn.
omegaconf: Require that omry/omegaconf is installed.
slow: Mark the test as slow; our CI will run it in a separate job.
**kwargs: Any :class:`pytest.mark.skipif` keyword arguments.
"""
conditions = []
@@ -195,6 +197,13 @@ def __new__(
conditions.append(not _OMEGACONF_AVAILABLE)
reasons.append("omegaconf")

if slow:
env_flag = os.getenv("PL_RUN_SLOW_TESTS", "0")
conditions.append(env_flag != "1")
reasons.append("Slow test")
# used in tests/conftest.py::pytest_collection_modifyitems
kwargs["slow"] = True

reasons = [rs for cond, rs in zip(conditions, reasons) if cond]
return pytest.mark.skipif(
*args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs
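
For a test with no other requirements, `RunIf(slow=True)` boils down to roughly the following hand-written mark (a sketch derived from the code above; the real decorator also folds in any other requested conditions):

    import os

    import pytest

    # skipped unless PL_RUN_SLOW_TESTS=1, and tagged with slow=True so the
    # collection hook in tests/conftest.py can single it out
    slow_only = pytest.mark.skipif(
        condition=os.getenv("PL_RUN_SLOW_TESTS", "0") != "1",
        reason="Requires: [Slow test]",
        slow=True,
    )
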
3 changes: 1 addition & 2 deletions tests/loops/test_loops.py
@@ -907,8 +907,7 @@ def val_dataloader(self):

@RunIf(min_torch="1.8.0")
@pytest.mark.parametrize("should_fail", [False, True])
# False is de-activated due to slowness
@pytest.mark.parametrize("persistent_workers", [True])
@pytest.mark.parametrize("persistent_workers", [pytest.param(False, marks=RunIf(slow=True)), True])
def test_workers_are_shutdown(tmpdir, should_fail, persistent_workers):
# `num_workers == 1` uses `_MultiProcessingDataLoaderIter`
# `persistent_workers` makes sure `self._iterator` gets set on the `DataLoader` instance
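
Instead of leaving the `False` case disabled, the change routes it to the slow job: `pytest.param(..., marks=...)` attaches a mark to a single parametrized value rather than to the whole test. A small illustrative sketch (hypothetical test name):

    import pytest

    from tests.helpers.runif import RunIf


    @pytest.mark.parametrize(
        "persistent_workers",
        [
            pytest.param(False, marks=RunIf(slow=True)),  # only runs in the slow CI job
            True,  # still runs in the regular CI jobs
        ],
    )
    def test_example(persistent_workers):
        assert isinstance(persistent_workers, bool)
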
33 changes: 14 additions & 19 deletions tests/utilities/test_auto_restart.py
@@ -177,9 +177,8 @@ def test_fast_forward_on_sequential_sampler():
assert next(batch_sampler_iter) == [6, 7, 8]


@pytest.mark.skipif(torch.cuda.is_available(), reason="todo (tchaton) Need more investigation")
def test_fast_forward_on_random_sampler():
"""This test ensures ``FastForwardSampler`` applied to ``RandomSampler`` correctly retrived the right next
"""This test ensures ``FastForwardSampler`` applied to ``RandomSampler`` correctly retrieved the right next
batch on restart."""
seed = 42
seed_everything(42)
@@ -255,8 +254,9 @@ def __next__(self):


@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 30 sec and should be skipped in Azure CI")
@pytest.mark.parametrize("num_workers", [0, 1, 2])
@pytest.mark.parametrize(
"num_workers", [0, pytest.param(1, marks=RunIf(slow=True)), pytest.param(2, marks=RunIf(slow=True))]
)
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
def test_fast_forward_sampler_over_iterable_dataset(num_workers):
"""This test ensures ``FastForwardSampler`` and ``CaptureIterableDataset`` are properly being used to capture
@@ -368,8 +368,7 @@ def _test_fast_forward_sampler_with_distributed_sampler(rank, worldsize):
assert sampler.state_dict(num_yielded)[0]["current_iteration"] == 16


@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 25 sec and should be skipped in Azure CI")
@RunIf(skip_windows=True)
@RunIf(skip_windows=True, slow=True)
def test_fast_forward_sampler_with_distributed_sampler():
"""Make sure result logging works with DDP."""
tutils.set_random_main_port()
@@ -638,14 +637,13 @@ def all_gather(tensor, world_size):


@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 45 sec and should be skipped in Azure CI")
@RunIf(slow=True)
def test_fast_forward_sampler_iterative_dataset():
_test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset(0, 1)


@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 55 sec and should be skipped in Azure CI")
@RunIf(skip_windows=True)
@RunIf(skip_windows=True, slow=True)
def test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset():
"""Make sure result logging works with DDP."""
tutils.set_random_main_port()
@@ -668,9 +666,9 @@ def create_iterable_dataset(batch_size, num_workers, attr_name="iter_sampler", w
return dataset


@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic", lambda x, y: None)
@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic")
@pytest.mark.parametrize("use_fault_tolerant", ["0", "1"])
def test_data_loading_wraps_dataset_and_samplers(use_fault_tolerant, tmpdir):
def test_data_loading_wraps_dataset_and_samplers(_, tmpdir, use_fault_tolerant):
"""This test ensures the dataset and sampler are properly wrapped when fault tolerant is enabled."""

class CustomBatchSampler(BatchSampler):
@@ -771,7 +769,7 @@ def __len__(self):
# RandomGeneratorGetItemDataset,
],
)
@pytest.mark.parametrize("num_workers", [0])
@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=RunIf(slow=True))])
@pytest.mark.parametrize("batch_size", [1, 2, 3])
def test_dataset_rng_states_restart(dataset_class, num_workers, batch_size):
"""Test that the sequence of batches coming from a random number generator continues with the correct sequence
@@ -897,10 +895,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_
return model.seen_batches, model.parameters()


# this test would be expected to fail since `fault_tolerant` doesn't support multiple datasets,
# but it works because the dataset is fully deterministic and therefore
# there is no overlap between the seeds.
@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic", lambda x, y: None)
@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic")
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.parametrize(
"dataset_classes",
@@ -916,7 +911,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_
],
)
@pytest.mark.parametrize("multiple_trainloader_mode", ["min_size", "max_size_cycle"])
def test_dataset_rng_states_restart_with_lightning(tmpdir, dataset_classes, multiple_trainloader_mode):
def test_dataset_rng_states_restart_with_lightning(_, tmpdir, dataset_classes, multiple_trainloader_mode):
"""Test that the Trainer can resume from a failed run in the case of several types of datasets."""
trainer_kwargs = dict(
default_root_dir=tmpdir,
@@ -1384,10 +1379,10 @@ def test_collect_states_with_collection():
assert generated == [{"a": {0: state}, "b": [{"a": {0: state}}]}]


@pytest.mark.parametrize("num_workers", [0])
# FIXME(@tchaton): >0 num_workers failing
@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=[RunIf(slow=True), pytest.mark.xfail()])])
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "2"})
def test_stateful_workers(num_workers):

seed_everything(42)

_get_iterator_fn = DataLoader._get_iterator
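
One more pattern worth calling out: the `_validate_fault_tolerant_automatic` patches changed from supplying a replacement `lambda` to a bare `@mock.patch(...)`. Without a replacement object, `unittest.mock` creates a `MagicMock` and passes it to the test as the first positional argument, which is why the signatures above gain a leading `_`. A minimal sketch of that behaviour against a standard-library function (purely illustrative, not the Lightning helper):

    import os
    from unittest import mock


    # a bare @mock.patch(...) injects the MagicMock it creates as the first
    # positional argument of the decorated test
    @mock.patch("os.path.exists")
    def test_exists_is_mocked(mock_exists):
        mock_exists.return_value = True
        assert os.path.exists("/definitely/not/a/real/path")
        mock_exists.assert_called_once_with("/definitely/not/a/real/path")
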