Add separate CI job for slow tests (#10830)
carmocca committed Dec 1, 2021
1 parent 619ef7a commit 2697704
Showing 6 changed files with 127 additions and 34 deletions.
83 changes: 83 additions & 0 deletions .github/workflows/ci_test-slow.yml
@@ -0,0 +1,83 @@
name: Test

# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
on: # Trigger the workflow on push or pull request, but only for the master branch
push:
branches: [master, "release/*"]
pull_request:
branches: [master, "release/*"]

jobs:
slow:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macOS-latest]
# same config as '.azure-pipelines/gpu-tests.yml'
python-version: ["3.7"]
pytorch-version: ["1.8"]

timeout-minutes: 20
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}

- name: Weekly reset caching
run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"
id: times

- name: Get pip cache
id: pip-cache
run: |
python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)"
- name: Cache pip
uses: actions/cache@v2
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}
restore-keys: |
${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-
- name: Install dependencies
run: |
# adjust versions according installed Torch version
python ./requirements/adjust_versions.py requirements.txt ${{ matrix.pytorch-version }}
pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
pip install --requirement requirements/test.txt
pip list
shell: bash

- name: Tests
run: |
coverage run --source pytorch_lightning -m pytest tests -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml
env:
PL_RUN_SLOW_TESTS: 1

- name: Upload pytest test results
uses: actions/upload-artifact@v2
with:
name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}
path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml
if: failure()

- name: Statistics
if: success()
run: |
coverage report
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
if: success()
# see: https://github.com/actions/toolkit/issues/399
continue-on-error: true
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: coverage.xml
flags: cpu,pytest,torch${{ matrix.pytorch-version }}
name: CPU-coverage
fail_ci_if_error: false
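
The "Weekly reset caching" step rotates the pip cache key once per week by turning the current Unix time into a week counter. A minimal sketch of the same computation, runnable locally (illustrative, not part of this commit):

import time

# Days elapsed since the Unix epoch.
days = time.time() / 60 / 60 / 24

# Integer week counter; the workflow embeds this value in the cache key,
# so the key changes (and the pip cache is rebuilt) once per week.
period = int(days / 7)
print(period)
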
6 changes: 3 additions & 3 deletions tests/benchmarks/generate_comparison.py
@@ -13,9 +13,6 @@
# limitations under the License.
import os

import matplotlib.pylab as plt
import pandas as pd

from tests.benchmarks.test_basic_parity import measure_loops
from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN

@@ -27,6 +24,9 @@


def _main():
import matplotlib.pylab as plt
import pandas as pd

fig, axarr = plt.subplots(nrows=len(MODEL_CLASSES))

for i, cls_model in enumerate(MODEL_CLASSES):
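
Moving the matplotlib and pandas imports into `_main()` defers them until the script is actually run, so importing the benchmarks module (for example during test collection) no longer requires the plotting dependencies. A generic sketch of the lazy-import pattern (function and variable names are illustrative, not from this repository):

def plot_results(results):
    # Heavy, optional dependencies are imported lazily so that merely
    # importing this module does not require them to be installed.
    import matplotlib.pylab as plt

    fig, ax = plt.subplots()
    ax.plot(results)
    return fig
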
27 changes: 17 additions & 10 deletions tests/conftest.py
@@ -191,13 +191,20 @@ def single_process_pg():


def pytest_collection_modifyitems(items):
if os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1":
return
# filter out non-standalone tests
items[:] = [
item
for item in items
for marker in item.own_markers
# has `@RunIf(standalone=True)`
if marker.name == "skipif" and marker.kwargs.get("standalone")
]
# filter out special tests
if os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1":
items[:] = [
item
for item in items
for marker in item.own_markers
# has `@RunIf(standalone=True)`
if marker.name == "skipif" and marker.kwargs.get("standalone")
]
elif os.getenv("PL_RUN_SLOW_TESTS", "0") == "1":
items[:] = [
item
for item in items
for marker in item.own_markers
# has `@RunIf(slow=True)`
if marker.name == "skipif" and marker.kwargs.get("slow")
]
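
The hook now branches on two environment flags: with PL_RUN_STANDALONE_TESTS=1 only items carrying a `skipif` marker created with `standalone=True` survive collection, and with PL_RUN_SLOW_TESTS=1 only those whose marker carries `slow=True` do. A self-contained sketch of the slow-test branch, using made-up stand-ins for pytest's collected items (FakeMarker and FakeItem are illustrative, not real pytest classes):

import os

class FakeMarker:
    def __init__(self, name, **kwargs):
        self.name = name
        self.kwargs = kwargs

class FakeItem:
    def __init__(self, own_markers):
        self.own_markers = own_markers

def filter_slow(items):
    # Mirrors the conftest logic: keep only items whose skipif marker
    # was created with slow=True, i.e. tests decorated with @RunIf(slow=True).
    if os.getenv("PL_RUN_SLOW_TESTS", "0") == "1":
        items[:] = [
            item
            for item in items
            for marker in item.own_markers
            if marker.name == "skipif" and marker.kwargs.get("slow")
        ]

os.environ["PL_RUN_SLOW_TESTS"] = "1"
items = [FakeItem([FakeMarker("skipif", slow=True)]), FakeItem([])]
filter_slow(items)
print(len(items))  # 1, only the slow-marked item remains
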
9 changes: 9 additions & 0 deletions tests/helpers/runif.py
@@ -73,6 +73,7 @@ def __new__(
skip_49370: bool = False,
skip_hanging_spawn: bool = False,
omegaconf: bool = False,
slow: bool = False,
**kwargs,
):
"""
@@ -97,6 +98,7 @@ def __new__(
skip_49370: Skip the test as it's impacted by https://github.com/pytorch/pytorch/issues/49370.
skip_hanging_spawn: Skip the test as it's impacted by hanging loggers on spawn.
omegaconf: Require that omry/omegaconf is installed.
slow: Mark the test as slow, our CI will run it in a separate job.
**kwargs: Any :class:`pytest.mark.skipif` keyword arguments.
"""
conditions = []
@@ -195,6 +197,13 @@ def __new__(
conditions.append(not _OMEGACONF_AVAILABLE)
reasons.append("omegaconf")

if slow:
env_flag = os.getenv("PL_RUN_SLOW_TESTS", "0")
conditions.append(env_flag != "1")
reasons.append("Slow test")
# used in tests/conftest.py::pytest_collection_modifyitems
kwargs["slow"] = True

reasons = [rs for cond, rs in zip(conditions, reasons) if cond]
return pytest.mark.skipif(
*args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs
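
With the new argument, a test opts into the slow CI job simply by being decorated with `RunIf(slow=True)`; outside that job it is skipped because PL_RUN_SLOW_TESTS is not set to "1". A sketch of how such a test might look, assuming it lives inside this repository's tests package (the test body itself is hypothetical):

from tests.helpers.runif import RunIf

@RunIf(slow=True)
def test_expensive_code_path():
    # Skipped unless PL_RUN_SLOW_TESTS=1, which the new CI job exports.
    assert sum(range(10_000)) == 49_995_000

Locally, such tests can be exercised with `PL_RUN_SLOW_TESTS=1 python -m pytest tests -v`, mirroring the env block of the workflow's "Tests" step.
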
3 changes: 1 addition & 2 deletions tests/loops/test_loops.py
@@ -907,8 +907,7 @@ def val_dataloader(self):

@RunIf(min_torch="1.8.0")
@pytest.mark.parametrize("should_fail", [False, True])
# False is de-activated due to slowness
@pytest.mark.parametrize("persistent_workers", [True])
@pytest.mark.parametrize("persistent_workers", [pytest.param(False, marks=RunIf(slow=True)), True])
def test_workers_are_shutdown(tmpdir, should_fail, persistent_workers):
# `num_workers == 1` uses `_MultiProcessingDataLoaderIter`
# `persistent_workers` makes sure `self._iterator` gets set on the `DataLoader` instance
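
`pytest.param` lets an individual parametrized value carry its own marks, which is how only the `persistent_workers=False` case is routed to the slow job while `True` keeps running everywhere. A generic sketch of the mechanism, independent of this test (names and marks are illustrative):

import pytest

@pytest.mark.parametrize(
    "value",
    [
        # Only this case gets the extra mark; the other value runs unconditionally.
        pytest.param(0, marks=pytest.mark.skip(reason="illustrative per-parameter mark")),
        1,
    ],
)
def test_per_parameter_marks(value):
    assert value == 1
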
33 changes: 14 additions & 19 deletions tests/utilities/test_auto_restart.py
@@ -177,9 +177,8 @@ def test_fast_forward_on_sequential_sampler():
assert next(batch_sampler_iter) == [6, 7, 8]


@pytest.mark.skipif(torch.cuda.is_available(), reason="todo (tchaton) Need more investigation")
def test_fast_forward_on_random_sampler():
"""This test ensures ``FastForwardSampler`` applied to ``RandomSampler`` correctly retrived the right next
"""This test ensures ``FastForwardSampler`` applied to ``RandomSampler`` correctly retrieved the right next
batch on restart."""
seed = 42
seed_everything(42)
@@ -255,8 +254,9 @@ def __next__(self):


@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 30 sec and should be skipped in Azure CI")
@pytest.mark.parametrize("num_workers", [0, 1, 2])
@pytest.mark.parametrize(
"num_workers", [0, pytest.param(1, marks=RunIf(slow=True)), pytest.param(2, marks=RunIf(slow=True))]
)
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
def test_fast_forward_sampler_over_iterable_dataset(num_workers):
"""This test ensures ``FastForwardSampler`` and ``CaptureIterableDataset`` are properly being used to capture
@@ -368,8 +368,7 @@ def _test_fast_forward_sampler_with_distributed_sampler(rank, worldsize):
assert sampler.state_dict(num_yielded)[0]["current_iteration"] == 16


@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 25 sec and should be skipped in Azure CI")
@RunIf(skip_windows=True)
@RunIf(skip_windows=True, slow=True)
def test_fast_forward_sampler_with_distributed_sampler():
"""Make sure result logging works with DDP."""
tutils.set_random_main_port()
@@ -638,14 +637,13 @@ def all_gather(tensor, world_size):


@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 45 sec and should be skipped in Azure CI")
@RunIf(slow=True)
def test_fast_forward_sampler_iterative_dataset():
_test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset(0, 1)


@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 55 sec and should be skipped in Azure CI")
@RunIf(skip_windows=True)
@RunIf(skip_windows=True, slow=True)
def test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset():
"""Make sure result logging works with DDP."""
tutils.set_random_main_port()
@@ -668,9 +666,9 @@ def create_iterable_dataset(batch_size, num_workers, attr_name="iter_sampler", w
return dataset


@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic", lambda x, y: None)
@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic")
@pytest.mark.parametrize("use_fault_tolerant", ["0", "1"])
def test_data_loading_wraps_dataset_and_samplers(use_fault_tolerant, tmpdir):
def test_data_loading_wraps_dataset_and_samplers(_, tmpdir, use_fault_tolerant):
"""This test ensures the dataset and sampler are properly wrapped when fault tolerant is enabled."""

class CustomBatchSampler(BatchSampler):
@@ -771,7 +769,7 @@ def __len__(self):
# RandomGeneratorGetItemDataset,
],
)
@pytest.mark.parametrize("num_workers", [0])
@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=RunIf(slow=True))])
@pytest.mark.parametrize("batch_size", [1, 2, 3])
def test_dataset_rng_states_restart(dataset_class, num_workers, batch_size):
"""Test that the sequence of batches coming from a random number generator continues with the correct sequence
@@ -897,10 +895,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_
return model.seen_batches, model.parameters()


# this test will fail `fault_tolerant` don't support multiple datasets.
# this tests works as the dataset is fully deterministic and therefore
# there is not overall between the seeds.
@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic", lambda x, y: None)
@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic")
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"})
@pytest.mark.parametrize(
"dataset_classes",
@@ -916,7 +911,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_
],
)
@pytest.mark.parametrize("multiple_trainloader_mode", ["min_size", "max_size_cycle"])
def test_dataset_rng_states_restart_with_lightning(tmpdir, dataset_classes, multiple_trainloader_mode):
def test_dataset_rng_states_restart_with_lightning(_, tmpdir, dataset_classes, multiple_trainloader_mode):
"""Test that the Trainer can resume from a failed run in the case of several types of datasets."""
trainer_kwargs = dict(
default_root_dir=tmpdir,
@@ -1384,10 +1379,10 @@ def test_collect_states_with_collection():
assert generated == [{"a": {0: state}, "b": [{"a": {0: state}}]}]


@pytest.mark.parametrize("num_workers", [0])
# FIXME(@tchaton): >0 num_workers failing
@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=[RunIf(slow=True), pytest.mark.xfail()])])
@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "2"})
def test_stateful_workers(num_workers):

seed_everything(42)

_get_iterator_fn = DataLoader._get_iterator
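
Several of the changes above replace `mock.patch(target, lambda x, y: None)` with a bare `mock.patch(target)` and give the test a leading `_` parameter: when `mock.patch` is used as a decorator without an explicit replacement object, it injects the MagicMock it creates as the first positional argument of the decorated function. A standalone sketch of both forms, patching `os.getcwd` purely for illustration:

import os
from unittest import mock

# An explicit replacement is supplied: nothing is injected into the function.
@mock.patch("os.getcwd", lambda: "/tmp")
def with_replacement():
    return os.getcwd()

# No replacement given: the created MagicMock is passed as the first argument.
@mock.patch("os.getcwd")
def without_replacement(mocked_getcwd):
    os.getcwd()
    mocked_getcwd.assert_called_once()

print(with_replacement())  # /tmp
without_replacement()
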
