From 26977043bf72acbb4be704af5c6153212d1a282d Mon Sep 17 00:00:00 2001
From: Carlos Mocholí
Date: Wed, 1 Dec 2021 20:58:18 +0100
Subject: [PATCH] Add separate CI job for slow tests (#10830)

---
 .github/workflows/ci_test-slow.yml      | 83 +++++++++++++++++++++++++
 tests/benchmarks/generate_comparison.py |  6 +-
 tests/conftest.py                       | 27 +++++---
 tests/helpers/runif.py                  |  9 +++
 tests/loops/test_loops.py               |  3 +-
 tests/utilities/test_auto_restart.py    | 33 +++++-----
 6 files changed, 127 insertions(+), 34 deletions(-)
 create mode 100644 .github/workflows/ci_test-slow.yml

diff --git a/.github/workflows/ci_test-slow.yml b/.github/workflows/ci_test-slow.yml
new file mode 100644
index 0000000000000..b9a07c4b6a26f
--- /dev/null
+++ b/.github/workflows/ci_test-slow.yml
@@ -0,0 +1,83 @@
+name: Test
+
+# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows
+on:  # Trigger the workflow on push or pull request, but only for the master and release branches
+  push:
+    branches: [master, "release/*"]
+  pull_request:
+    branches: [master, "release/*"]
+
+jobs:
+  slow:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macOS-latest]
+        # same config as '.azure-pipelines/gpu-tests.yml'
+        python-version: ["3.7"]
+        pytorch-version: ["1.8"]
+
+    timeout-minutes: 20
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Weekly reset caching
+        run: echo "::set-output name=period::$(python -c 'import time ; days = time.time() / 60 / 60 / 24 ; print(int(days / 7))' 2>&1)"
+        id: times
+
+      - name: Get pip cache
+        id: pip-cache
+        run: |
+          python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)"
+
+      - name: Cache pip
+        uses: actions/cache@v2
+        with:
+          path: ${{ steps.pip-cache.outputs.dir }}
+          key: ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-${{ hashFiles('requirements.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-td${{ steps.times.outputs.period }}-py${{ matrix.python-version }}-
+
+      - name: Install dependencies
+        run: |
+          # adjust versions according to the installed Torch version
+          python ./requirements/adjust_versions.py requirements.txt ${{ matrix.pytorch-version }}
+          pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
+          pip install --requirement requirements/test.txt
+          pip list
+        shell: bash
+
+      - name: Tests
+        run: |
+          coverage run --source pytorch_lightning -m pytest tests -v --junitxml=junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml
+        env:
+          PL_RUN_SLOW_TESTS: 1
+
+      - name: Upload pytest test results
+        uses: actions/upload-artifact@v2
+        with:
+          name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}
+          path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}.xml
+        if: failure()
+
+      - name: Statistics
+        if: success()
+        run: |
+          coverage report
+          coverage xml
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v1
+        if: success()
+        # see: https://github.com/actions/toolkit/issues/399
+        continue-on-error: true
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: coverage.xml
+          flags: cpu,pytest,torch${{ matrix.pytorch-version }}
+          name: CPU-coverage
+          fail_ci_if_error: false
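Note: the `Weekly reset caching` step above folds the current Unix time into a whole-week index, so the pip cache key (its `-td<period>-` segment) changes once every seven days and dependencies are re-resolved weekly. A minimal standalone sketch of that computation, assuming nothing beyond the step's own one-liner:

    import time

    def weekly_period(now=None):
        # Whole weeks elapsed since the Unix epoch; this integer is embedded
        # in the actions/cache key as the `-td<period>-` segment above.
        now = time.time() if now is None else now
        days = now / 60 / 60 / 24
        return int(days / 7)

    # Around the patch date (2021-12-01) this prints 2708; one week later it
    # becomes 2709 and every cache key rolls over.
    print(weekly_period())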
diff --git a/tests/benchmarks/generate_comparison.py b/tests/benchmarks/generate_comparison.py
index bc95b5d9cf591..984ffdd19c163 100644
--- a/tests/benchmarks/generate_comparison.py
+++ b/tests/benchmarks/generate_comparison.py
@@ -13,9 +13,6 @@
 # limitations under the License.
 import os
 
-import matplotlib.pylab as plt
-import pandas as pd
-
 from tests.benchmarks.test_basic_parity import measure_loops
 from tests.helpers.advanced_models import ParityModuleMNIST, ParityModuleRNN
 
@@ -27,6 +24,9 @@
 
 
 def _main():
+    import matplotlib.pylab as plt
+    import pandas as pd
+
     fig, axarr = plt.subplots(nrows=len(MODEL_CLASSES))
 
     for i, cls_model in enumerate(MODEL_CLASSES):
diff --git a/tests/conftest.py b/tests/conftest.py
index 5871921e6c3dd..ae3c6435515f1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -191,13 +191,20 @@ def single_process_pg():
 
 
 def pytest_collection_modifyitems(items):
-    if os.getenv("PL_RUN_STANDALONE_TESTS", "0") != "1":
-        return
-    # filter out non-standalone tests
-    items[:] = [
-        item
-        for item in items
-        for marker in item.own_markers
-        # has `@RunIf(standalone=True)`
-        if marker.name == "skipif" and marker.kwargs.get("standalone")
-    ]
+    # filter out special tests
+    if os.getenv("PL_RUN_STANDALONE_TESTS", "0") == "1":
+        items[:] = [
+            item
+            for item in items
+            for marker in item.own_markers
+            # has `@RunIf(standalone=True)`
+            if marker.name == "skipif" and marker.kwargs.get("standalone")
+        ]
+    elif os.getenv("PL_RUN_SLOW_TESTS", "0") == "1":
+        items[:] = [
+            item
+            for item in items
+            for marker in item.own_markers
+            # has `@RunIf(slow=True)`
+            if marker.name == "skipif" and marker.kwargs.get("slow")
+        ]
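Note: this hook pairs with `RunIf` below, which attaches a `skipif` marker whose kwargs carry the `standalone`/`slow` flag that the list comprehensions above look for. A hypothetical test module illustrating the resulting behavior (the test names are made up for this sketch):

    from tests.helpers.runif import RunIf


    @RunIf(slow=True)
    def test_expensive_path():
        ...  # kept in the collection only when PL_RUN_SLOW_TESTS=1


    def test_cheap_path():
        ...  # dropped from the collection when PL_RUN_SLOW_TESTS=1

With this in place, `PL_RUN_SLOW_TESTS=1 pytest tests` runs exactly the slow subset, while a plain `pytest tests` collects everything but skips the slow tests through their `skipif` condition.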
""" conditions = [] @@ -195,6 +197,13 @@ def __new__( conditions.append(not _OMEGACONF_AVAILABLE) reasons.append("omegaconf") + if slow: + env_flag = os.getenv("PL_RUN_SLOW_TESTS", "0") + conditions.append(env_flag != "1") + reasons.append("Slow test") + # used in tests/conftest.py::pytest_collection_modifyitems + kwargs["slow"] = True + reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( *args, condition=any(conditions), reason=f"Requires: [{' + '.join(reasons)}]", **kwargs diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 6338ed00e481d..b1f93d82ab616 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -907,8 +907,7 @@ def val_dataloader(self): @RunIf(min_torch="1.8.0") @pytest.mark.parametrize("should_fail", [False, True]) -# False is de-activated due to slowness -@pytest.mark.parametrize("persistent_workers", [True]) +@pytest.mark.parametrize("persistent_workers", [pytest.param(False, marks=RunIf(slow=True)), True]) def test_workers_are_shutdown(tmpdir, should_fail, persistent_workers): # `num_workers == 1` uses `_MultiProcessingDataLoaderIter` # `persistent_workers` makes sure `self._iterator` gets set on the `DataLoader` instance diff --git a/tests/utilities/test_auto_restart.py b/tests/utilities/test_auto_restart.py index 4c2c440797dd2..e33bc91621a2b 100644 --- a/tests/utilities/test_auto_restart.py +++ b/tests/utilities/test_auto_restart.py @@ -177,9 +177,8 @@ def test_fast_forward_on_sequential_sampler(): assert next(batch_sampler_iter) == [6, 7, 8] -@pytest.mark.skipif(torch.cuda.is_available(), reason="todo (tchaton) Need more investigation") def test_fast_forward_on_random_sampler(): - """This test ensures ``FastForwardSampler`` applied to ``RandomSampler`` correctly retrived the right next + """This test ensures ``FastForwardSampler`` applied to ``RandomSampler`` correctly retrieved the right next batch on restart.""" seed = 42 seed_everything(42) @@ -255,8 +254,9 @@ def __next__(self): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 30 sec and should be skipped in Azure CI") -@pytest.mark.parametrize("num_workers", [0, 1, 2]) +@pytest.mark.parametrize( + "num_workers", [0, pytest.param(1, marks=RunIf(slow=True)), pytest.param(2, marks=RunIf(slow=True))] +) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) def test_fast_forward_sampler_over_iterable_dataset(num_workers): """This test ensures ``FastForwardSampler`` and ``CaptureIterableDataset`` are properly being used to capture @@ -368,8 +368,7 @@ def _test_fast_forward_sampler_with_distributed_sampler(rank, worldsize): assert sampler.state_dict(num_yielded)[0]["current_iteration"] == 16 -@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 25 sec and should be skipped in Azure CI") -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, slow=True) def test_fast_forward_sampler_with_distributed_sampler(): """Make sure result logging works with DDP.""" tutils.set_random_main_port() @@ -638,14 +637,13 @@ def all_gather(tensor, world_size): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 45 sec and should be skipped in Azure CI") +@RunIf(slow=True) def test_fast_forward_sampler_iterative_dataset(): _test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset(0, 1) @mock.patch.dict(os.environ, 
{"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.skipif(torch.cuda.is_available(), reason="This test takes around 55 sec and should be skipped in Azure CI") -@RunIf(skip_windows=True) +@RunIf(skip_windows=True, slow=True) def test_fast_forward_sampler_with_distributed_sampler_and_iterative_dataset(): """Make sure result logging works with DDP.""" tutils.set_random_main_port() @@ -668,9 +666,9 @@ def create_iterable_dataset(batch_size, num_workers, attr_name="iter_sampler", w return dataset -@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic", lambda x, y: None) +@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic") @pytest.mark.parametrize("use_fault_tolerant", ["0", "1"]) -def test_data_loading_wraps_dataset_and_samplers(use_fault_tolerant, tmpdir): +def test_data_loading_wraps_dataset_and_samplers(_, tmpdir, use_fault_tolerant): """This test ensures the dataset and sampler are properly wrapped when fault tolerant is enabled.""" class CustomBatchSampler(BatchSampler): @@ -771,7 +769,7 @@ def __len__(self): # RandomGeneratorGetItemDataset, ], ) -@pytest.mark.parametrize("num_workers", [0]) +@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=RunIf(slow=True))]) @pytest.mark.parametrize("batch_size", [1, 2, 3]) def test_dataset_rng_states_restart(dataset_class, num_workers, batch_size): """Test that the sequence of batches coming from a random number generator continues with the correct sequence @@ -897,10 +895,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_ return model.seen_batches, model.parameters() -# this test will fail `fault_tolerant` don't support multiple datasets. -# this tests works as the dataset is fully deterministic and therefore -# there is not overall between the seeds. -@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic", lambda x, y: None) +@mock.patch("pytorch_lightning.trainer.data_loading._validate_fault_tolerant_automatic") @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize( "dataset_classes", @@ -916,7 +911,7 @@ def _run_training(trainer_kwargs, dataset_classes, fail_on_step: int = -1, ckpt_ ], ) @pytest.mark.parametrize("multiple_trainloader_mode", ["min_size", "max_size_cycle"]) -def test_dataset_rng_states_restart_with_lightning(tmpdir, dataset_classes, multiple_trainloader_mode): +def test_dataset_rng_states_restart_with_lightning(_, tmpdir, dataset_classes, multiple_trainloader_mode): """Test that the Trainer can resume from a failed run in the case of several types of datasets.""" trainer_kwargs = dict( default_root_dir=tmpdir, @@ -1384,10 +1379,10 @@ def test_collect_states_with_collection(): assert generated == [{"a": {0: state}, "b": [{"a": {0: state}}]}] -@pytest.mark.parametrize("num_workers", [0]) +# FIXME(@tchaton): >0 num_workers failing +@pytest.mark.parametrize("num_workers", [0, pytest.param(2, marks=[RunIf(slow=True), pytest.mark.xfail()])]) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "2"}) def test_stateful_workers(num_workers): - seed_everything(42) _get_iterator_fn = DataLoader._get_iterator