allenai · epwalsh · May 12, 2020 · May 12, 2020 · May 12, 2020 · May 12, 2020
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -84,3 +84,21 @@ jobs:
     - name: Run pretrained tests
       run: |
         make docker-test-run DOCKER_TAG=$DOCKER_TAG ARGS='test-pretrained'
+
+  gpu_checks:  # GPU job: build the test image, then run make target gpu-test
+    runs-on: [self-hosted, GPU]
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set Docker tag  # exported via GITHUB_ENV; `::set-env` is disabled
+      run: |
+        echo "DOCKER_TAG=$GITHUB_SHA" >> "$GITHUB_ENV"
+
+    - name: Build test image
+      run: |
+        make docker-test-image DOCKER_TAG=$DOCKER_TAG
+
+    - name: Run GPU tests
+      run: |
+        make docker-test-run DOCKER_TAG=$DOCKER_TAG ARGS='gpu-test'
diff --git a/Dockerfile.test b/Dockerfile.test
@@ -4,7 +4,6 @@ FROM python:3.7
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 
-ENV PATH /usr/local/nvidia/bin/:$PATH
 ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
 
 # Tell nvidia-docker the driver spec that we need as well as to

diff --git a/Makefile b/Makefile
@@ -15,12 +15,16 @@ format :
 
 .PHONY : typecheck
 typecheck :
-	mypy allennlp_models --ignore-missing-imports --no-strict-optional --no-site-packages
+	mypy allennlp_models tests --ignore-missing-imports --no-strict-optional --no-site-packages
 
 .PHONY : test
 test :
 	pytest --color=yes -rf --durations=40 -m "not pretrained_model_test"
 
+.PHONY : gpu-test
+gpu-test :  # Run only the tests selected by the "gpu" pytest marker.
+	pytest --color=yes -v -rf -m gpu
+
 .PHONY : test-with-cov
 test-with-cov :
 	pytest --color=yes -rf --cov-config=.coveragerc --cov=allennlp_models/ --durations=40 -m "not pretrained_model_test"

diff --git a/pytest.ini b/pytest.ini
@@ -1,10 +1,12 @@
 [pytest]
 testpaths = tests/
+python_classes = Test* *Test
 log_format = %(asctime)s - %(levelname)s - %(name)s - %(message)s
 log_level = DEBUG
 markers =
     pretrained_model_test
     java
+    gpu: marks tests that need at least one GPU
 filterwarnings =
 # Note: When a warning matches more than one option in the list,
 # the action for the _last_ matching option is performed.

diff --git a/tests/coref/coref_model_test.py b/tests/coref/coref_model_test.py
@@ -30,7 +30,7 @@ def _test_coref_model_can_train_save_and_load(
         )
         # fmt: on
         self.ensure_model_can_train_save_and_load(self.param_file, overrides=overrides)
-        self.tearDown()
+        self.teardown_method()
         self.setup_method()
 
     def test_coref_bert_model_can_train_save_and_load(self):

diff --git a/tests/rc/qanet/qanet_model_test.py b/tests/rc/qanet/qanet_model_test.py
@@ -1,5 +1,3 @@
-import torch
-import pytest
 from flaky import flaky
 import numpy
 from numpy.testing import assert_almost_equal
@@ -8,9 +6,7 @@
 from allennlp.common.testing import ModelTestCase
 from allennlp.data import DatasetReader, Vocabulary
 from allennlp.data import Batch
-from allennlp.data import DataLoader
 from allennlp.models import Model
-from allennlp.training import Trainer
 
 from tests import FIXTURES_ROOT
 
@@ -52,17 +48,6 @@ def test_forward_pass_runs_correctly(self):
     def test_model_can_train_save_and_load(self):
         self.ensure_model_can_train_save_and_load(self.param_file, tolerance=1e-4)
 
-    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need multiple GPUs.")
-    def test_multigpu_qanet(self):
-        params = Params.from_file(self.param_file)
-        vocab = Vocabulary.from_instances(self.instances)
-        model = Model.from_params(vocab=vocab, params=params["model"]).cuda()
-        optimizer = torch.optim.SGD(self.model.parameters(), 0.01, momentum=0.9)
-        self.instances.index_with(model.vocab)
-        loader = DataLoader(self.instances, batch_size=4)
-        trainer = Trainer(model, optimizer, loader, num_epochs=2, cuda_device=[0, 1])
-        trainer.train()
-
     def test_batch_predictions_are_consistent(self):
         # The same issue as the bidaf test case.
         # The CNN encoder has problems with this kind of test - it's not properly masked yet, so

diff --git a/tests/rc/qanet/stacked_self_attention_test.py b/tests/rc/qanet/stacked_self_attention_test.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-from allennlp.common.testing import AllenNlpTestCase
+from allennlp.common.testing import AllenNlpTestCase, requires_multi_gpu
 
 from allennlp_models.rc.qanet.stacked_self_attention import StackedSelfAttentionEncoder
 
@@ -37,7 +37,7 @@ def test_stacked_self_attention_can_run_foward(self):
         encoder_output = encoder(inputs, None)
         assert list(encoder_output.size()) == [3, 5, 12]
 
-    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need multiple GPUs.")
+    @requires_multi_gpu
     def test_stacked_self_attention_can_run_foward_on_multiple_gpus(self):
         encoder = StackedSelfAttentionEncoder(
             input_dim=9,

diff --git a/tests/syntax/srl/bert_srl_model_test.py b/tests/syntax/srl/bert_srl_model_test.py
@@ -31,10 +31,10 @@ def setup_method(self):
             FIXTURES_ROOT / "syntax" / "srl" / "conll_2012",
         )
 
-    def tearDown(self):
+    def teardown_method(self):
         self.monkeypatch.undo()
         self.monkeypatch.undo()
-        super().tearDown()
+        super().teardown_method()
 
     def test_bert_srl_model_can_train_save_and_load(self):
         ignore_grads = {"bert_model.pooler.dense.weight", "bert_model.pooler.dense.bias"}