diff --git a/allennlp/data/dataset_readers/__init__.py b/allennlp/data/dataset_readers/__init__.py index ab8fd0b47dd..274d9d7e4ee 100644 --- a/allennlp/data/dataset_readers/__init__.py +++ b/allennlp/data/dataset_readers/__init__.py @@ -19,7 +19,3 @@ from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader from allennlp.data.dataset_readers.sharded_dataset_reader import ShardedDatasetReader from allennlp.data.dataset_readers.text_classification_json import TextClassificationJsonReader -from allennlp.data.dataset_readers.vision_reader import VisionReader -from allennlp.data.dataset_readers.vqav2 import VQAv2Reader -from allennlp.data.dataset_readers.visual_entailment import VisualEntailmentReader -from allennlp.data.dataset_readers.gqa import GQAReader diff --git a/allennlp/data/dataset_readers/dataset_reader.py b/allennlp/data/dataset_readers/dataset_reader.py index ddaf3d0931b..58614160b81 100644 --- a/allennlp/data/dataset_readers/dataset_reader.py +++ b/allennlp/data/dataset_readers/dataset_reader.py @@ -300,6 +300,12 @@ def shard_iterable(self, iterable: Iterable[_T]) -> Iterator[_T]: sharded_slice, self._worker_info.id, None, self._worker_info.num_workers ) + # We don't know for sure how many instances we have to produce. + # _multi_worker_islice() figures that out. But we know for sure + # it won't be more than max_instances. + if self.max_instances is not None: + sharded_slice = itertools.islice(sharded_slice, self.max_instances) + return sharded_slice def _multi_worker_islice( diff --git a/allennlp/data/dataset_readers/gqa.py b/allennlp/data/dataset_readers/gqa.py deleted file mode 100644 index 88a784342cd..00000000000 --- a/allennlp/data/dataset_readers/gqa.py +++ /dev/null @@ -1,213 +0,0 @@ -from os import PathLike -from typing import ( - Dict, - Union, - Optional, - Tuple, - Iterable, -) -import json -import os - -from overrides import overrides -import torch -from torch import Tensor - -from allennlp.common.file_utils import cached_path -from allennlp.common.lazy import Lazy -from allennlp.data.vocabulary import Vocabulary -from allennlp.data.dataset_readers.dataset_reader import DatasetReader -from allennlp.data.fields import ArrayField, LabelField, ListField, TextField -from allennlp.data.image_loader import ImageLoader -from allennlp.data.instance import Instance -from allennlp.data.token_indexers import TokenIndexer -from allennlp.data.tokenizers import Tokenizer -from allennlp.modules.vision.grid_embedder import GridEmbedder -from allennlp.modules.vision.region_detector import RegionDetector -from allennlp.data.dataset_readers.vision_reader import VisionReader - - -@DatasetReader.register("gqa") -class GQAReader(VisionReader): - """ - Parameters - ---------- - image_dir: `str` - Path to directory containing `png` image files. - image_loader : `ImageLoader` - image_featurizer: `Lazy[GridEmbedder]` - The backbone image processor (like a ResNet), whose output will be passed to the region - detector for finding object boxes in the image. - region_detector: `Lazy[RegionDetector]` - For pulling out regions of the image (both coordinates and features) that will be used by - downstream models. - data_dir: `str` - Path to directory containing text files for each dataset split. These files contain - the sentences and metadata for each task instance. 
- tokenizer: `Tokenizer`, optional - token_indexers: `Dict[str, TokenIndexer]` - """ - - def __init__( - self, - image_dir: Union[str, PathLike], - *, - image_loader: Optional[ImageLoader] = None, - image_featurizer: Optional[Lazy[GridEmbedder]] = None, - region_detector: Optional[Lazy[RegionDetector]] = None, - answer_vocab: Optional[Union[str, Vocabulary]] = None, - feature_cache_dir: Optional[Union[str, PathLike]] = None, - data_dir: Optional[Union[str, PathLike]] = None, - tokenizer: Tokenizer = None, - token_indexers: Dict[str, TokenIndexer] = None, - cuda_device: Optional[Union[int, torch.device]] = None, - max_instances: Optional[int] = None, - image_processing_batch_size: int = 8, - write_to_cache: bool = True, - ) -> None: - super().__init__( - image_dir, - image_loader=image_loader, - image_featurizer=image_featurizer, - region_detector=region_detector, - feature_cache_dir=feature_cache_dir, - tokenizer=tokenizer, - token_indexers=token_indexers, - cuda_device=cuda_device, - max_instances=max_instances, - image_processing_batch_size=image_processing_batch_size, - write_to_cache=write_to_cache, - ) - self.data_dir = data_dir - - # read answer vocab - if answer_vocab is None: - self.answer_vocab = None - else: - if isinstance(answer_vocab, str): - answer_vocab = cached_path(answer_vocab, extract_archive=True) - answer_vocab = Vocabulary.from_files(answer_vocab) - self.answer_vocab = frozenset( - answer_vocab.get_token_to_index_vocabulary("answers").keys() - ) - - @overrides - def _read(self, split_or_filename: str): - - if not self.data_dir: - self.data_dir = "https://nlp.stanford.edu/data/gqa/questions1.2.zip!" - - splits = { - "challenge_all": f"{self.data_dir}challenge_all_questions.json", - "challenge_balanced": f"{self.data_dir}challenge_balanced_questions.json", - "test_all": f"{self.data_dir}test_all_questions.json", - "test_balanced": f"{self.data_dir}test_balanced_questions.json", - "testdev_all": f"{self.data_dir}testdev_all_questions.json", - "testdev_balanced": f"{self.data_dir}testdev_balanced_questions.json", - "train_balanced": f"{self.data_dir}train_balanced_questions.json", - "train_all": f"{self.data_dir}train_all_questions", - "val_all": f"{self.data_dir}val_all_questions.json", - "val_balanced": f"{self.data_dir}val_balanced_questions.json", - } - - filename = splits.get(split_or_filename, split_or_filename) - filename = cached_path(filename, extract_archive=True) - - # If we're considering a directory of files (such as train_all) - # loop through each in file in generator - if os.path.isdir(filename): - files = [os.path.join(filename, file_path) for file_path in os.listdir(filename)] - else: - files = [filename] - - # Ensure order is deterministic. - files.sort() - - for data_file in files: - with open(data_file) as f: - questions_with_annotations = json.load(f) - - question_dicts = list( - self.shard_iterable( - questions_with_annotations[q_id] for q_id in questions_with_annotations - ) - ) - - processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]] - if self.produce_featurized_images: - # It would be much easier to just process one image at a time, but it's faster to process - # them in batches. So this code gathers up instances until it has enough to fill up a batch - # that needs processing, and then processes them all. 
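                # A concrete sketch of that flow (illustrative, not from the original file, but it
                # matches the VisionReader helpers used here): each filename below is looked up in
                # self.images to get a full path, and _process_image_paths() is a generator over
                # those paths. Cache hits, keyed by file basename, are yielded as (features,
                # coordinates) tuples immediately when no batch is pending, or queued so output
                # order matches input order; cache misses accumulate until
                # image_processing_batch_size of them are ready and are then featurized in a single
                # pass through image_loader, image_featurizer, and region_detector.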
- filenames = [f"{question_dict['imageId']}.jpg" for question_dict in question_dicts] - try: - processed_images = self._process_image_paths( - self.images[filename] for filename in filenames - ) - except KeyError as e: - missing_filename = e.args[0] - raise KeyError( - missing_filename, - f"We could not find an image with the name {missing_filename}. " - "Because of the size of the image datasets, we don't download them automatically. " - "Please download the images from" - "https://nlp.stanford.edu/data/gqa/images.zip, " - "extract them into a directory, and set the image_dir parameter to point to that " - "directory. This dataset reader does not care about the exact directory structure. It " - "finds the images wherever they are.", - ) - else: - processed_images = [None] * len(question_dicts) - - for question_dict, processed_image in zip(question_dicts, processed_images): - answer = { - "answer": question_dict["answer"], - } - instance = self.text_to_instance(question_dict["question"], processed_image, answer) - if instance is not None: - yield instance - - @overrides - def text_to_instance( - self, # type: ignore - question: str, - image: Optional[Union[str, Tuple[Tensor, Tensor]]], - answer: Optional[Dict[str, str]] = None, - *, - use_cache: bool = True, - ) -> Optional[Instance]: - from allennlp.data import Field - - tokenized_question = self._tokenizer.tokenize(question) - fields: Dict[str, Field] = {"question": TextField(tokenized_question, None)} - - if answer is not None: - labels_fields = [] - weights = [] - if not self.answer_vocab or answer["answer"] in self.answer_vocab: - labels_fields.append(LabelField(answer["answer"], label_namespace="answers")) - weights.append(1.0) - - if len(labels_fields) <= 0: - return None - - fields["label_weights"] = ArrayField(torch.tensor(weights)) - fields["labels"] = ListField(labels_fields) - - if image is not None: - if isinstance(image, str): - features, coords = next(self._process_image_paths([image], use_cache=use_cache)) - else: - features, coords = image - fields["box_features"] = ArrayField(features) - fields["box_coordinates"] = ArrayField(coords) - fields["box_mask"] = ArrayField( - features.new_ones((features.shape[0],), dtype=torch.bool), - padding_value=False, - dtype=torch.bool, - ) - - return Instance(fields) - - @overrides - def apply_token_indexers(self, instance: Instance) -> None: - instance["question"].token_indexers = self._token_indexers # type: ignore diff --git a/allennlp/data/dataset_readers/vision_reader.py b/allennlp/data/dataset_readers/vision_reader.py deleted file mode 100644 index 150abb6eff1..00000000000 --- a/allennlp/data/dataset_readers/vision_reader.py +++ /dev/null @@ -1,324 +0,0 @@ -import glob -import logging -from os import PathLike -from typing import ( - Dict, - List, - Union, - Optional, - MutableMapping, - Set, - Tuple, - Iterator, - Iterable, -) -import os - -import torch -from torch import Tensor -from tqdm import tqdm -import torch.distributed as dist - -from allennlp.common import util -from allennlp.common.checks import check_for_gpu, ConfigurationError -from allennlp.common.lazy import Lazy -from allennlp.common.util import int_to_device -from allennlp.common.file_utils import TensorCache -from allennlp.data.dataset_readers.dataset_reader import DatasetReader -from allennlp.data.image_loader import ImageLoader -from allennlp.data.token_indexers import PretrainedTransformerIndexer -from allennlp.data.token_indexers import TokenIndexer -from allennlp.data.tokenizers import 
PretrainedTransformerTokenizer -from allennlp.data.tokenizers import Tokenizer -from allennlp.modules.vision.grid_embedder import GridEmbedder -from allennlp.modules.vision.region_detector import RegionDetector - -logger = logging.getLogger(__name__) - - -class VisionReader(DatasetReader): - """ - Base class for dataset readers for vision tasks. - - If you don't specify `image_loader`, `image_featurizer`, and `region_detector`, the reader - assumes it can get all featurized images from the cache. - - If you don't specify `feature_cache`, the reader will featurize all images using the - featurization components, and use an internal in-memory cache to catch duplicate - images. - - If you don't specify either of these things, the reader will not produce featurized images - at all. - - Parameters - ---------- - - image_dir: `str` - Path to directory containing image files. The structure of the directory doesn't matter. We - find images by finding filenames that match `*[image_id].jpg`. - image_loader : `ImageLoader`, optional - The image loading component. - image_featurizer: `Lazy[GridEmbedder]`, optional - The backbone image processor (like a ResNet), whose output will be passed to the region - detector for finding object boxes in the image. - region_detector: `Lazy[RegionDetector]`, optional - For pulling out regions of the image (both coordinates and features) that will be used by - downstream models. - tokenizer: `Tokenizer`, optional - The `Tokenizer` to use to tokenize the text. By default, this uses the tokenizer for - `"bert-base-uncased"`. - token_indexers: `Dict[str, TokenIndexer]`, optional - The `TokenIndexer` to use. By default, this uses the indexer for `"bert-base-uncased"`. - cuda_device: `Union[int, torch.device]`, optional - Either a torch device or a GPU number. This is the GPU we'll use to featurize the images. - max_instances: `int`, optional - For debugging, you can use this parameter to limit the number of instances the reader - returns. - image_processing_batch_size: `int` - The number of images to process at one time while featurizing. Default is 8. - write_to_cache: `bool`, optional (default = `True`) - Allows the reader to write to the cache. Disabling this is useful if you don't want - to accidentally overwrite a cache you already have, or if you don't have write - access to the cache you're using. 
- """ - - def __init__( - self, - image_dir: Optional[Union[str, PathLike]], - *, - image_loader: Optional[ImageLoader] = None, - image_featurizer: Optional[Lazy[GridEmbedder]] = None, - region_detector: Optional[Lazy[RegionDetector]] = None, - feature_cache_dir: Optional[Union[str, PathLike]] = None, - tokenizer: Optional[Tokenizer] = None, - token_indexers: Optional[Dict[str, TokenIndexer]] = None, - cuda_device: Optional[Union[int, torch.device]] = None, - max_instances: Optional[int] = None, - image_processing_batch_size: int = 8, - write_to_cache: bool = True, - ) -> None: - super().__init__( - max_instances=max_instances, - manual_distributed_sharding=True, - manual_multiprocess_sharding=True, - ) - - # tokenizers and indexers - if tokenizer is None: - tokenizer = PretrainedTransformerTokenizer("bert-base-uncased") - self._tokenizer = tokenizer - if token_indexers is None: - token_indexers = {"tokens": PretrainedTransformerIndexer("bert-base-uncased")} - self._token_indexers = token_indexers - - if not ((image_loader is None) == (image_featurizer is None) == (region_detector is None)): - raise ConfigurationError( - "Please either specify all of image_loader, image_featurizer, and region_detector, " - "or specify none of them if you don't want to featurize images." - ) - - # feature cache - self.feature_cache_dir = feature_cache_dir - self.coordinates_cache_dir = feature_cache_dir - if feature_cache_dir: - self.write_to_cache = write_to_cache - else: - # If we don't have a cache dir, we use a dict in memory as a cache, so we - # always write. - self.write_to_cache = True - self._feature_cache_instance: Optional[MutableMapping[str, Tensor]] = None - self._coordinates_cache_instance: Optional[MutableMapping[str, Tensor]] = None - - # image processors - self.image_loader = None - if image_loader and image_featurizer and region_detector: - if cuda_device is None: - if torch.cuda.device_count() > 0: - if util.is_distributed(): - cuda_device = dist.get_rank() % torch.cuda.device_count() - else: - cuda_device = 0 - else: - cuda_device = -1 - check_for_gpu(cuda_device) - self.cuda_device = int_to_device(cuda_device) - logger.info(f"Processing images on device {cuda_device}") - - # image loading and featurizing - self.image_loader = image_loader - self.image_loader.device = self.cuda_device - self._lazy_image_featurizer = image_featurizer - self._image_featurizer = None - self._lazy_region_detector = region_detector - self._region_detector = None - self.image_processing_batch_size = image_processing_batch_size - - self.produce_featurized_images = False - if self.feature_cache_dir and self.coordinates_cache_dir: - logger.info(f"Featurizing images with a cache at {self.feature_cache_dir}") - self.produce_featurized_images = True - if image_loader and image_featurizer and region_detector: - if self.produce_featurized_images: - logger.info("Falling back to a full image featurization pipeline") - else: - logger.info("Featurizing images with a full image featurization pipeline") - self.produce_featurized_images = True - - if self.produce_featurized_images: - if image_dir is None: - if image_loader and image_featurizer and region_detector: - raise ConfigurationError("We need an image_dir to featurize images.") - else: - raise ConfigurationError( - "We need an image_dir to use a cache of featurized images. Images won't be " - "read if they are cached, but we need the image_dir to determine the right " - "cache keys from the file names." 
- ) - - logger.info("Discovering images ...") - self.images = { - os.path.basename(filename): filename - for extension in {"png", "jpg"} - for filename in tqdm( - glob.iglob(os.path.join(image_dir, "**", f"*.{extension}"), recursive=True), - desc=f"Discovering {extension} images", - ) - } - logger.info("Done discovering images") - - @property - def image_featurizer(self) -> Optional[GridEmbedder]: - if self._image_featurizer is None: - if self._lazy_image_featurizer is None: - return None - self._image_featurizer = self._lazy_image_featurizer.construct().to(self.cuda_device) # type: ignore - self._image_featurizer.eval() # type: ignore[attr-defined] - return self._image_featurizer # type: ignore[return-value] - - @property - def region_detector(self) -> Optional[RegionDetector]: - if self._region_detector is None: - if self._lazy_region_detector is None: - return None - self._region_detector = self._lazy_region_detector.construct().to(self.cuda_device) # type: ignore - self._region_detector.eval() # type: ignore[attr-defined] - return self._region_detector # type: ignore[return-value] - - @property - def _feature_cache(self) -> MutableMapping[str, Tensor]: - if self._feature_cache_instance is None: - if self.feature_cache_dir is None: - self._feature_cache_instance = {} - else: - os.makedirs(self.feature_cache_dir, exist_ok=True) - self._feature_cache_instance = TensorCache( - os.path.join(self.feature_cache_dir, "features"), - read_only=not self.write_to_cache, - ) - - return self._feature_cache_instance - - @property - def _coordinates_cache(self) -> MutableMapping[str, Tensor]: - if self._coordinates_cache_instance is None: - if self.coordinates_cache_dir is None: - self._coordinates_cache_instance = {} - else: - os.makedirs(self.feature_cache_dir, exist_ok=True) # type: ignore - self._coordinates_cache_instance = TensorCache( - os.path.join(self.feature_cache_dir, "coordinates"), # type: ignore - read_only=not self.write_to_cache, - ) - - return self._coordinates_cache_instance - - def _process_image_paths( - self, image_paths: Iterable[str], *, use_cache: bool = True - ) -> Iterator[Tuple[Tensor, Tensor]]: - """ - Processes the given image paths and returns featurized images. - - This consumes image paths one at a time, featurizes them either by going to the cache, or - by running the featurization models, and yields tensors one at a time. It runs the - featurization pipeline in batches for performance. - - image_paths: `Iterable[str]` - the image paths to process - use_cache: `bool`, default = `True` - Usually the cache behavior is governed by the `write_to_cache` parameter given to - `__init__()`. But sometimes we want to override this behavior and turn off the - cache completely. This parameter lets you do that. This is useful for the - `Predictor`, so we can make predictions without having to touch a cache, - even if the model was trained with a cache. - """ - assert self.produce_featurized_images, ( - "For _process_image_paths() to work, we need either a feature cache, or an image loader, " - "an image featurizer, and a region detector." 
- ) - - batch: List[Union[str, Tuple[Tensor, Tensor]]] = [] - unprocessed_paths: Set[str] = set() - - def yield_batch(): - # process the images - paths = list(unprocessed_paths) - images, sizes = self.image_loader(paths) - with torch.no_grad(): - images = images.to(self.cuda_device) - sizes = sizes.to(self.cuda_device) - featurized_images = self.image_featurizer(images, sizes) - detector_results = self.region_detector(images, sizes, featurized_images) - features = detector_results.features - coordinates = detector_results.boxes - - # store the processed results in memory, so we can complete the batch - paths_to_tensors = {path: (features[i], coordinates[i]) for i, path in enumerate(paths)} - - # store the processed results in the cache - if use_cache and self.write_to_cache: - for path, (features, coordinates) in paths_to_tensors.items(): - basename = os.path.basename(path) - self._feature_cache[basename] = features - self._coordinates_cache[basename] = coordinates - - # yield the batch - for b in batch: - if isinstance(b, str): - yield paths_to_tensors[b] - else: - yield b - - for image_path in image_paths: - basename = os.path.basename(image_path) - try: - if use_cache: - features: Tensor = self._feature_cache[basename] - coordinates: Tensor = self._coordinates_cache[basename] - if len(batch) <= 0: - yield features, coordinates - else: - batch.append((features, coordinates)) - else: - # If we're not using the cache, we pretend we had a cache miss here. - raise KeyError - except KeyError: - if not (self.image_loader and self.region_detector and self.image_featurizer): - if use_cache: - raise KeyError( - f"Could not find {basename} in the feature cache, and " - "image featurizers are not defined." - ) - else: - raise KeyError( - "Reading the feature cache is disabled, and image featurizers " - "are not defined. I can't process anything." - ) - batch.append(image_path) - unprocessed_paths.add(image_path) - if len(unprocessed_paths) >= self.image_processing_batch_size: - yield from yield_batch() - batch = [] - unprocessed_paths = set() - - if len(batch) > 0: - yield from yield_batch() diff --git a/allennlp/data/dataset_readers/visual_entailment.py b/allennlp/data/dataset_readers/visual_entailment.py deleted file mode 100644 index 3a8df5ec1b0..00000000000 --- a/allennlp/data/dataset_readers/visual_entailment.py +++ /dev/null @@ -1,111 +0,0 @@ -import logging - -from typing import ( - Dict, - List, - Union, - Optional, - Tuple, -) - -from overrides import overrides -import torch -from torch import Tensor - -from allennlp.data.dataset_readers.dataset_reader import DatasetReader -from allennlp.data.fields import Field, ArrayField, LabelField, TextField -from allennlp.data.instance import Instance -from allennlp.data.dataset_readers.vision_reader import VisionReader - -from allennlp.common.file_utils import json_lines_from_file - -logger = logging.getLogger(__name__) - - -@DatasetReader.register("visual-entailment") -class VisualEntailmentReader(VisionReader): - """ - The dataset reader for visual entailment. 
- """ - - @overrides - def _read(self, file_path: str): - split_prefix = "https://storage.googleapis.com/allennlp-public-data/snli-ve/" - splits = { - "dev": split_prefix + "snli_ve_dev.jsonl.gz", - "test": split_prefix + "snli_ve_test.jsonl.gz", - "train": split_prefix + "snli_ve_train.jsonl.gz", - } - file_path = splits.get(file_path, file_path) - lines = json_lines_from_file(file_path) - info_dicts: List[Dict] = list(self.shard_iterable(lines)) # type: ignore - - if self.produce_featurized_images: - # It would be much easier to just process one image at a time, but it's faster to process - # them in batches. So this code gathers up instances until it has enough to fill up a batch - # that needs processing, and then processes them all. - filenames = [info_dict["Flickr30K_ID"] + ".jpg" for info_dict in info_dicts] - - try: - processed_images = self._process_image_paths( - [self.images[filename] for filename in filenames] - ) - except KeyError as e: - missing_filename = e.args[0] - raise KeyError( - missing_filename, - f"We could not find an image with the name {missing_filename}. " - "Because of the size of the image datasets, we don't download them automatically. " - "Please download the images from" - "https://storage.googleapis.com/allennlp-public-data/snli-ve/flickr30k_images.tar.gz, " - "extract them into a directory, and set the image_dir parameter to point to that " - "directory. This dataset reader does not care about the exact directory structure. It " - "finds the images wherever they are.", - ) - else: - processed_images = [None for i in range(len(info_dicts))] # type: ignore - - for info_dict, processed_image in zip(info_dicts, processed_images): - hypothesis = info_dict["sentence2"] - answer = info_dict["gold_label"] - - instance = self.text_to_instance(processed_image, hypothesis, answer) - yield instance - - @overrides - def text_to_instance( - self, # type: ignore - image: Union[str, Tuple[Tensor, Tensor]], - hypothesis: str, - label: Optional[str] = None, - *, - use_cache: bool = True, - ) -> Instance: - - tokenized_hypothesis = self._tokenizer.tokenize(hypothesis) - hypothesis_field = TextField(tokenized_hypothesis, None) - - fields: Dict[str, Field] = {"hypothesis": hypothesis_field} - - if image is not None: - if isinstance(image, str): - features, coords = next(self._process_image_paths([image], use_cache=use_cache)) - else: - features, coords = image - - fields["box_features"] = ArrayField(features) - fields["box_coordinates"] = ArrayField(coords) - fields["box_mask"] = ArrayField( - features.new_ones((features.shape[0],), dtype=torch.bool), - padding_value=False, - dtype=torch.bool, - ) - - if label: - fields["labels"] = LabelField(label) - - return Instance(fields) - - @overrides - def apply_token_indexers(self, instance: Instance) -> None: - instance["hypothesis"].token_indexers = self._token_indexers # type: ignore diff --git a/allennlp/data/dataset_readers/vqav2.py b/allennlp/data/dataset_readers/vqav2.py deleted file mode 100644 index 080773adfc6..00000000000 --- a/allennlp/data/dataset_readers/vqav2.py +++ /dev/null @@ -1,559 +0,0 @@ -import logging -from collections import Counter -from functools import lru_cache -from os import PathLike -from typing import ( - Dict, - List, - Union, - Optional, - MutableMapping, - NamedTuple, - Tuple, - Iterable, -) -import json -import re - -from overrides import overrides -import torch -from torch import Tensor - -from allennlp.common.lazy import Lazy -from allennlp.common.file_utils import cached_path, LocalCacheResource 
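# (Readability note, not part of the original file: cached_path resolves and extracts the
# question/annotation archives (and a saved answer vocabulary, if given), while LocalCacheResource
# caches the preprocessed per-question answer counts in _get_answers_by_question_id() so they are
# not recomputed on every read().)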
-from allennlp.data.vocabulary import Vocabulary -from allennlp.data.dataset_readers.dataset_reader import DatasetReader -from allennlp.data.fields import Field, ArrayField, LabelField, ListField, TextField -from allennlp.data.image_loader import ImageLoader -from allennlp.data.instance import Instance -from allennlp.data.token_indexers import TokenIndexer -from allennlp.data.tokenizers import Tokenizer -from allennlp.modules.vision.grid_embedder import GridEmbedder -from allennlp.modules.vision.region_detector import RegionDetector -from allennlp.data.dataset_readers.vision_reader import VisionReader - -logger = logging.getLogger(__name__) - -contractions = { - "aint": "ain't", - "arent": "aren't", - "cant": "can't", - "couldve": "could've", - "couldnt": "couldn't", - "couldn'tve": "couldn't've", - "couldnt've": "couldn't've", - "didnt": "didn't", - "doesnt": "doesn't", - "dont": "don't", - "hadnt": "hadn't", - "hadnt've": "hadn't've", - "hadn'tve": "hadn't've", - "hasnt": "hasn't", - "havent": "haven't", - "hed": "he'd", - "hed've": "he'd've", - "he'dve": "he'd've", - "hes": "he's", - "howd": "how'd", - "howll": "how'll", - "hows": "how's", - "Id've": "I'd've", - "I'dve": "I'd've", - "Im": "I'm", - "Ive": "I've", - "isnt": "isn't", - "itd": "it'd", - "itd've": "it'd've", - "it'dve": "it'd've", - "itll": "it'll", - "let's": "let's", - "maam": "ma'am", - "mightnt": "mightn't", - "mightnt've": "mightn't've", - "mightn'tve": "mightn't've", - "mightve": "might've", - "mustnt": "mustn't", - "mustve": "must've", - "neednt": "needn't", - "notve": "not've", - "oclock": "o'clock", - "oughtnt": "oughtn't", - "ow's'at": "'ow's'at", - "'ows'at": "'ow's'at", - "'ow'sat": "'ow's'at", - "shant": "shan't", - "shed've": "she'd've", - "she'dve": "she'd've", - "she's": "she's", - "shouldve": "should've", - "shouldnt": "shouldn't", - "shouldnt've": "shouldn't've", - "shouldn'tve": "shouldn't've", - "somebody'd": "somebodyd", - "somebodyd've": "somebody'd've", - "somebody'dve": "somebody'd've", - "somebodyll": "somebody'll", - "somebodys": "somebody's", - "someoned": "someone'd", - "someoned've": "someone'd've", - "someone'dve": "someone'd've", - "someonell": "someone'll", - "someones": "someone's", - "somethingd": "something'd", - "somethingd've": "something'd've", - "something'dve": "something'd've", - "somethingll": "something'll", - "thats": "that's", - "thered": "there'd", - "thered've": "there'd've", - "there'dve": "there'd've", - "therere": "there're", - "theres": "there's", - "theyd": "they'd", - "theyd've": "they'd've", - "they'dve": "they'd've", - "theyll": "they'll", - "theyre": "they're", - "theyve": "they've", - "twas": "'twas", - "wasnt": "wasn't", - "wed've": "we'd've", - "we'dve": "we'd've", - "weve": "we've", - "werent": "weren't", - "whatll": "what'll", - "whatre": "what're", - "whats": "what's", - "whatve": "what've", - "whens": "when's", - "whered": "where'd", - "wheres": "where's", - "whereve": "where've", - "whod": "who'd", - "whod've": "who'd've", - "who'dve": "who'd've", - "wholl": "who'll", - "whos": "who's", - "whove": "who've", - "whyll": "why'll", - "whyre": "why're", - "whys": "why's", - "wont": "won't", - "wouldve": "would've", - "wouldnt": "wouldn't", - "wouldnt've": "wouldn't've", - "wouldn'tve": "wouldn't've", - "yall": "y'all", - "yall'll": "y'all'll", - "y'allll": "y'all'll", - "yall'd've": "y'all'd've", - "y'alld've": "y'all'd've", - "y'all'dve": "y'all'd've", - "youd": "you'd", - "youd've": "you'd've", - "you'dve": "you'd've", - "youll": "you'll", - "youre": "you're", - 
"youve": "you've", -} -manual_map = { - "none": "0", - "zero": "0", - "one": "1", - "two": "2", - "three": "3", - "four": "4", - "five": "5", - "six": "6", - "seven": "7", - "eight": "8", - "nine": "9", - "ten": "10", -} -articles = ["a", "an", "the"] -period_strip = re.compile(r"(?!<=\d)(\.)(?!\d)") -comma_strip = re.compile(r"(\d)(\,)(\d)") -punct = [ - ";", - r"/", - "[", - "]", - '"', - "{", - "}", - "(", - ")", - "=", - "+", - "\\", - "_", - "-", - ">", - "<", - "@", - "`", - ",", - "?", - "!", -] - - -def process_punctuation(inText: str) -> str: - outText = inText - for p in punct: - if (p + " " in inText or " " + p in inText) or (comma_strip.search(inText) is not None): - outText = outText.replace(p, "") - else: - outText = outText.replace(p, " ") - outText = period_strip.sub("", outText, re.UNICODE) - return outText - - -def process_digit_article(input: str) -> str: - output = [] - for word in input.lower().split(): - word = manual_map.get(word, word) - if word not in articles: - output.append(word) - else: - pass - for index, word in enumerate(output): - if word in contractions: - output[index] = contractions[word] - return " ".join(output) - - -@lru_cache(maxsize=None) -def preprocess_answer(answer: str) -> str: - answer = process_digit_article(process_punctuation(answer)) - answer = answer.replace(",", "") - return answer - - -def get_score(count: int) -> float: - return min(1.0, count / 3) - - -@DatasetReader.register("vqav2") -class VQAv2Reader(VisionReader): - """ - Parameters - ---------- - image_dir: `str` - Path to directory containing `png` image files. - image_loader: `ImageLoader` - The image loader component used to load the images. - image_featurizer: `Lazy[GridEmbedder]` - The backbone image processor (like a ResNet), whose output will be passed to the region - detector for finding object boxes in the image. - region_detector: `Lazy[RegionDetector]` - For pulling out regions of the image (both coordinates and features) that will be used by - downstream models. - answer_vocab: `Union[Vocabulary, str]`, optional - The vocabulary to use for answers. The reader will look into the `"answers"` namespace - in the vocabulary to find possible answers. - If this is given, the reader only outputs instances with answers contained in this vocab. - If this is not given, the reader outputs all instances with all answers. - If this is a URL or filename, we will download a previously saved vocabulary from there. - feature_cache_dir: `Union[str, PathLike]`, optional - An optional directory to cache the featurized images in. Featurizing images takes a long - time, and many images are duplicated, so we highly recommend to use this cache. - tokenizer: `Tokenizer`, optional - The `Tokenizer` to use to tokenize the text. By default, this uses the tokenizer for - `"bert-base-uncased"`. - token_indexers: `Dict[str, TokenIndexer]`, optional - The `TokenIndexer` to use. By default, this uses the indexer for `"bert-base-uncased"`. - cuda_device: `Union[int, torch.device]`, optional - Either a torch device or a GPU number. This is the GPU we'll use to featurize the images. - max_instances: `int`, optional - For debugging, you can use this parameter to limit the number of instances the reader - returns. - image_processing_batch_size: `int` - The number of images to process at one time while featurizing. Default is 8. - multiple_answers_per_question: `bool` - VQA questions have multiple answers. By default, we use all of them, and give more - points to the more common answer. 
But VQA also has a special answer, the so-called - "multiple choice answer". If this is set to `False`, we only use that answer. - """ - - def __init__( - self, - image_dir: Optional[Union[str, PathLike]] = None, - *, - image_loader: Optional[ImageLoader] = None, - image_featurizer: Optional[Lazy[GridEmbedder]] = None, - region_detector: Optional[Lazy[RegionDetector]] = None, - answer_vocab: Optional[Union[Vocabulary, str]] = None, - feature_cache_dir: Optional[Union[str, PathLike]] = None, - tokenizer: Optional[Tokenizer] = None, - token_indexers: Optional[Dict[str, TokenIndexer]] = None, - cuda_device: Optional[Union[int, torch.device]] = None, - max_instances: Optional[int] = None, - image_processing_batch_size: int = 8, - multiple_answers_per_question: bool = True, - write_to_cache: bool = True, - ) -> None: - run_featurization = image_loader and image_featurizer and region_detector - if image_dir is None and run_featurization: - raise ValueError( - "Because of the size of the image datasets, we don't download them automatically. " - "Please go to https://visualqa.org/download.html, download the datasets you need, " - "and set the image_dir parameter to point to your download location. This dataset " - "reader does not care about the exact directory structure. It finds the images " - "wherever they are." - ) - - super().__init__( - image_dir, - image_loader=image_loader, - image_featurizer=image_featurizer, - region_detector=region_detector, - feature_cache_dir=feature_cache_dir, - tokenizer=tokenizer, - token_indexers=token_indexers, - cuda_device=cuda_device, - max_instances=max_instances, - image_processing_batch_size=image_processing_batch_size, - write_to_cache=write_to_cache, - ) - - # read answer vocab - if answer_vocab is None: - self.answer_vocab = None - else: - if isinstance(answer_vocab, str): - answer_vocab = cached_path(answer_vocab, extract_archive=True) - answer_vocab = Vocabulary.from_files(answer_vocab) - self.answer_vocab = frozenset( - preprocess_answer(a) - for a in answer_vocab.get_token_to_index_vocabulary("answers").keys() - ) - - if self.produce_featurized_images: - # normalize self.images some more - # At this point, self.images maps filenames to full paths, but we want to map image ids to full paths. 
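            # For instance (hypothetical filename, shown only to illustrate the regex below): a
            # COCO-style name such as "COCO_train2014_000000458752.jpg" ends in a zero-padded
            # 12-digit image id, so id_from_filename() maps it to the integer 458752.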
- filename_re = re.compile(r".*(\d{12})\.((jpg)|(png))") - - def id_from_filename(filename: str) -> Optional[int]: - match = filename_re.fullmatch(filename) - if match is None: - return None - return int(match.group(1)) - - self.images = { - id_from_filename(name): full_path for name, full_path in self.images.items() - } - if None in self.images: - del self.images[None] - - self.multiple_answers_per_question = multiple_answers_per_question - - @overrides - def _read(self, splits_or_list_of_splits: Union[str, List[str]]): - # if we are given a list of splits, concatenate them - if isinstance(splits_or_list_of_splits, str): - split_name = splits_or_list_of_splits - else: - for split_name in splits_or_list_of_splits: - yield from self._read(split_name) - return - - # if the splits are using slicing syntax, honor it - slice_match = re.match(r"(.*)\[([0123456789:]*)]", split_name) - if slice_match is None: - question_slice = slice(None, None, None) - else: - split_name = slice_match[1] - slice_args = [int(a) if len(a) > 0 else None for a in slice_match[2].split(":")] - question_slice = slice(*slice_args) - - class Split(NamedTuple): - annotations: Optional[str] - questions: str - - aws_base = "https://s3.amazonaws.com/cvmlp/vqa/" - mscoco_base = aws_base + "mscoco/vqa/" - scene_base = aws_base + "abstract_v002/vqa/" - - # fmt: off - splits = { - "balanced_real_train": Split( - mscoco_base + "v2_Annotations_Train_mscoco.zip!v2_mscoco_train2014_annotations.json", # noqa: E501 - mscoco_base + "v2_Questions_Train_mscoco.zip!v2_OpenEnded_mscoco_train2014_questions.json", # noqa: E501 - ), - "balanced_real_val": Split( - mscoco_base + "v2_Annotations_Val_mscoco.zip!v2_mscoco_val2014_annotations.json", # noqa: E501 - mscoco_base + "v2_Questions_Val_mscoco.zip!v2_OpenEnded_mscoco_val2014_questions.json", # noqa: E501 - ), - "balanced_real_test": Split( - None, - mscoco_base + "v2_Questions_Test_mscoco.zip!v2_OpenEnded_mscoco_test2015_questions.json", # noqa: E501 - ), - "balanced_bas_train": Split( # "bas" is Binary Abstract Scenes - scene_base + "Annotations_Binary_Train2017_abstract_v002.zip!abstract_v002_train2017_annotations.json", # noqa: E501 - scene_base + "Questions_Binary_Train2017_abstract_v002.zip!OpenEnded_abstract_v002_train2017_questions.json", # noqa: E501 - ), - "balanced_bas_val": Split( - scene_base + "Annotations_Binary_Val2017_abstract_v002.zip!abstract_v002_val2017_annotations.json", # noqa: E501 - scene_base + "Questions_Binary_Val2017_abstract_v002.zip!OpenEnded_abstract_v002_val2017_questions.json", # noqa: E501 - ), - "abstract_scenes_train": Split( - scene_base + "Annotations_Train_abstract_v002.zip!abstract_v002_train2015_annotations.json", # noqa: E501 - scene_base + "Questions_Train_abstract_v002.zip!OpenEnded_abstract_v002_train2015_questions.json", # noqa: E501 - ), - "abstract_scenes_val": Split( - scene_base + "Annotations_Val_abstract_v002.zip!abstract_v002_val2015_annotations.json", # noqa: E501 - scene_base + "Questions_Val_abstract_v002.zip!OpenEnded_abstract_v002_val2015_questions.json", # noqa: E501 - ), - "abstract_scenes_test": Split( - None, - scene_base + "Questions_Test_abstract_v002.zip!OpenEnded_abstract_v002_test2015_questions.json", # noqa: E501 - ), - "unittest": Split( - "test_fixtures/data/vqav2/annotations.json", - "test_fixtures/data/vqav2/questions.json" - ) - } - # fmt: on - - try: - split = splits[split_name] - except KeyError: - raise ValueError(f"Unrecognized split: {split_name}.") - - answers_by_question_id = 
self._get_answers_by_question_id(split) - - questions = [] - with open(cached_path(split.questions, extract_archive=True)) as f: - questions_file = json.load(f) - for ques in questions_file["questions"]: - questions.append(ques) - questions = questions[question_slice] - - question_dicts = list(self.shard_iterable(questions)) - processed_images: Iterable[Optional[Tuple[Tensor, Tensor]]] - if self.produce_featurized_images: - # It would be much easier to just process one image at a time, but it's faster to process - # them in batches. So this code gathers up instances until it has enough to fill up a batch - # that needs processing, and then processes them all. - - try: - image_paths = [ - self.images[int(question_dict["image_id"])] for question_dict in question_dicts - ] - except KeyError as e: - missing_id = e.args[0] - raise KeyError( - missing_id, - f"We could not find an image with the id {missing_id}. " - "Because of the size of the image datasets, we don't download them automatically. " - "Please go to https://visualqa.org/download.html, download the datasets you need, " - "and set the image_dir parameter to point to your download location. This dataset " - "reader does not care about the exact directory structure. It finds the images " - "wherever they are.", - ) - - processed_images = self._process_image_paths(image_paths) - else: - processed_images = [None for _ in range(len(question_dicts))] - - attempted_instances_count = 0 - failed_instances_count = 0 - for question_dict, processed_image in zip(question_dicts, processed_images): - answers = answers_by_question_id.get(str(question_dict["question_id"])) - instance = self.text_to_instance(question_dict["question"], processed_image, answers) - attempted_instances_count += 1 - if instance is None: - failed_instances_count += 1 - else: - yield instance - - if attempted_instances_count % 2000 == 0: - failed_instances_fraction = failed_instances_count / attempted_instances_count - if failed_instances_fraction > 0.1: - logger.warning( - f"{failed_instances_fraction*100:.0f}% of instances have no answers." 
- ) - - @overrides - def text_to_instance( - self, # type: ignore - question: str, - image: Union[str, Tuple[Tensor, Tensor]], - answer_counts: Optional[MutableMapping[str, int]] = None, - *, - use_cache: bool = True, - ) -> Optional[Instance]: - tokenized_question = self._tokenizer.tokenize(question) - question_field = TextField(tokenized_question, None) - - fields: Dict[str, Field] = { - "question": question_field, - } - - if image is not None: - if isinstance(image, str): - features, coords = next(self._process_image_paths([image], use_cache=use_cache)) - else: - features, coords = image - - fields["box_features"] = ArrayField(features) - fields["box_coordinates"] = ArrayField(coords) - fields["box_mask"] = ArrayField( - features.new_ones((features.shape[0],), dtype=torch.bool), - padding_value=False, - dtype=torch.bool, - ) - - if answer_counts is not None: - answer_fields = [] - weights = [] - - for answer, count in answer_counts.items(): - if self.answer_vocab is None or answer in self.answer_vocab: - answer_fields.append(LabelField(answer, label_namespace="answers")) - weights.append(get_score(count)) - - if len(answer_fields) <= 0: - return None - - fields["labels"] = ListField(answer_fields) - fields["label_weights"] = ArrayField(torch.tensor(weights)) - - return Instance(fields) - - @overrides - def apply_token_indexers(self, instance: Instance) -> None: - instance["question"].token_indexers = self._token_indexers # type: ignore - - def _get_answers_by_question_id(self, split): - answers_by_question_id = {} - if split.annotations is not None: - # Pre-processing the annotations is time-consuming, so we don't want to - # have to re-do it each time we call read(). So we cache this result. - annotations_path = cached_path(split.annotations, extract_archive=True) - with LocalCacheResource(split.annotations + "-cache", annotations_path) as cache: - if cache.cached(): - logger.info( - "Reading annotation answer counts from cache at %s", - cache.path, - ) - with cache.reader() as f: - answers_by_question_id = json.load(f) - else: - logger.info("Calculating annotation answer counts...") - with open(annotations_path) as f: - annotations = json.load(f) - for a in annotations["annotations"]: - qid = a["question_id"] - answer_counts: MutableMapping[str, int] = Counter() - if self.multiple_answers_per_question: - for answer in (answer_dict["answer"] for answer_dict in a["answers"]): - answer_counts[preprocess_answer(answer)] += 1 - else: - answer_counts[preprocess_answer(a["multiple_choice_answer"])] = 1 - answers_by_question_id[str(qid)] = answer_counts - logger.info("Caching annotation answer counts to %s", cache.path) - with cache.writer() as f: - json.dump(answers_by_question_id, f) - return answers_by_question_id diff --git a/allennlp/models/__init__.py b/allennlp/models/__init__.py index 87424122803..af51339ba31 100644 --- a/allennlp/models/__init__.py +++ b/allennlp/models/__init__.py @@ -8,5 +8,3 @@ from allennlp.models.basic_classifier import BasicClassifier from allennlp.models.multitask import MultiTaskModel from allennlp.models.simple_tagger import SimpleTagger -from allennlp.models.vilbert_vqa import VqaVilbert -from allennlp.models.visual_entailment import VisualEntailmentModel diff --git a/allennlp/models/heads/__init__.py b/allennlp/models/heads/__init__.py index 1f9691421f1..0108faf262f 100644 --- a/allennlp/models/heads/__init__.py +++ b/allennlp/models/heads/__init__.py @@ -1,4 +1,2 @@ from allennlp.models.heads.head import Head from allennlp.models.heads.classifier_head 
import ClassifierHead -from allennlp.models.heads.vqa_head import VqaHead -from allennlp.models.heads.visual_entailment_head import VisualEntailmentHead diff --git a/allennlp/models/heads/visual_entailment_head.py b/allennlp/models/heads/visual_entailment_head.py deleted file mode 100644 index d61db34ba5d..00000000000 --- a/allennlp/models/heads/visual_entailment_head.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Dict, Optional - -import torch -from overrides import overrides - -from allennlp.data.vocabulary import Vocabulary -from allennlp.models.heads.head import Head - - -@Head.register("visual_entailment") -class VisualEntailmentHead(Head): - def __init__(self, vocab: Vocabulary, embedding_dim: int, label_namespace: str = "labels"): - super().__init__(vocab) - - num_labels = vocab.get_vocab_size(label_namespace) - self.label_namespace = label_namespace - self.classifier = torch.nn.Linear(embedding_dim, num_labels) - - from allennlp.training.metrics import CategoricalAccuracy - from allennlp.training.metrics import FBetaMeasure - - self.accuracy = CategoricalAccuracy() - self.fbeta = FBetaMeasure(beta=1.0, average="macro") - - @overrides - def forward( - self, # type: ignore - encoded_boxes: torch.Tensor, - encoded_boxes_mask: torch.Tensor, - encoded_boxes_pooled: torch.Tensor, - encoded_text: torch.Tensor, - encoded_text_mask: torch.Tensor, - encoded_text_pooled: torch.Tensor, - pooled_boxes_and_text: torch.Tensor, - labels: Optional[torch.Tensor] = None, - label_weights: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: - logits = self.classifier(pooled_boxes_and_text) - probs = torch.softmax(logits, dim=-1) - - output = {"logits": logits, "probs": probs} - - assert label_weights is None - if labels is not None: - output["loss"] = torch.nn.functional.cross_entropy(logits, labels) / logits.size(0) - self.accuracy(logits, labels) - self.fbeta(probs, labels) - - return output - - @overrides - def get_metrics(self, reset: bool = False) -> Dict[str, float]: - result = self.fbeta.get_metric(reset) - result["acc"] = self.accuracy.get_metric(reset) - return result diff --git a/allennlp/models/heads/vqa_head.py b/allennlp/models/heads/vqa_head.py deleted file mode 100644 index 749860b7cd5..00000000000 --- a/allennlp/models/heads/vqa_head.py +++ /dev/null @@ -1,78 +0,0 @@ -from typing import Dict, Optional - -import torch -from overrides import overrides - -from allennlp.data.vocabulary import Vocabulary -from allennlp.models.heads.head import Head - - -@Head.register("vqa") -class VqaHead(Head): - def __init__(self, vocab: Vocabulary, embedding_dim: int, label_namespace: str = "answers"): - super().__init__(vocab) - - num_labels = vocab.get_vocab_size(label_namespace) - self.label_namespace = label_namespace - self.classifier = torch.nn.Linear(embedding_dim, num_labels) - - from allennlp.training.metrics import F1MultiLabelMeasure - from allennlp.training.metrics.vqa import VqaMeasure - - self.f1_metric = F1MultiLabelMeasure(average="micro") - self.vqa_metric = VqaMeasure() - - @overrides - def forward( - self, # type: ignore - encoded_boxes: torch.Tensor, - encoded_boxes_mask: torch.Tensor, - encoded_boxes_pooled: torch.Tensor, - encoded_text: torch.Tensor, - encoded_text_mask: torch.Tensor, - encoded_text_pooled: torch.Tensor, - pooled_boxes_and_text: torch.Tensor, - labels: Optional[torch.Tensor] = None, - label_weights: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: - logits = self.classifier(pooled_boxes_and_text) - - output = { - "logits": logits, - 
"probs": torch.sigmoid(logits), - } - - if labels is not None and label_weights is not None: - label_mask = labels > 1 # 0 is padding, 1 is OOV, which we want to ignore - - from allennlp.nn import util - - weighted_labels = util.masked_index_replace( - logits.new_zeros(logits.size() + (1,)), - labels.clamp(min=0), - label_mask, - label_weights.unsqueeze(-1), - ).squeeze(-1) - - # weighted_labels now has shape (batch_size, num_labels). We need to ignore the first - # two columns of this in our loss function and accuracy metric. The first column is a - # padding label, and the second column is an OOV label. We want the loss function to - # be computed on every other label. - binary_label_mask = weighted_labels.new_ones(logits.size()) - binary_label_mask[:, 0] = 0 - binary_label_mask[:, 1] = 0 - - output["loss"] = torch.nn.functional.binary_cross_entropy_with_logits( - logits, weighted_labels, weight=binary_label_mask, reduction="sum" - ) / logits.size(0) - - self.f1_metric(logits, weighted_labels, binary_label_mask.bool()) - self.vqa_metric(logits, labels, label_weights) - - return output - - @overrides - def get_metrics(self, reset: bool = False) -> Dict[str, float]: - result = self.f1_metric.get_metric(reset) - result["vqa"] = self.vqa_metric.get_metric(reset)["score"] - return result diff --git a/allennlp/models/vilbert_vqa.py b/allennlp/models/vilbert_vqa.py deleted file mode 100644 index 7f602f24e34..00000000000 --- a/allennlp/models/vilbert_vqa.py +++ /dev/null @@ -1,152 +0,0 @@ -import logging -from typing import Dict, Optional - -from overrides import overrides -import torch - -from allennlp.data import TextFieldTensors, Vocabulary -from allennlp.models.model import Model -from allennlp.modules.transformer import ( - TransformerEmbeddings, - ImageFeatureEmbeddings, - BiModalEncoder, -) -from allennlp.nn import util - -from allennlp.models.vision_text_model import VisionTextModel - - -logger = logging.getLogger(__name__) - - -@Model.register("vqa_vilbert") -@Model.register("vqa_vilbert_from_huggingface", constructor="from_huggingface_model_name") -class VqaVilbert(VisionTextModel): - """ - Model for VQA task based on the VilBERT paper. 
- - # Parameters - - vocab : `Vocabulary` - text_embeddings : `TransformerEmbeddings` - image_embeddings : `ImageFeatureEmbeddings` - encoder : `BiModalEncoder` - pooled_output_dim : `int` - fusion_method : `str`, optional (default = `"sum"`) - dropout : `float`, optional (default = `0.1`) - label_namespace : `str`, optional (default = `answers`) - """ - - def __init__( - self, - vocab: Vocabulary, - text_embeddings: TransformerEmbeddings, - image_embeddings: ImageFeatureEmbeddings, - encoder: BiModalEncoder, - pooled_output_dim: int, - fusion_method: str = "sum", - dropout: float = 0.1, - label_namespace: str = "answers", - *, - ignore_text: bool = False, - ignore_image: bool = False - ) -> None: - super().__init__( - vocab, - text_embeddings, - image_embeddings, - encoder, - pooled_output_dim, - fusion_method, - dropout, - label_namespace, - is_multilabel=True, - ignore_text=ignore_text, - ignore_image=ignore_image, - ) - - from allennlp.training.metrics import F1MultiLabelMeasure - from allennlp.training.metrics.vqa import VqaMeasure - - self.f1_metric = F1MultiLabelMeasure(average="micro") - self.vqa_metric = VqaMeasure() - - @overrides - def forward( - self, # type: ignore - box_features: torch.Tensor, - box_coordinates: torch.Tensor, - box_mask: torch.Tensor, - question: TextFieldTensors, - labels: Optional[torch.Tensor] = None, - label_weights: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: - - return super().forward( - box_features, - box_coordinates, - box_mask, - text=question, - labels=labels, - label_weights=label_weights, - ) - - @overrides - def _compute_loss_and_metrics( - self, - batch_size: int, - outputs: torch.Tensor, - label: torch.Tensor, - label_weights: Optional[torch.Tensor] = None, - ): - if label is not None and label_weights is not None: - logits = outputs["logits"] - label_mask = label > 1 # 0 is padding, 1 is OOV, which we want to ignore - - weighted_labels = util.masked_index_replace( - logits.new_zeros(logits.size() + (1,)), - label.clamp(min=0), - label_mask, - label_weights.unsqueeze(-1), - ).squeeze(-1) - - # weighted_labels now has shape (batch_size, num_labels). We need to ignore the first - # two columns of this in our loss function and accuracy metric. The first column is a - # padding label, and the second column is an OOV label. We want the loss function to - # be computed on every other label. 
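            # Small worked example (illustrative only): with num_labels = 5, every row of
            # binary_label_mask below is [0, 0, 1, 1, 1], so the padding and OOV columns contribute
            # nothing to the binary cross-entropy, while the remaining columns are scored against
            # the soft targets held in weighted_labels.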
- binary_label_mask = weighted_labels.new_ones(logits.size()) - binary_label_mask[:, 0] = 0 - binary_label_mask[:, 1] = 0 - - outputs["loss"] = ( - torch.nn.functional.binary_cross_entropy_with_logits( - logits, weighted_labels, weight=binary_label_mask, reduction="sum" - ) - / batch_size - ) - - self.f1_metric(logits, weighted_labels, binary_label_mask.bool()) - self.vqa_metric(logits, label, label_weights) - - return outputs - - @overrides - def get_metrics(self, reset: bool = False) -> Dict[str, float]: - result = self.f1_metric.get_metric(reset) - result["vqa_score"] = self.vqa_metric.get_metric(reset)["score"] - return result - - @overrides - def make_output_human_readable( - self, output_dict: Dict[str, torch.Tensor] - ) -> Dict[str, torch.Tensor]: - batch_tokens = [] - for batch_index, batch in enumerate(output_dict["probs"]): - tokens = {} - for i, prob in enumerate(batch): - tokens[self.vocab.get_token_from_index(i, self.label_namespace)] = float(prob) - batch_tokens.append(tokens) - output_dict["tokens"] = batch_tokens - return output_dict - - default_predictor = "vilbert_vqa" diff --git a/allennlp/models/vision_text_model.py b/allennlp/models/vision_text_model.py deleted file mode 100644 index 063a176ab7c..00000000000 --- a/allennlp/models/vision_text_model.py +++ /dev/null @@ -1,258 +0,0 @@ -import logging -from copy import deepcopy -from typing import Dict, List, Optional - -from overrides import overrides -import numpy as np -import torch -from transformers import AutoModel - -from allennlp.data.fields.text_field import TextFieldTensors -from allennlp.data.vocabulary import Vocabulary -from allennlp.models.model import Model -from allennlp.modules.transformer import ( - TransformerEmbeddings, - ImageFeatureEmbeddings, - BiModalEncoder, -) - -logger = logging.getLogger(__name__) - - -@Model.register("vision_model") -class VisionTextModel(Model): - """ - `VisionTextModel` takes as input a single text input and a single image input - to produce some output. Example tasks include visual question-answering, visual - entailment, etc. - - # Parameters - - vocab : `Vocabulary` - text_embeddings : `TransformerEmbeddings` - image_embeddings : `ImageFeatureEmbeddings` - encoder : `BiModalEncoder` - pooled_output_dim : `int` - fusion_method : `str`, optional (default = `"sum"`) - dropout : `float`, optional (default = `0.1`) - label_namespace : `str`, optional (default = `"labels"`) - is_multilabel: `bool`, optional (default = `False`) - Whether the output classification is multilabel. 
- (i.e., can have multiple correct answers) - """ - - def __init__( - self, - vocab: Vocabulary, - text_embeddings: TransformerEmbeddings, - image_embeddings: ImageFeatureEmbeddings, - encoder: BiModalEncoder, - pooled_output_dim: int, - fusion_method: str = "sum", - dropout: float = 0.1, - label_namespace: str = "labels", - is_multilabel: bool = False, - *, - ignore_text: bool = False, - ignore_image: bool = False, - ) -> None: - super().__init__(vocab) - - from allennlp.modules.backbones import VilbertBackbone - - self.backbone = VilbertBackbone( - vocab, - text_embeddings, - image_embeddings, - encoder, - pooled_output_dim, - fusion_method, - dropout, - ) - - num_labels = vocab.get_vocab_size(label_namespace) - self.label_namespace = label_namespace - - self.classifier = torch.nn.Linear(pooled_output_dim, num_labels) - self.dropout = torch.nn.Dropout(dropout) - - self.is_multilabel = is_multilabel - self.ignore_text = ignore_text - self.ignore_images = ignore_image - - @classmethod - def from_huggingface_model_name( - cls, - vocab: Vocabulary, - model_name: str, - image_feature_dim: int, - image_num_hidden_layers: int, - image_hidden_size: int, - image_num_attention_heads: int, - combined_hidden_size: int, - combined_num_attention_heads: int, - pooled_output_dim: int, - image_intermediate_size: int, - image_attention_dropout: float, - image_hidden_dropout: float, - image_biattention_id: List[int], - text_biattention_id: List[int], - text_fixed_layer: int, - image_fixed_layer: int, - pooled_dropout: float = 0.1, - fusion_method: str = "sum", - *, - ignore_text: bool = False, - ignore_image: bool = False, - ): - transformer = AutoModel.from_pretrained(model_name) - - # Albert (and maybe others?) has this "embedding_size", that's different from "hidden_size". - # To get them to the same dimensionality, it uses a linear transform after the embedding - # layer, which we need to pull out and copy here. - if hasattr(transformer.config, "embedding_size"): - config = transformer.config - - text_embeddings = TransformerEmbeddings.from_pretrained_module( - transformer.embeddings, output_size=config.hidden_dim - ) - - from transformers.models.albert.modeling_albert import AlbertModel - - if isinstance(transformer, AlbertModel): - text_embeddings.linear_transform = deepcopy( - transformer.encoder.embedding_hidden_mapping_in - ) - else: - logger.warning( - "Unknown model that uses separate embedding size; weights of the linear " - f"transform will not be initialized. 
Model type is: {transformer.__class__}" - ) - else: - text_embeddings = TransformerEmbeddings.from_pretrained_module(transformer.embeddings) - - image_embeddings = ImageFeatureEmbeddings( - feature_size=image_feature_dim, - embedding_size=image_hidden_size, - dropout=image_hidden_dropout, - ) - - encoder = BiModalEncoder.from_pretrained_module( - pretrained_module=transformer, - num_hidden_layers2=image_num_hidden_layers, - hidden_size2=image_hidden_size, - num_attention_heads2=image_num_attention_heads, - combined_hidden_size=combined_hidden_size, - combined_num_attention_heads=combined_num_attention_heads, - intermediate_size2=image_intermediate_size, - attention_dropout2=image_attention_dropout, - hidden_dropout2=image_hidden_dropout, - biattention_id1=text_biattention_id, - biattention_id2=image_biattention_id, - fixed_layer1=text_fixed_layer, - fixed_layer2=image_fixed_layer, - ) - return cls( - vocab=vocab, - text_embeddings=text_embeddings, - image_embeddings=image_embeddings, - encoder=encoder, - pooled_output_dim=pooled_output_dim, - fusion_method=fusion_method, - dropout=pooled_dropout, - ignore_text=ignore_text, - ignore_image=ignore_image, - ) - - @overrides - def forward( - self, # type: ignore - box_features: torch.Tensor, - box_coordinates: torch.Tensor, - box_mask: torch.Tensor, - text: TextFieldTensors, - labels: Optional[torch.Tensor] = None, - label_weights: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: - """ - # Parameters - - box_features : `Tensor` - Shape: `(batch_size, num_boxes, feature_size)` - - box_coordinates : `Tensor` - Shape: `(batch_size, num_boxes, 4)` - - box_mask : `Tensor` - A bool and 0-1 tensor of shape `(batch_size, num_boxes)`. - - text : `TextFieldTensors` - - label : `Optional[Tensor]` - - label_weights : `Optional[Tensor]` - - """ - - batch_size = box_features.size(0) - - if self.ignore_images: - box_features = torch.zeros_like(box_features) - box_coordinates = torch.zeros_like(box_coordinates) - box_coordinates[..., 2] = 1 - box_coordinates[..., 3] = 1 - box_mask = torch.ones_like(box_mask) - - if self.ignore_text: - dummy_text = {} - for embedder_name, tensor_dict in text.items(): - dummy_tensor_dict = {} - for tensor_name, tensor in tensor_dict.items(): - if "mask" in tensor_name: - tensor = torch.ones_like(tensor) - else: - tensor = torch.zeros_like(tensor) - dummy_tensor_dict[tensor_name] = tensor - dummy_text[embedder_name] = dummy_tensor_dict - text = dummy_text - - backbone_outputs = self.backbone(box_features, box_coordinates, box_mask, text) - - # Shape: (batch_size, num_labels) - logits = self.classifier(backbone_outputs["pooled_boxes_and_text"]) - - # Shape: (batch_size, num_labels) - if self.is_multilabel: - probs = torch.sigmoid(logits) - else: - probs = torch.softmax(logits, dim=-1) - - outputs = {"logits": logits, "probs": probs} - outputs = self._compute_loss_and_metrics(batch_size, outputs, labels, label_weights) - - return outputs - - def _compute_loss_and_metrics( - self, - batch_size: int, - outputs: torch.Tensor, - label: torch.Tensor, - label_weights: Optional[torch.Tensor] = None, - ): - return outputs - - @overrides - def get_metrics(self, reset: bool = False) -> Dict[str, float]: - result = self.accuracy.get_metric(reset) - return {"accuracy": result} - - @overrides - def make_output_human_readable( - self, output_dict: Dict[str, torch.Tensor] - ) -> Dict[str, torch.Tensor]: - batch_labels = [] - for batch_index, batch in enumerate(output_dict["probs"]): - labels = np.argmax(batch, axis=-1) - 
batch_labels.append(labels) - output_dict["labels"] = batch_labels - return output_dict diff --git a/allennlp/models/visual_entailment.py b/allennlp/models/visual_entailment.py deleted file mode 100644 index 5164de053ff..00000000000 --- a/allennlp/models/visual_entailment.py +++ /dev/null @@ -1,128 +0,0 @@ -import logging -from typing import Dict, Optional - -from overrides import overrides -import numpy as np -import torch - -from allennlp.data import TextFieldTensors, Vocabulary -from allennlp.models.model import Model -from allennlp.modules.transformer import ( - TransformerEmbeddings, - ImageFeatureEmbeddings, - BiModalEncoder, -) -from allennlp.training.metrics import CategoricalAccuracy -from allennlp.training.metrics import FBetaMeasure - - -from allennlp.models.vision_text_model import VisionTextModel - -logger = logging.getLogger(__name__) - - -@Model.register("ve_vilbert") -@Model.register("ve_vilbert_from_huggingface", constructor="from_huggingface_model_name") -class VisualEntailmentModel(VisionTextModel): - """ - Model for visual entailment task based on the paper - [Visual Entailment: A Novel Task for Fine-Grained Image Understanding] - (https://api.semanticscholar.org/CorpusID:58981654). - - # Parameters - - vocab : `Vocabulary` - text_embeddings : `TransformerEmbeddings` - image_embeddings : `ImageFeatureEmbeddings` - encoder : `BiModalEncoder` - pooled_output_dim : `int` - fusion_method : `str`, optional (default = `"sum"`) - dropout : `float`, optional (default = `0.1`) - label_namespace : `str`, optional (default = `labels`) - """ - - def __init__( - self, - vocab: Vocabulary, - text_embeddings: TransformerEmbeddings, - image_embeddings: ImageFeatureEmbeddings, - encoder: BiModalEncoder, - pooled_output_dim: int, - fusion_method: str = "sum", - dropout: float = 0.1, - label_namespace: str = "labels", - *, - ignore_text: bool = False, - ignore_image: bool = False, - ) -> None: - - super().__init__( - vocab, - text_embeddings, - image_embeddings, - encoder, - pooled_output_dim, - fusion_method, - dropout, - label_namespace, - is_multilabel=False, - ) - - self.accuracy = CategoricalAccuracy() - self.fbeta = FBetaMeasure(beta=1.0, average="macro") - - @overrides - def forward( - self, # type: ignore - box_features: torch.Tensor, - box_coordinates: torch.Tensor, - box_mask: torch.Tensor, - hypothesis: TextFieldTensors, - labels: Optional[torch.Tensor] = None, - ) -> Dict[str, torch.Tensor]: - - return super().forward( - box_features, - box_coordinates, - box_mask, - text=hypothesis, - labels=labels, - label_weights=None, - ) - - @overrides - def _compute_loss_and_metrics( - self, - batch_size: int, - outputs: torch.Tensor, - label: torch.Tensor, - label_weights: Optional[torch.Tensor] = None, - ): - assert label_weights is None - if label is not None: - outputs["loss"] = ( - torch.nn.functional.cross_entropy(outputs["logits"], label) / batch_size - ) - self.accuracy(outputs["logits"], label) - self.fbeta(outputs["probs"], label) - return outputs - - @overrides - def get_metrics(self, reset: bool = False) -> Dict[str, float]: - metrics = self.fbeta.get_metric(reset) - accuracy = self.accuracy.get_metric(reset) - metrics.update({"accuracy": accuracy}) - return metrics - - @overrides - def make_output_human_readable( - self, output_dict: Dict[str, torch.Tensor] - ) -> Dict[str, torch.Tensor]: - batch_labels = [] - for batch_index, batch in enumerate(output_dict["probs"]): - labels = np.argmax(batch, axis=-1) - batch_labels.append(labels) - output_dict["labels"] = batch_labels - 
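A rough, hypothetical sketch of the three-way classification path behind the visual entailment model deleted above (entailment / neutral / contradiction). The batch values and label ids are made up for illustration; this is not the removed implementation, which additionally tracks accuracy and macro-F1 and normalizes its summed loss by batch size.

import torch

logits = torch.randn(4, 3)            # (batch, {entailment, neutral, contradiction})
labels = torch.tensor([0, 2, 1, 0])   # gold class ids, illustrative only

loss = torch.nn.functional.cross_entropy(logits, labels)
probs = torch.softmax(logits, dim=-1)
predictions = probs.argmax(dim=-1)
accuracy = (predictions == labels).float().mean()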
return output_dict - - default_predictor = "vilbert_ve" diff --git a/allennlp/predictors/vilbert_vqa.py b/allennlp/predictors/vilbert_vqa.py deleted file mode 100644 index 0bba31c12e7..00000000000 --- a/allennlp/predictors/vilbert_vqa.py +++ /dev/null @@ -1,35 +0,0 @@ -from typing import List, Dict - -from overrides import overrides -import numpy - -from allennlp.common.file_utils import cached_path -from allennlp.common.util import JsonDict -from allennlp.data import Instance -from allennlp.data.dataset_readers.vqav2 import VQAv2Reader -from allennlp.predictors.predictor import Predictor - - -@Predictor.register("vilbert_vqa") -class VilbertVqaPredictor(Predictor): - def predict(self, image: str, sentence: str) -> JsonDict: - image = cached_path(image) - return self.predict_json({"question": sentence, "image": image}) - - @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Instance: - question = json_dict["question"] - image = cached_path(json_dict["image"]) - if isinstance(self._dataset_reader, VQAv2Reader): - return self._dataset_reader.text_to_instance(question, image, use_cache=False) - else: - raise ValueError( - f"Dataset reader is of type f{self._dataset_reader.__class__.__name__}. " - f"Expected {VQAv2Reader.__name__}." - ) - - @overrides - def predictions_to_labeled_instances( - self, instance: Instance, outputs: Dict[str, numpy.ndarray] - ) -> List[Instance]: - return [instance] # TODO diff --git a/allennlp/predictors/visual_entailment.py b/allennlp/predictors/visual_entailment.py deleted file mode 100644 index 329ab34688e..00000000000 --- a/allennlp/predictors/visual_entailment.py +++ /dev/null @@ -1,39 +0,0 @@ -from typing import List, Dict - -from overrides import overrides -import numpy - -from allennlp.common.file_utils import cached_path -from allennlp.common.util import JsonDict -from allennlp.data import Instance -from allennlp.data.dataset_readers.visual_entailment import VisualEntailmentReader -from allennlp.data.fields import LabelField -from allennlp.predictors.predictor import Predictor - - -@Predictor.register("vilbert_ve") -class VisualEntailmentPredictor(Predictor): - def predict(self, image: str, hypothesis: str) -> JsonDict: - image = cached_path(image) - return self.predict_json({"image": image, "hypothesis": hypothesis}) - - @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Instance: - image = cached_path(json_dict["image"]) - hypothesis = json_dict["hypothesis"] - if isinstance(self._dataset_reader, VisualEntailmentReader): - return self._dataset_reader.text_to_instance(image, hypothesis, use_cache=False) - else: - raise ValueError( - f"Dataset reader is of type f{self._dataset_reader.__class__.__name__}. " - f"Expected {VisualEntailmentReader.__name__}." 
- ) - - @overrides - def predictions_to_labeled_instances( - self, instance: Instance, outputs: Dict[str, numpy.ndarray] - ) -> List[Instance]: - new_instance = instance.duplicate() - label = numpy.argmax(outputs["probs"]) - new_instance.add_field("label", LabelField(int(label), skip_indexing=True)) - return [new_instance] diff --git a/allennlp/training/metrics/vqa.py b/allennlp/training/metrics/vqa.py deleted file mode 100644 index 17316609052..00000000000 --- a/allennlp/training/metrics/vqa.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import Union - -import torch -from overrides import overrides - -from allennlp.training.metrics.metric import Metric -import torch.distributed as dist - - -@Metric.register("vqa") -class VqaMeasure(Metric): - """Compute the VQA metric, as described in - https://www.semanticscholar.org/paper/VQA%3A-Visual-Question-Answering-Agrawal-Lu/97ad70a9fa3f99adf18030e5e38ebe3d90daa2db - - In VQA, we take the answer with the highest score, and then we find out how often - humans decided this was the right answer. The accuracy score for an answer is - `min(1.0, human_count / 3)`. - - This metric takes the logits from the models, i.e., a score for each possible answer, - and the labels for the question, together with their weights. - """ - - def __init__(self) -> None: - self._sum_of_scores: Union[None, torch.Tensor] = None - self._score_count: Union[None, torch.Tensor] = None - - @overrides - def __call__(self, logits: torch.Tensor, labels: torch.Tensor, label_weights: torch.Tensor): - """ - # Parameters - - logits : `torch.Tensor`, required. - A tensor of predictions of shape (batch_size, num_classes). - labels : `torch.Tensor`, required. - A tensor of integer class label of shape (batch_size, num_labels). - label_weights : `torch.Tensor`, required. - A tensor of floats of shape (batch_size, num_labels), giving a weight or score to - every one of the labels. - """ - - device = logits.device - - if self._sum_of_scores is None: - self._sum_of_scores = torch.zeros([], device=device, dtype=label_weights.dtype) - if self._score_count is None: - self._score_count = torch.zeros([], device=device, dtype=torch.int32) - - logits, labels, label_weights = self.detach_tensors(logits, labels, label_weights) - predictions = logits.argmax(dim=1) - - # Sum over dimension 1 gives the score per question. We care about the overall sum though, - # so we sum over all dimensions. 
- self._sum_of_scores += (label_weights * (labels == predictions.unsqueeze(-1))).sum() - self._score_count += labels.size(0) - - from allennlp.common.util import is_distributed - - if is_distributed(): - dist.all_reduce(self._sum_of_scores, op=dist.ReduceOp.SUM) - dist.all_reduce(self._score_count, op=dist.ReduceOp.SUM) - - @overrides - def get_metric(self, reset: bool = False): - """ - # Returns - - score : `float` - """ - from allennlp.common.util import nan_safe_tensor_divide - - return {"score": nan_safe_tensor_divide(self._sum_of_scores, self._score_count).item()} - - @overrides - def reset(self) -> None: - self._sum_of_scores = None - self._score_count = None diff --git a/test_fixtures/data/gqa/images/test_fixtures/n166008.jpg b/test_fixtures/data/gqa/images/test_fixtures/n166008.jpg deleted file mode 100644 index c317c513248..00000000000 Binary files a/test_fixtures/data/gqa/images/test_fixtures/n166008.jpg and /dev/null differ diff --git a/test_fixtures/data/gqa/images/test_fixtures/n578564.jpg b/test_fixtures/data/gqa/images/test_fixtures/n578564.jpg deleted file mode 100644 index 95ffcb9a9fc..00000000000 Binary files a/test_fixtures/data/gqa/images/test_fixtures/n578564.jpg and /dev/null differ diff --git a/test_fixtures/data/gqa/question_dir/questions0.json b/test_fixtures/data/gqa/question_dir/questions0.json deleted file mode 100644 index d6ed830c39b..00000000000 --- a/test_fixtures/data/gqa/question_dir/questions0.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "202218649": { - "semantic": [ - { - "operation": "select", - "dependencies": [], - "argument": "chalkboard (0)" - }, - { - "operation": "relate", - "dependencies": [0], - "argument": "_,hanging above,s (12)" - }, - { - "operation": "query", - "dependencies": [1], - "argument": "name" - } - ], - "entailed": ["202218648"], - "equivalent": ["202218649"], - "question": "What is hanging above the chalkboard?", - "imageId": "n578564", - "isBalanced": true, - "groups": { - "global": "thing", - "local": "14-chalkboard_hanging above,s" - }, - "answer": "picture", - "semanticStr": "select: chalkboard (0)->relate: _,hanging above,s (12) [0]->query: name [1]", - "annotations": { - "answer": {"0": "12"}, - "question": {}, - "fullAnswer": {"1": "12", "6": "0"} - }, - "types": { - "detailed": "relS", - "semantic": "rel", - "structural": "query" - }, - "fullAnswer": "The picture is hanging above the chalkboard." 
- } -} \ No newline at end of file diff --git a/test_fixtures/data/gqa/question_dir/questions1.json b/test_fixtures/data/gqa/question_dir/questions1.json deleted file mode 100644 index cc4a7f2059f..00000000000 --- a/test_fixtures/data/gqa/question_dir/questions1.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "20240871": { - "semantic": [ - { - "operation": "select", - "dependencies": [], - "argument": "water (4)" - }, - { - "operation": "relate", - "dependencies": [0], - "argument": "table,below,s (11)" - }, - { - "operation": "verify shape", - "dependencies": [1], - "argument": "round" - }, - { - "operation": "verify material", - "dependencies": [1], - "argument": "wood " - }, - {"operation": "and", - "dependencies": [2, 3], - "argument": "" - } - ], - "entailed": ["20240900", "20240892", "20240891", "20240890", "20240879", "20240896", "20240895", "20240894", "20240875", "20240897", "20240899", "20240898", "20240870", "20240878", "20240910", "20240877", "20240909", "20240886", "20240887", "20240882", "20240911", "20240872", "20240888", "20240889"], "equivalent": ["20240871", "20240870"], - "question": "Does the table below the water look wooden and round?", - "imageId": "n166008", - "isBalanced": false, - "groups": { - "global": null, - "local": "05-round_wood" - }, - "answer": "yes", - "semanticStr": "select: water (4)->relate: table,below,s (11) [0]->verify shape: round [1]->verify material: wood [1]->and: [2, 3]", - "annotations": { - "answer": {}, - "question": {"2": "11", "5": "4"}, - "fullAnswer": {"2": "11"} - }, - "types": { - "detailed": "verifyAttrs", - "semantic": "attr", - "structural": "logical" - }, - "fullAnswer": "Yes, the table is wooden and round." - } -} \ No newline at end of file diff --git a/test_fixtures/data/gqa/questions.json b/test_fixtures/data/gqa/questions.json deleted file mode 100644 index cad70842302..00000000000 --- a/test_fixtures/data/gqa/questions.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "202218649": { - "semantic": [ - { - "operation": "select", - "dependencies": [], - "argument": "chalkboard (0)" - }, - { - "operation": "relate", - "dependencies": [0], - "argument": "_,hanging above,s (12)" - }, - { - "operation": "query", - "dependencies": [1], - "argument": "name" - } - ], - "entailed": ["202218648"], - "equivalent": ["202218649"], - "question": "What is hanging above the chalkboard?", - "imageId": "n578564", - "isBalanced": true, - "groups": { - "global": "thing", - "local": "14-chalkboard_hanging above,s" - }, - "answer": "picture", - "semanticStr": "select: chalkboard (0)->relate: _,hanging above,s (12) [0]->query: name [1]", - "annotations": { - "answer": {"0": "12"}, - "question": {}, - "fullAnswer": {"1": "12", "6": "0"} - }, - "types": { - "detailed": "relS", - "semantic": "rel", - "structural": "query" - }, - "fullAnswer": "The picture is hanging above the chalkboard." 
- } -} \ No newline at end of file diff --git a/test_fixtures/data/vqav2/images/test_fixture/COCO_train2014_000000458752.jpg b/test_fixtures/data/images/COCO_train2014_000000458752.jpg similarity index 100% rename from test_fixtures/data/vqav2/images/test_fixture/COCO_train2014_000000458752.jpg rename to test_fixtures/data/images/COCO_train2014_000000458752.jpg diff --git a/test_fixtures/data/visual_entailment/images/2248275918.jpg b/test_fixtures/data/visual_entailment/images/2248275918.jpg deleted file mode 100644 index 3f04748697a..00000000000 Binary files a/test_fixtures/data/visual_entailment/images/2248275918.jpg and /dev/null differ diff --git a/test_fixtures/data/visual_entailment/images/402978771.jpg b/test_fixtures/data/visual_entailment/images/402978771.jpg deleted file mode 100644 index 76dedb05696..00000000000 Binary files a/test_fixtures/data/visual_entailment/images/402978771.jpg and /dev/null differ diff --git a/test_fixtures/data/visual_entailment/sample_pairs.jsonl b/test_fixtures/data/visual_entailment/sample_pairs.jsonl deleted file mode 100644 index 1aaa93ee09b..00000000000 --- a/test_fixtures/data/visual_entailment/sample_pairs.jsonl +++ /dev/null @@ -1,16 +0,0 @@ -{"Flickr30K_ID": "2248275918", "annotator_labels": ["contradiction"], "captionID": "2248275918.jpg#2", "gold_label": "contradiction", "pairID": "2248275918.jpg#2r1c", "sentence1": "A toddler poses in front of a computer at a business office.", "sentence1_binary_parse": "( ( A toddler ) ( ( poses ( in ( front ( of ( ( a computer ) ( at ( a ( business office ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (NN toddler)) (VP (VBZ poses) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (NP (DT a) (NN computer)) (PP (IN at) (NP (DT a) (NN business) (NN office)))))))) (. .)))", "sentence2": "A toddler sleeps outside.", "sentence2_binary_parse": "( ( A toddler ) ( ( sleeps outside ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN toddler)) (VP (VBZ sleeps) (ADVP (RB outside))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["neutral"], "captionID": "2248275918.jpg#2", "gold_label": "neutral", "pairID": "2248275918.jpg#2r1n", "sentence1": "A toddler poses in front of a computer at a business office.", "sentence1_binary_parse": "( ( A toddler ) ( ( poses ( in ( front ( of ( ( a computer ) ( at ( a ( business office ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (NN toddler)) (VP (VBZ poses) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (NP (DT a) (NN computer)) (PP (IN at) (NP (DT a) (NN business) (NN office)))))))) (. .)))", "sentence2": "A toddler poses at the office.", "sentence2_binary_parse": "( ( A toddler ) ( ( poses ( at ( the office ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN toddler)) (VP (VBZ poses) (PP (IN at) (NP (DT the) (NN office)))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["entailment"], "captionID": "2248275918.jpg#2", "gold_label": "entailment", "pairID": "2248275918.jpg#2r1e", "sentence1": "A toddler poses in front of a computer at a business office.", "sentence1_binary_parse": "( ( A toddler ) ( ( poses ( in ( front ( of ( ( a computer ) ( at ( a ( business office ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (NN toddler)) (VP (VBZ poses) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (NP (DT a) (NN computer)) (PP (IN at) (NP (DT a) (NN business) (NN office)))))))) (. 
.)))", "sentence2": "A toddler poses in front of a computer indoors.", "sentence2_binary_parse": "( ( A toddler ) ( ( ( poses ( in ( front ( of ( a computer ) ) ) ) ) indoors ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN toddler)) (VP (VBZ poses) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer))))) (ADVP (RB indoors))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["entailment"], "captionID": "2248275918.jpg#3", "gold_label": "entailment", "pairID": "2248275918.jpg#3r1e", "sentence1": "An infant is sitting in front of a computer.", "sentence1_binary_parse": "( ( An infant ) ( ( is ( sitting ( in ( front ( of ( a computer ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT An) (NN infant)) (VP (VBZ is) (VP (VBG sitting) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer))))))) (. .)))", "sentence2": "The baby is in front of the computer.", "sentence2_binary_parse": "( ( The baby ) ( ( is ( in ( front ( of ( the computer ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN baby)) (VP (VBZ is) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT the) (NN computer)))))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["neutral"], "captionID": "2248275918.jpg#3", "gold_label": "neutral", "pairID": "2248275918.jpg#3r1n", "sentence1": "An infant is sitting in front of a computer.", "sentence1_binary_parse": "( ( An infant ) ( ( is ( sitting ( in ( front ( of ( a computer ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT An) (NN infant)) (VP (VBZ is) (VP (VBG sitting) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer))))))) (. .)))", "sentence2": "The baby loves the computer.", "sentence2_binary_parse": "( ( The baby ) ( ( loves ( the computer ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN baby)) (VP (VBZ loves) (NP (DT the) (NN computer))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["contradiction", "neutral", "neutral", "neutral", "contradiction"], "captionID": "2248275918.jpg#3", "gold_label": "neutral", "pairID": "2248275918.jpg#3r1c", "sentence1": "An infant is sitting in front of a computer.", "sentence1_binary_parse": "( ( An infant ) ( ( is ( sitting ( in ( front ( of ( a computer ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT An) (NN infant)) (VP (VBZ is) (VP (VBG sitting) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer))))))) (. .)))", "sentence2": "The computer smells like diapers.", "sentence2_binary_parse": "( ( The computer ) ( ( smells ( like diapers ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN computer)) (VP (VBZ smells) (PP (IN like) (NP (NNS diapers)))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["neutral"], "captionID": "2248275918.jpg#0", "gold_label": "neutral", "pairID": "2248275918.jpg#0r1n", "sentence1": "A little asian girl is sitting at a computer desk about to grab the mouse.", "sentence1_binary_parse": "( ( A ( little ( asian girl ) ) ) ( ( is ( ( sitting ( at ( a ( computer ( desk about ) ) ) ) ) ( to ( grab ( the mouse ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (JJ little) (NN asian) (NN girl)) (VP (VBZ is) (VP (VBG sitting) (PP (IN at) (NP (DT a) (NN computer) (NN desk) (RB about))) (S (VP (TO to) (VP (VB grab) (NP (DT the) (NN mouse))))))) (. 
.)))", "sentence2": "The little asian girl at the computer is about to play a game.", "sentence2_binary_parse": "( ( ( The ( little ( asian girl ) ) ) ( at ( the computer ) ) ) ( ( is ( about ( to ( play ( a game ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (NP (DT The) (JJ little) (NN asian) (NN girl)) (PP (IN at) (NP (DT the) (NN computer)))) (VP (VBZ is) (VP (IN about) (S (VP (TO to) (VP (VB play) (NP (DT a) (NN game))))))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["contradiction"], "captionID": "2248275918.jpg#0", "gold_label": "contradiction", "pairID": "2248275918.jpg#0r1c", "sentence1": "A little asian girl is sitting at a computer desk about to grab the mouse.", "sentence1_binary_parse": "( ( A ( little ( asian girl ) ) ) ( ( is ( ( sitting ( at ( a ( computer ( desk about ) ) ) ) ) ( to ( grab ( the mouse ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (JJ little) (NN asian) (NN girl)) (VP (VBZ is) (VP (VBG sitting) (PP (IN at) (NP (DT a) (NN computer) (NN desk) (RB about))) (S (VP (TO to) (VP (VB grab) (NP (DT the) (NN mouse))))))) (. .)))", "sentence2": "The little hispanic girl sits in front of the computer desk.", "sentence2_binary_parse": "( ( The ( little ( hispanic girl ) ) ) ( ( sits ( in ( front ( of ( the ( computer desk ) ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (JJ little) (JJ hispanic) (NN girl)) (VP (VBZ sits) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT the) (NN computer) (NN desk)))))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["entailment"], "captionID": "2248275918.jpg#0", "gold_label": "entailment", "pairID": "2248275918.jpg#0r1e", "sentence1": "A little asian girl is sitting at a computer desk about to grab the mouse.", "sentence1_binary_parse": "( ( A ( little ( asian girl ) ) ) ( ( is ( ( sitting ( at ( a ( computer ( desk about ) ) ) ) ) ( to ( grab ( the mouse ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (JJ little) (NN asian) (NN girl)) (VP (VBZ is) (VP (VBG sitting) (PP (IN at) (NP (DT a) (NN computer) (NN desk) (RB about))) (S (VP (TO to) (VP (VB grab) (NP (DT the) (NN mouse))))))) (. .)))", "sentence2": "The asian girl sits at the computer desk.", "sentence2_binary_parse": "( ( The ( asian girl ) ) ( ( sits ( at ( the ( computer desk ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN asian) (NN girl)) (VP (VBZ sits) (PP (IN at) (NP (DT the) (NN computer) (NN desk)))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["neutral"], "captionID": "2248275918.jpg#1", "gold_label": "neutral", "pairID": "2248275918.jpg#1r1n", "sentence1": "Young girl with her hair up playing with her parents computer.", "sentence1_binary_parse": "( ( ( ( Young girl ) ( with ( her hair ) ) ) ( up ( playing ( with ( her ( parents computer ) ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (NP (JJ Young) (NN girl)) (PP (IN with) (NP (PRP$ her) (NN hair)))) (VP (ADVP (RB up)) (VBG playing) (PP (IN with) (NP (PRP$ her) (NNS parents) (NN computer)))) (. .)))", "sentence2": "A person is learning a new program on the computer with her parents.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( learning ( a ( new program ) ) ) ( on ( ( the computer ) ( with ( her parents ) ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG learning) (NP (DT a) (JJ new) (NN program)) (PP (IN on) (NP (NP (DT the) (NN computer)) (PP (IN with) (NP (PRP$ her) (NNS parents))))))) (. 
.)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["contradiction"], "captionID": "2248275918.jpg#1", "gold_label": "contradiction", "pairID": "2248275918.jpg#1r1c", "sentence1": "Young girl with her hair up playing with her parents computer.", "sentence1_binary_parse": "( ( ( ( Young girl ) ( with ( her hair ) ) ) ( up ( playing ( with ( her ( parents computer ) ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (NP (JJ Young) (NN girl)) (PP (IN with) (NP (PRP$ her) (NN hair)))) (VP (ADVP (RB up)) (VBG playing) (PP (IN with) (NP (PRP$ her) (NNS parents) (NN computer)))) (. .)))", "sentence2": "An old man works on MTurk.", "sentence2_binary_parse": "( ( An ( old man ) ) ( ( works ( on MTurk ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT An) (JJ old) (NN man)) (VP (VBZ works) (PP (IN on) (NP (NNP MTurk)))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["entailment"], "captionID": "2248275918.jpg#1", "gold_label": "entailment", "pairID": "2248275918.jpg#1r1e", "sentence1": "Young girl with her hair up playing with her parents computer.", "sentence1_binary_parse": "( ( ( ( Young girl ) ( with ( her hair ) ) ) ( up ( playing ( with ( her ( parents computer ) ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (NP (JJ Young) (NN girl)) (PP (IN with) (NP (PRP$ her) (NN hair)))) (VP (ADVP (RB up)) (VBG playing) (PP (IN with) (NP (PRP$ her) (NNS parents) (NN computer)))) (. .)))", "sentence2": "A person is playing on the computer.", "sentence2_binary_parse": "( ( A person ) ( ( is ( playing ( on ( the computer ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG playing) (PP (IN on) (NP (DT the) (NN computer))))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["contradiction"], "captionID": "2248275918.jpg#4", "gold_label": "contradiction", "pairID": "2248275918.jpg#4r1c", "sentence1": "A small child sits in front of a computer.", "sentence1_binary_parse": "( ( A ( small child ) ) ( ( sits ( in ( front ( of ( a computer ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (JJ small) (NN child)) (VP (VBZ sits) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer)))))) (. .)))", "sentence2": "A man sits in front of a television.", "sentence2_binary_parse": "( ( A man ) ( ( sits ( in ( front ( of ( a television ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN man)) (VP (VBZ sits) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN television)))))) (. .)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["entailment"], "captionID": "2248275918.jpg#4", "gold_label": "entailment", "pairID": "2248275918.jpg#4r1e", "sentence1": "A small child sits in front of a computer.", "sentence1_binary_parse": "( ( A ( small child ) ) ( ( sits ( in ( front ( of ( a computer ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (JJ small) (NN child)) (VP (VBZ sits) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer)))))) (. .)))", "sentence2": "A child sits by a computer.", "sentence2_binary_parse": "( ( A child ) ( ( sits ( by ( a computer ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN child)) (VP (VBZ sits) (PP (IN by) (NP (DT a) (NN computer)))) (. 
.)))"} -{"Flickr30K_ID": "2248275918", "annotator_labels": ["neutral"], "captionID": "2248275918.jpg#4", "gold_label": "neutral", "pairID": "2248275918.jpg#4r1n", "sentence1": "A small child sits in front of a computer.", "sentence1_binary_parse": "( ( A ( small child ) ) ( ( sits ( in ( front ( of ( a computer ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (JJ small) (NN child)) (VP (VBZ sits) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (NN computer)))))) (. .)))", "sentence2": "A small child sits in front of a laptop computer.", "sentence2_binary_parse": "( ( A ( small child ) ) ( ( sits ( in ( front ( of ( a ( laptop computer ) ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (JJ small) (NN child)) (VP (VBZ sits) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (DT a) (JJ laptop) (NN computer)))))) (. .)))"} -{"Flickr30K_ID": "402978771", "annotator_labels": ["entailment"], "captionID": "402978771.jpg#1", "gold_label": "entailment", "pairID": "402978771.jpg#1r1e", "sentence1": "A woman sits on a bench next to a pay phone and faces train tracks and a large building.", "sentence1_binary_parse": "( ( A woman ) ( ( ( ( sits ( on ( ( a bench ) ( next ( to ( a ( pay phone ) ) ) ) ) ) ) and ) ( faces ( ( ( train tracks ) and ) ( a ( large building ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT A) (NN woman)) (VP (VP (VBZ sits) (PP (IN on) (NP (NP (DT a) (NN bench)) (ADVP (JJ next) (PP (TO to) (NP (DT a) (NN pay) (NN phone))))))) (CC and) (VP (VBZ faces) (NP (NP (NN train) (NNS tracks)) (CC and) (NP (DT a) (JJ large) (NN building))))) (. .)))", "sentence2": "The woman is sitting outside.", "sentence2_binary_parse": "( ( The woman ) ( ( is ( sitting outside ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is) (VP (VBG sitting) (ADVP (RB outside)))) (. 
.)))"} diff --git a/test_fixtures/data/vqav2/annotations.json b/test_fixtures/data/vqav2/annotations.json deleted file mode 100644 index 379c426b02a..00000000000 --- a/test_fixtures/data/vqav2/annotations.json +++ /dev/null @@ -1,184 +0,0 @@ -{ - "info": { - "description": "This is v2.0 of the VQA dataset.", - "url": "http://visualqa.org", - "version": "2.0", - "year": 2017, - "contributor": "VQA Team", - "date_created": "2017-04-26 17:07:13" - }, - "task_type": "Open-Ended", - "data_type": "mscoco", - "license": { - "url": "http://creativecommons.org/licenses/by/4.0/", - "name": "Creative Commons Attribution 4.0 International License" - }, - "data_subtype": "train2014", - "annotations": [ - { - "question_id": 458752000, - "answers": [ - { - "answer": "net", - "answer_confidence": "maybe", - "answer_id": 1 - }, - { - "answer": "net", - "answer_confidence": "yes", - "answer_id": 2 - }, - { - "answer": "net", - "answer_confidence": "yes", - "answer_id": 3 - }, - { - "answer": "netting", - "answer_confidence": "yes", - "answer_id": 4 - }, - { - "answer": "net", - "answer_confidence": "yes", - "answer_id": 5 - }, - { - "answer": "net", - "answer_confidence": "yes", - "answer_id": 6 - }, - { - "answer": "mesh", - "answer_confidence": "maybe", - "answer_id": 7 - }, - { - "answer": "net", - "answer_confidence": "yes", - "answer_id": 8 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 9 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 10 - } - ] - }, - { - "question_id": 458752001, - "answers": [ - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 1 - }, - { - "answer": "catcher", - "answer_confidence": "no", - "answer_id": 2 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 3 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 4 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 5 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 6 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 7 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 8 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 9 - }, - { - "answer": "pitcher", - "answer_confidence": "yes", - "answer_id": 10 - } - ] - }, - { - "question_id": 458752002, - "answers": [ - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 1 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 2 - }, - { - "answer": "orange", - "answer_confidence": "maybe", - "answer_id": 3 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 4 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 5 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 6 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 7 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 8 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 9 - }, - { - "answer": "orange", - "answer_confidence": "yes", - "answer_id": 10 - } - ] - } - ] -} diff --git a/test_fixtures/data/vqav2/images/experiment_cache/coordinates b/test_fixtures/data/vqav2/images/experiment_cache/coordinates deleted file mode 100644 index 7d1ec6e434b..00000000000 Binary files a/test_fixtures/data/vqav2/images/experiment_cache/coordinates and /dev/null differ diff --git 
a/test_fixtures/data/vqav2/images/experiment_cache/coordinates-lock b/test_fixtures/data/vqav2/images/experiment_cache/coordinates-lock deleted file mode 100644 index d2b5a8fff4b..00000000000 Binary files a/test_fixtures/data/vqav2/images/experiment_cache/coordinates-lock and /dev/null differ diff --git a/test_fixtures/data/vqav2/images/experiment_cache/features b/test_fixtures/data/vqav2/images/experiment_cache/features deleted file mode 100644 index 33df98f4f53..00000000000 Binary files a/test_fixtures/data/vqav2/images/experiment_cache/features and /dev/null differ diff --git a/test_fixtures/data/vqav2/images/experiment_cache/features-lock b/test_fixtures/data/vqav2/images/experiment_cache/features-lock deleted file mode 100644 index 552c9d4aec8..00000000000 Binary files a/test_fixtures/data/vqav2/images/experiment_cache/features-lock and /dev/null differ diff --git a/test_fixtures/data/vqav2/questions.json b/test_fixtures/data/vqav2/questions.json deleted file mode 100644 index 8afde5fa3ad..00000000000 --- a/test_fixtures/data/vqav2/questions.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "info": { - "description": "This is v2.0 of the VQA dataset.", - "url": "http://visualqa.org", - "version": "2.0", - "year": 2017, - "contributor": "VQA Team", - "date_created": "2017-04-26 17:07:13" - }, - "task_type": "Open-Ended", - "data_type": "mscoco", - "license": { - "url": "http://creativecommons.org/licenses/by/4.0/", - "name": "Creative Commons Attribution 4.0 International License" - }, - "data_subtype": "train2014", - "questions": [ - { - "image_id": 458752, - "question": "What is this photo taken looking through?", - "question_id": 458752000 - }, - { - "image_id": 458752, - "question": "What position is this man playing?", - "question_id": 458752001 - }, - { - "image_id": 458752, - "question": "What color is the players shirt?", - "question_id": 458752002 - } - ] -} diff --git a/test_fixtures/vilbert_ve/experiment.jsonnet b/test_fixtures/vilbert_ve/experiment.jsonnet deleted file mode 100644 index c025bb5cd03..00000000000 --- a/test_fixtures/vilbert_ve/experiment.jsonnet +++ /dev/null @@ -1,80 +0,0 @@ -local model_name = "epwalsh/bert-xsmall-dummy"; - -{ - "dataset_reader": { - "type": "visual-entailment", - "image_dir": "test_fixtures/data/visual_entailment/images", - "image_loader": "torch", - "image_featurizer": "null", - "region_detector": { - "type": "random", - "seed": 322 - }, - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - } - }, - "train_data_path": "test_fixtures/data/visual_entailment/sample_pairs.jsonl", - "validation_data_path": "test_fixtures/data/visual_entailment/sample_pairs.jsonl", - "vocabulary": {"min_count": {"answers": 2}}, - "datasets_for_vocab_creation": ["train"], - "model": { - "type": "ve_vilbert", - "text_embeddings": { - "vocab_size": 250, - "embedding_size": 20, - "pad_token_id": 0, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "dropout": 0.0 - }, - "image_embeddings": { - "feature_size": 10, - "embedding_size": 200 - }, - "encoder": { - # text - "hidden_size1": 20, - "num_hidden_layers1": 1, - "intermediate_size1": 40, - "num_attention_heads1": 1, - "attention_dropout1": 0.1, - "hidden_dropout1": 0.1, - "biattention_id1": [0, 1], - "fixed_layer1": 0, - - # vision - "hidden_size2": 200, - "num_hidden_layers2": 1, - "intermediate_size2": 50, - "num_attention_heads2": 1, - "attention_dropout2": 0.0, - 
"hidden_dropout2": 0.0, - "biattention_id2": [0, 1], - "fixed_layer2": 0, - - "combined_num_attention_heads": 2, - "combined_hidden_size": 200, - "activation": "gelu", - }, - "pooled_output_dim": 100, - "fusion_method": "sum", - }, - "data_loader": { - "batch_size": 4 - }, - "trainer": { - "optimizer": { - "type": "huggingface_adamw", - "lr": 0.00005 - }, - "num_epochs": 1, - }, -} diff --git a/test_fixtures/vilbert_ve/experiment_from_huggingface.jsonnet b/test_fixtures/vilbert_ve/experiment_from_huggingface.jsonnet deleted file mode 100644 index 841edf839f5..00000000000 --- a/test_fixtures/vilbert_ve/experiment_from_huggingface.jsonnet +++ /dev/null @@ -1,60 +0,0 @@ -local model_name = "epwalsh/bert-xsmall-dummy"; -{ - "dataset_reader": { - "type": "visual-entailment", - "image_dir": "test_fixtures/data/visual_entailment/images", - "image_loader": "torch", - "image_featurizer": "null", - "region_detector": { - "type": "random", - "seed": 322 - }, - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - } - }, - "train_data_path": "test_fixtures/data/visual_entailment/sample_pairs.jsonl", - "validation_data_path": "test_fixtures/data/visual_entailment/sample_pairs.jsonl", - "vocabulary": {"min_count": {"answers": 2}}, - "datasets_for_vocab_creation": ["train"], - "model": { - "type": "ve_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 10, - "image_num_hidden_layers": 1, - "image_hidden_size": 200, - "image_num_attention_heads": 1, - "image_intermediate_size": 50, - "image_attention_dropout": 0.0, - "image_hidden_dropout": 0.0, - "image_biattention_id": [0, 1], - "image_fixed_layer": 0, - - "text_biattention_id": [0, 1], - "text_fixed_layer": 0, - - "combined_hidden_size": 200, - "combined_num_attention_heads": 4, - - "pooled_output_dim": 100, - "fusion_method": "sum", - "pooled_dropout": 0.0, - }, - "data_loader": { - "batch_size": 32 - }, - "trainer": { - "optimizer": { - "type": "huggingface_adamw", - "lr": 0.00005 - }, - "num_epochs": 1, - }, -} diff --git a/test_fixtures/vilbert_vqa/experiment.jsonnet b/test_fixtures/vilbert_vqa/experiment.jsonnet deleted file mode 100644 index bd202160900..00000000000 --- a/test_fixtures/vilbert_vqa/experiment.jsonnet +++ /dev/null @@ -1,80 +0,0 @@ -local model_name = "epwalsh/bert-xsmall-dummy"; - -{ - "dataset_reader": { - "type": "vqav2", - "image_dir": "test_fixtures/data/vqav2/images", - "image_loader": "torch", - "image_featurizer": "null", - "region_detector": { - "type": "random", - "seed": 322 - }, - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - } - }, - "train_data_path": "unittest", - "validation_data_path": "unittest", - "vocabulary": {"min_count": {"answers": 2}}, - "datasets_for_vocab_creation": ["train"], - "model": { - "type": "vqa_vilbert", - "text_embeddings": { - "vocab_size": 250, - "embedding_size": 20, - "pad_token_id": 0, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "dropout": 0.0 - }, - "image_embeddings": { - "feature_size": 10, - "embedding_size": 200 - }, - "encoder": { - # text - "hidden_size1": 20, - "num_hidden_layers1": 1, - "intermediate_size1": 40, - "num_attention_heads1": 1, - "attention_dropout1": 0.1, - "hidden_dropout1": 0.1, - "biattention_id1": [0, 1], - "fixed_layer1": 0, - - # vision - 
"hidden_size2": 200, - "num_hidden_layers2": 1, - "intermediate_size2": 50, - "num_attention_heads2": 1, - "attention_dropout2": 0.0, - "hidden_dropout2": 0.0, - "biattention_id2": [0, 1], - "fixed_layer2": 0, - - "combined_num_attention_heads": 2, - "combined_hidden_size": 200, - "activation": "gelu", - }, - "pooled_output_dim": 100, - "fusion_method": "sum", - }, - "data_loader": { - "batch_size": 4 - }, - "trainer": { - "optimizer": { - "type": "huggingface_adamw", - "lr": 0.00005 - }, - "num_epochs": 1, - }, -} diff --git a/test_fixtures/vilbert_vqa/experiment_from_huggingface.jsonnet b/test_fixtures/vilbert_vqa/experiment_from_huggingface.jsonnet deleted file mode 100644 index bd6b7a1d794..00000000000 --- a/test_fixtures/vilbert_vqa/experiment_from_huggingface.jsonnet +++ /dev/null @@ -1,60 +0,0 @@ -local model_name = "epwalsh/bert-xsmall-dummy"; -{ - "dataset_reader": { - "type": "vqav2", - "image_dir": "test_fixtures/data/vqav2/images", - "image_loader": "torch", - "image_featurizer": "null", - "region_detector": { - "type": "random", - "seed": 322 - }, - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - } - }, - "train_data_path": "unittest", - "validation_data_path": "unittest", - "vocabulary": {"min_count": {"answers": 2}}, - "datasets_for_vocab_creation": ["train"], - "model": { - "type": "vqa_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 10, - "image_num_hidden_layers": 1, - "image_hidden_size": 20, - "image_num_attention_heads": 1, - "image_intermediate_size": 5, - "image_attention_dropout": 0.0, - "image_hidden_dropout": 0.0, - "image_biattention_id": [0, 1], - "image_fixed_layer": 0, - - "text_biattention_id": [0, 1], - "text_fixed_layer": 0, - - "combined_hidden_size": 20, - "combined_num_attention_heads": 2, - - "pooled_output_dim": 20, - "fusion_method": "sum", - "pooled_dropout": 0.0, - }, - "data_loader": { - "batch_size": 32 - }, - "trainer": { - "optimizer": { - "type": "huggingface_adamw", - "lr": 0.00005 - }, - "num_epochs": 1, - }, -} diff --git a/tests/data/dataset_readers/gqa_test.py b/tests/data/dataset_readers/gqa_test.py deleted file mode 100644 index fa216003ac4..00000000000 --- a/tests/data/dataset_readers/gqa_test.py +++ /dev/null @@ -1,82 +0,0 @@ -from allennlp.common.lazy import Lazy -from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Batch, Vocabulary -from allennlp.data.dataset_readers import GQAReader -from allennlp.data.image_loader import TorchImageLoader -from allennlp.data.tokenizers import WhitespaceTokenizer -from allennlp.data.token_indexers import SingleIdTokenIndexer -from allennlp.modules.vision.grid_embedder import NullGridEmbedder -from allennlp.modules.vision.region_detector import RandomRegionDetector - - -class TestGQAReader(AllenNlpTestCase): - def setup_method(self): - super().setup_method() - self.reader = GQAReader( - image_dir=self.FIXTURES_ROOT / "data" / "gqa" / "images", - image_loader=TorchImageLoader(), - image_featurizer=Lazy(NullGridEmbedder), - region_detector=Lazy(RandomRegionDetector), - tokenizer=WhitespaceTokenizer(), - token_indexers={"tokens": SingleIdTokenIndexer()}, - ) - - def test_read(self): - instances = list(self.reader.read("test_fixtures/data/gqa/questions.json")) - assert len(instances) == 1 - - instance = instances[0] - assert len(instance.fields) == 6 - assert len(instance["question"]) == 6 - question_tokens = 
[t.text for t in instance["question"]] - assert question_tokens == ["What", "is", "hanging", "above", "the", "chalkboard?"] - assert instance["labels"][0].label == "picture" - - batch = Batch(instances) - batch.index_instances(Vocabulary()) - tensors = batch.as_tensor_dict() - - # (batch size, num boxes (fake), num features (fake)) - assert tensors["box_features"].size() == (1, 2, 10) - - # (batch size, num boxes (fake), 4 coords) - assert tensors["box_coordinates"].size() == (1, 2, 4) - - # (batch size, num boxes (fake),) - assert tensors["box_mask"].size() == (1, 2) - - def test_read_from_dir(self): - # Test reading from multiple files in a directory - instances = list(self.reader.read("test_fixtures/data/gqa/question_dir/")) - assert len(instances) == 2 - - instance = instances[1] - assert len(instance.fields) == 6 - assert len(instance["question"]) == 10 - question_tokens = [t.text for t in instance["question"]] - assert question_tokens == [ - "Does", - "the", - "table", - "below", - "the", - "water", - "look", - "wooden", - "and", - "round?", - ] - assert instance["labels"][0].label == "yes" - - batch = Batch(instances) - batch.index_instances(Vocabulary()) - tensors = batch.as_tensor_dict() - - # (batch size, num boxes (fake), num features (fake)) - assert tensors["box_features"].size() == (2, 2, 10) - - # (batch size, num boxes (fake), 4 coords) - assert tensors["box_coordinates"].size() == (2, 2, 4) - - # (batch size, num boxes (fake),) - assert tensors["box_mask"].size() == (2, 2) diff --git a/tests/data/dataset_readers/visual_entailment_test.py b/tests/data/dataset_readers/visual_entailment_test.py deleted file mode 100644 index b653548e500..00000000000 --- a/tests/data/dataset_readers/visual_entailment_test.py +++ /dev/null @@ -1,45 +0,0 @@ -from allennlp.common.lazy import Lazy -from allennlp.common.testing import AllenNlpTestCase -from allennlp.data import Batch, Vocabulary -from allennlp.data.dataset_readers import VisualEntailmentReader -from allennlp.data.image_loader import TorchImageLoader -from allennlp.data.tokenizers import WhitespaceTokenizer -from allennlp.data.token_indexers import SingleIdTokenIndexer -from allennlp.modules.vision.grid_embedder import NullGridEmbedder -from allennlp.modules.vision.region_detector import RandomRegionDetector - - -class TestVisualEntailmentReader(AllenNlpTestCase): - def test_read(self): - reader = VisualEntailmentReader( - image_dir=self.FIXTURES_ROOT / "data" / "visual_entailment", - image_loader=TorchImageLoader(), - image_featurizer=Lazy(NullGridEmbedder), - region_detector=Lazy(RandomRegionDetector), - tokenizer=WhitespaceTokenizer(), - token_indexers={"tokens": SingleIdTokenIndexer()}, - ) - instances = list(reader.read("test_fixtures/data/visual_entailment/sample_pairs.jsonl")) - assert len(instances) == 16 - - instance = instances[0] - assert len(instance.fields) == 5 - assert len(instance["hypothesis"]) == 4 - sentence_tokens = [t.text for t in instance["hypothesis"]] - assert sentence_tokens == ["A", "toddler", "sleeps", "outside."] - assert instance["labels"].label == "contradiction" - - batch = Batch(instances) - vocab = Vocabulary() - vocab.add_tokens_to_namespace(["entailment", "contradiction", "neutral"], "labels") - batch.index_instances(vocab) - tensors = batch.as_tensor_dict() - - # (batch size, num boxes (fake), num features (fake)) - assert tensors["box_features"].size() == (16, 2, 10) - - # (batch size, num boxes (fake), 4 coords) - assert tensors["box_coordinates"].size() == (16, 2, 4) - - # (batch_size, num 
boxes (fake),) - assert tensors["box_mask"].size() == (16, 2) diff --git a/tests/data/dataset_readers/vqav2_test.py b/tests/data/dataset_readers/vqav2_test.py deleted file mode 100644 index 1b3c61c4421..00000000000 --- a/tests/data/dataset_readers/vqav2_test.py +++ /dev/null @@ -1,66 +0,0 @@ -import torch - -from allennlp.common.testing import AllenNlpTestCase -from allennlp.common.lazy import Lazy -from allennlp.data import Batch, Vocabulary -from allennlp.data.dataset_readers import VQAv2Reader -from allennlp.data.image_loader import TorchImageLoader -from allennlp.data.tokenizers import WhitespaceTokenizer -from allennlp.data.token_indexers import SingleIdTokenIndexer -from allennlp.modules.vision.grid_embedder import NullGridEmbedder -from allennlp.modules.vision.region_detector import RandomRegionDetector - - -class TestVQAv2Reader(AllenNlpTestCase): - def test_read(self): - reader = VQAv2Reader( - image_dir=self.FIXTURES_ROOT / "data" / "vqav2" / "images", - image_loader=TorchImageLoader(), - image_featurizer=Lazy(NullGridEmbedder), - region_detector=Lazy(RandomRegionDetector), - tokenizer=WhitespaceTokenizer(), - token_indexers={"tokens": SingleIdTokenIndexer()}, - ) - instances = list(reader.read("unittest")) - assert len(instances) == 3 - - instance = instances[0] - assert len(instance.fields) == 6 - assert len(instance["question"]) == 7 - question_tokens = [t.text for t in instance["question"]] - assert question_tokens == ["What", "is", "this", "photo", "taken", "looking", "through?"] - assert len(instance["labels"]) == 5 - labels = [field.label for field in instance["labels"].field_list] - assert labels == ["net", "netting", "mesh", "pitcher", "orange"] - assert torch.allclose( - instance["label_weights"].tensor, - torch.tensor([1.0, 1.0 / 3, 1.0 / 3, 1.0 / 3, 1.0 / 3]), - ) - - batch = Batch(instances) - batch.index_instances(Vocabulary()) - tensors = batch.as_tensor_dict() - - # (batch size, num boxes (fake), num features (fake)) - assert tensors["box_features"].size() == (3, 2, 10) - - # (batch size, num boxes (fake), 4 coords) - assert tensors["box_coordinates"].size() == (3, 2, 4) - - # (batch size, num boxes (fake),) - assert tensors["box_mask"].size() == (3, 2) - - # Nothing should be masked out since the number of fake boxes is the same - # for each item in the batch. - assert tensors["box_mask"].all() - - def test_read_without_images(self): - reader = VQAv2Reader( - tokenizer=WhitespaceTokenizer(), - token_indexers={"tokens": SingleIdTokenIndexer()}, - ) - instances = list(reader.read("unittest")) - assert len(instances) == 3 - assert "box_coordinates" not in instances[0] - assert "box_features" not in instances[0] - assert "box_mask" not in instances[0] diff --git a/tests/data/image_loader_test.py b/tests/data/image_loader_test.py index d72e0f10bf4..096f1770639 100644 --- a/tests/data/image_loader_test.py +++ b/tests/data/image_loader_test.py @@ -10,12 +10,7 @@ class TorchImageLoaderTest(AllenNlpTestCase): def setup_method(self): super().setup_method() self.image_fixture_path = str( - self.FIXTURES_ROOT - / "data" - / "vqav2" - / "images" - / "test_fixture" - / "COCO_train2014_000000458752.jpg" + self.FIXTURES_ROOT / "data" / "images" / "COCO_train2014_000000458752.jpg" ) # Create a few small images of different sizes from the fixture. 
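The removed vqav2_test.py above asserts label weights of [1.0, 1/3, 1/3, 1/3, 1/3] for the fixture question about the net. A small sketch of how those weights follow from the deleted annotations fixture under the `min(1.0, human_count / 3)` rule described in the removed VqaMeasure docstring, counting every listed answer equally regardless of answer_confidence (an assumption, but one consistent with the asserted values):

from collections import Counter

# The ten human answers given for fixture question 458752000.
answers = ["net", "net", "net", "netting", "net", "net", "mesh", "net", "pitcher", "orange"]
counts = Counter(answers)

weights = {answer: min(1.0, count / 3) for answer, count in counts.items()}
assert weights["net"] == 1.0                    # 6 votes, capped at 1.0
assert abs(weights["netting"] - 1 / 3) < 1e-9   # single vote
assert abs(weights["orange"] - 1 / 3) < 1e-9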
diff --git a/tests/models/vilbert_vqa_test.py b/tests/models/vilbert_vqa_test.py deleted file mode 100644 index 77c77e0e155..00000000000 --- a/tests/models/vilbert_vqa_test.py +++ /dev/null @@ -1,78 +0,0 @@ -from transformers import AutoModel - -from allennlp.common.testing import ModelTestCase -from allennlp.data import Vocabulary -from allennlp.models.vilbert_vqa import VqaVilbert -from allennlp.common.testing import assert_equal_parameters - - -class TestVqaVilbert(ModelTestCase): - def test_model_can_train_save_and_load_small_model(self): - param_file = self.FIXTURES_ROOT / "vilbert_vqa" / "experiment.jsonnet" - self.ensure_model_can_train_save_and_load(param_file) - - def test_model_can_train_save_and_load_with_cache(self): - import tempfile - - with tempfile.TemporaryDirectory(prefix=self.__class__.__name__) as d: - overrides = {"dataset_reader": {"feature_cache_dir": str(d)}} - import json - - overrides = json.dumps(overrides) - param_file = self.FIXTURES_ROOT / "vilbert_vqa" / "experiment.jsonnet" - self.ensure_model_can_train_save_and_load(param_file, overrides=overrides) - - def test_model_can_train_save_and_load_from_huggingface(self): - param_file = self.FIXTURES_ROOT / "vilbert_vqa" / "experiment_from_huggingface.jsonnet" - self.ensure_model_can_train_save_and_load(param_file) - - def test_model_loads_weights_correctly(self): - vocab = Vocabulary() - vocab.add_tokens_to_namespace(["orange", "net", "netting", "pitcher", "catcher"], "answers") - - model_name = "epwalsh/bert-xsmall-dummy" - model = VqaVilbert.from_huggingface_model_name( - vocab=vocab, - model_name=model_name, - image_feature_dim=2048, - image_num_hidden_layers=1, - image_hidden_size=6, - combined_hidden_size=10, - pooled_output_dim=7, - image_intermediate_size=11, - image_attention_dropout=0.0, - image_hidden_dropout=0.0, - image_biattention_id=[0, 1], - text_biattention_id=[0, 1], - text_fixed_layer=0, - image_fixed_layer=0, - image_num_attention_heads=3, - combined_num_attention_heads=2, - ) - - transformer = AutoModel.from_pretrained(model_name) - - # compare embedding parameters - mapping = { - val: key - for key, val in model.backbone.text_embeddings._construct_default_mapping( - transformer.embeddings, "huggingface", {} - ).items() - } - assert_equal_parameters( - transformer.embeddings, model.backbone.text_embeddings, mapping=mapping - ) - - # compare encoder parameters - mapping = { - val: key - for key, val in model.backbone.encoder._construct_default_mapping( - transformer.encoder, "huggingface", {} - ).items() - } - - # We ignore the new parameters for the second modality, since they won't be present - # in the huggingface model. 
- assert_equal_parameters( - transformer.encoder, model.backbone.encoder, ignore_missing=True, mapping=mapping - ) diff --git a/tests/models/visual_entailment_test.py b/tests/models/visual_entailment_test.py deleted file mode 100644 index 0f7faff2540..00000000000 --- a/tests/models/visual_entailment_test.py +++ /dev/null @@ -1,77 +0,0 @@ -from transformers import AutoModel - -from allennlp.common.testing import ModelTestCase -from allennlp.data import Vocabulary -from allennlp.models.visual_entailment import VisualEntailmentModel -from allennlp.common.testing import assert_equal_parameters - - -class TestVEVilbert(ModelTestCase): - def test_model_can_train_save_and_load_small_model(self): - param_file = self.FIXTURES_ROOT / "vilbert_ve" / "experiment.jsonnet" - self.ensure_model_can_train_save_and_load(param_file) - - def test_model_can_train_save_and_load_with_cache(self): - import tempfile - - with tempfile.TemporaryDirectory(prefix=self.__class__.__name__) as d: - overrides = {"dataset_reader": {"feature_cache_dir": str(d)}} - import json - - overrides = json.dumps(overrides) - param_file = self.FIXTURES_ROOT / "vilbert_ve" / "experiment.jsonnet" - self.ensure_model_can_train_save_and_load(param_file, overrides=overrides) - - def test_model_can_train_save_and_load_from_huggingface(self): - param_file = self.FIXTURES_ROOT / "vilbert_ve" / "experiment_from_huggingface.jsonnet" - self.ensure_model_can_train_save_and_load(param_file) - - def test_model_loads_weights_correctly(self): - vocab = Vocabulary() - - model_name = "epwalsh/bert-xsmall-dummy" - model = VisualEntailmentModel.from_huggingface_model_name( - vocab=vocab, - model_name=model_name, - image_feature_dim=2048, - image_num_hidden_layers=1, - image_hidden_size=3, - image_num_attention_heads=1, - combined_num_attention_heads=1, - combined_hidden_size=5, - pooled_output_dim=7, - image_intermediate_size=11, - image_attention_dropout=0.0, - image_hidden_dropout=0.0, - image_biattention_id=[0, 1], - text_biattention_id=[0, 1], - text_fixed_layer=0, - image_fixed_layer=0, - ) - - transformer = AutoModel.from_pretrained(model_name) - - # compare embedding parameters - mapping = { - val: key - for key, val in model.backbone.text_embeddings._construct_default_mapping( - transformer.embeddings, "huggingface", {} - ).items() - } - assert_equal_parameters( - transformer.embeddings, model.backbone.text_embeddings, mapping=mapping - ) - - # compare encoder parameters - mapping = { - val: key - for key, val in model.backbone.encoder._construct_default_mapping( - transformer.encoder, "huggingface", {} - ).items() - } - - # We ignore the new parameters for the second modality, since they won't be present - # in the huggingface model. 
- assert_equal_parameters( - transformer.encoder, model.backbone.encoder, ignore_missing=True, mapping=mapping - ) diff --git a/tests/modules/vision/grid_embedder_test.py b/tests/modules/vision/grid_embedder_test.py index 9012fd5d4b9..9fddfb27727 100644 --- a/tests/modules/vision/grid_embedder_test.py +++ b/tests/modules/vision/grid_embedder_test.py @@ -10,14 +10,7 @@ def test_forward_runs(self): backbone = ResnetBackbone().to("cuda:0") image_pixels, image_size = loader( - [ - self.FIXTURES_ROOT - / "data" - / "vqav2" - / "images" - / "test_fixture" - / "COCO_train2014_000000458752.jpg" - ] + [self.FIXTURES_ROOT / "data" / "images" / "COCO_train2014_000000458752.jpg"] ) result = backbone(image_pixels, image_size) assert tuple(result.keys()) == backbone.get_feature_names() diff --git a/tests/modules/vision/region_detector_test.py b/tests/modules/vision/region_detector_test.py index da4746cb349..a8608fbddff 100644 --- a/tests/modules/vision/region_detector_test.py +++ b/tests/modules/vision/region_detector_test.py @@ -15,14 +15,7 @@ def test_forward_runs(self): detector = FasterRcnnRegionDetector().to(device="cuda:0") detector.eval() - image_path = ( - self.FIXTURES_ROOT - / "data" - / "vqav2" - / "images" - / "test_fixture" - / "COCO_train2014_000000458752.jpg" - ) + image_path = self.FIXTURES_ROOT / "data" / "images" / "COCO_train2014_000000458752.jpg" images, sizes = loader([image_path, image_path]) image_features = backbone(images, sizes) diff --git a/training_configs/vilbert_gqa_from_huggingface.jsonnet b/training_configs/vilbert_gqa_from_huggingface.jsonnet deleted file mode 100644 index f38a046e4d6..00000000000 --- a/training_configs/vilbert_gqa_from_huggingface.jsonnet +++ /dev/null @@ -1,91 +0,0 @@ -local model_name = "bert-base-uncased"; -local effective_batch_size = 128; -local gpu_batch_size = 32; -local num_gpus = 1; - -local construct_vocab = false; - -#local gqa_dir = "/Users/dirkg/Documents/data/vision/gqa/"; -local gqa_dir = "/mnt/tank/dirkg/data/vision/gqa/"; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": "https://storage.googleapis.com/allennlp-public-data/gqa/vilbert_gqa.vocab.tar.gz" - }; - -{ - "dataset_reader": { - "type": "gqa", - "image_dir": gqa_dir + "/images", - [if !construct_vocab then "feature_cache_dir"]: gqa_dir + "/feature_cache", - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, - "image_processing_batch_size": 16, - "answer_vocab": if construct_vocab then null else vocabulary, - }, - "validation_dataset_reader": self.dataset_reader { - "answer_vocab": null - }, - "vocabulary": vocabulary, - "train_data_path": "train_all", - "validation_data_path": "testdev_all", - "model": { - "type": "vqa_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 1024, - "image_hidden_size": 1024, - "image_num_attention_heads": 8, - "image_num_hidden_layers": 6, - "combined_hidden_size": 1024, - "combined_num_attention_heads": 8, - "pooled_output_dim": 1024, - "image_intermediate_size": 1024, - "image_attention_dropout": 0.1, - "image_hidden_dropout": 
0.1, - "image_biattention_id": [0, 1, 2, 3, 4, 5], - "text_biattention_id": [6, 7, 8, 9, 10, 11], - "text_fixed_layer": 0, - "image_fixed_layer": 0, - "fusion_method": "mul" - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - "max_instances_in_memory": 1024*16 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - #"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - "trainer": { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-5 - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - "warmup_steps": 5000, - "num_steps_per_epoch": std.ceil(14304359 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]) - }, - "validation_metric": "+fscore", - "patience": 5, - "num_epochs": 20, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus) - }, -} diff --git a/training_configs/vilbert_gqa_from_huggingface_balanced.jsonnet b/training_configs/vilbert_gqa_from_huggingface_balanced.jsonnet deleted file mode 100644 index 0cc6f7c92dd..00000000000 --- a/training_configs/vilbert_gqa_from_huggingface_balanced.jsonnet +++ /dev/null @@ -1,91 +0,0 @@ -local model_name = "bert-base-uncased"; -local effective_batch_size = 128; -local gpu_batch_size = 32; -local num_gpus = 1; - -local construct_vocab = false; - -#local gqa_dir = "/Users/dirkg/Documents/data/vision/gqa/"; -local gqa_dir = "/mnt/tank/dirkg/data/vision/gqa/"; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": "https://storage.googleapis.com/allennlp-public-data/gqa/vilbert_gqa.vocab.tar.gz" - }; - -{ - "dataset_reader": { - "type": "gqa", - "image_dir": gqa_dir + "/images", - [if !construct_vocab then "feature_cache_dir"]: gqa_dir + "/feature_cache", - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, - "image_processing_batch_size": 16, - "answer_vocab": if construct_vocab then null else vocabulary, - }, - "validation_dataset_reader": self.dataset_reader { - "answer_vocab": null - }, - "vocabulary": vocabulary, - "train_data_path": "train_balanced", - "validation_data_path": "testdev_balanced", - "model": { - "type": "vqa_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 1024, - "image_hidden_size": 1024, - "image_num_attention_heads": 8, - "image_num_hidden_layers": 6, - "combined_hidden_size": 1024, - "combined_num_attention_heads": 8, - "pooled_output_dim": 1024, - "image_intermediate_size": 1024, - "image_attention_dropout": 0.1, - "image_hidden_dropout": 0.1, - "image_biattention_id": [0, 1, 2, 3, 4, 5], - "text_biattention_id": [6, 7, 8, 9, 10, 11], - "text_fixed_layer": 0, - "image_fixed_layer": 0, - "fusion_method": "mul" - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - "max_instances_in_memory": 1024*16 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - #"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - "trainer": { - "optimizer": 
{ - "type": "huggingface_adamw", - "lr": 4e-5 - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - "warmup_steps": 5000, - "num_steps_per_epoch": std.ceil(942255 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]) - }, - "validation_metric": "+fscore", - "patience": 5, - "num_epochs": 20, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus) - }, -} diff --git a/training_configs/vilbert_multitask.jsonnet b/training_configs/vilbert_multitask.jsonnet deleted file mode 100644 index e930cb41a17..00000000000 --- a/training_configs/vilbert_multitask.jsonnet +++ /dev/null @@ -1,157 +0,0 @@ -local model_name = "bert-base-cased"; -local effective_batch_size = 128; -local gpu_batch_size = 128; -local num_gpus = 1; - -local construct_vocab = false; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": std.format( - "https://storage.googleapis.com/allennlp-public-data/vilbert/vilbert_multitask.%s.vocab.tar.gz", - model_name) - }; - -local reader_common = { - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, # DEBUG - "image_processing_batch_size": 32, -}; - -{ - "dataset_reader": { - "type": "multitask", - "readers": { - "vqa": reader_common { - "type": "vqav2", - "image_dir": "/mnt/tank/dirkg/data/vision/vqa/balanced_real", - [if !construct_vocab then "feature_cache_dir"]: "/mnt/tank/dirkg/data/vision/vqa/balanced_real/feature_cache", - #"image_dir": "/Users/dirkg/Documents/data/vision/vqa/balanced_real", - #[if !construct_vocab then "feature_cache_dir"]: "/Users/dirkg/Documents/data/vision/vqa/balanced_real/feature_cache", - "answer_vocab": if construct_vocab then null else vocabulary, - "multiple_answers_per_question": !construct_vocab - }, - "gqa": reader_common { - "type": "gqa", - "image_dir": "/mnt/tank/dirkg/data/vision/gqa", - [if !construct_vocab then "feature_cache_dir"]: "/mnt/tank/dirkg/data/vision/gqa/feature_cache", - #"image_dir": "/Users/dirkg/Documents/data/vision/gqa", - #[if !construct_vocab then "feature_cache_dir"]: "/Users/dirkg/Documents/data/vision/gqa/feature_cache", - "answer_vocab": if construct_vocab then null else vocabulary - }, - "ve": reader_common { - "type": "visual-entailment", - "image_dir": "/mnt/tank/dirkg/data/vision/SNLI-VE/data/Flickr30K/flickr30k_images", - [if !construct_vocab then "feature_cache_dir"]: "/mnt/tank/dirkg/data/vision/SNLI-VE/data/feature_cache", - #"image_dir": "/Users/dirkg/Documents/data/vision/SNLI-VE/data/Flickr30K/flickr30k_images", - #[if !construct_vocab then "feature_cache_dir"]: "/Users/dirkg/Documents/data/vision/SNLI-VE/data/feature_cache", - } - } - }, - "validation_dataset_reader": self.dataset_reader { - "readers": super.readers { - "vqa": super.vqa { - "answer_vocab": null // make sure we don't skip unanswerable questions during validation - } - } - }, - "vocabulary": vocabulary, - "train_data_path": { - "vqa": ["balanced_real_train", "balanced_real_val[1000:]"], - "gqa": "train_balanced", - "ve": "train", - }, - "validation_data_path": { - 
"vqa": "balanced_real_val[:1000]", - "gqa": "val_balanced", - "ve": "dev", - }, - "model": { - "type": "multitask", - "arg_name_mapping": { - "backbone": {"question": "text", "hypothesis": "text"} - }, - "backbone": { - "type": "vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 1024, - "image_num_hidden_layers": 6, - "image_hidden_size": 1024, - "image_num_attention_heads": 8, - "image_intermediate_size": 1024, - "image_attention_dropout": 0.1, - "image_hidden_dropout": 0.1, - "image_biattention_id": [0, 1, 2, 3, 4, 5], - "text_biattention_id": [6, 7, 8, 9, 10, 11], - "text_fixed_layer": 0, - "image_fixed_layer": 0, - "combined_hidden_size": 1024, - "combined_num_attention_heads": 8, - "pooled_output_dim": 1024, - "fusion_method": "mul" - }, - "heads": { - "vqa": { - "type": "vqa", - "embedding_dim": 1024 - }, - "gqa": { - "type": "vqa", - "embedding_dim": 1024 - }, - "ve": { - "type": "visual_entailment", - "embedding_dim": 1024 - } - } - }, - "data_loader": { - "type": "multitask", - "scheduler": { - "batch_size": gpu_batch_size, - }, - "shuffle": true, - //[if !construct_vocab then "max_instances_in_memory"]: 1024*16 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - //"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - // Don't train if we're just constructing vocab. The results would be confusing. - [if !construct_vocab then "trainer"]: { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-5, - "correct_bias": true, - "weight_decay": 0.01, - "parameter_groups": [[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}]], - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - "warmup_steps": 10000, - }, - "validation_metric": ["+gqa_vqa", "+vqa_vqa", "+ve_acc"], - "patience": 5, - "num_epochs": 30, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus), - }, - "random_seed": 876170670, - "numpy_seed": 876170670, - "pytorch_seed": 876170670, -} diff --git a/training_configs/vilbert_ve_from_huggingface.jsonnet b/training_configs/vilbert_ve_from_huggingface.jsonnet deleted file mode 100644 index b0b0ed84b17..00000000000 --- a/training_configs/vilbert_ve_from_huggingface.jsonnet +++ /dev/null @@ -1,74 +0,0 @@ -local model_name = "bert-large-uncased"; -local effective_batch_size = 128; -local gpu_batch_size = 32; -local num_gpus = 0; - -local datadir = "/net/s3/allennlp/akshitab/data/SNLI-VE/data/"; - -{ - "dataset_reader": { - "type": "visual-entailment", - "image_dir": datadir + "Flickr30K/flickr30k_images", - "feature_cache_dir": datadir + "/feature_cache_torchvision", - "image_loader": "torch", - "image_featurizer": "resnet_backbone", - "region_detector": "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - "image_processing_batch_size": 16, - }, - "train_data_path": "https://storage.googleapis.com/allennlp-public-data/snli-ve/snli_ve_train.jsonl.gz", - "validation_data_path": "https://storage.googleapis.com/allennlp-public-data/snli-ve/snli_ve_dev.jsonl.gz", - "test_data_path": "https://storage.googleapis.com/allennlp-public-data/snli-ve/snli_ve_test.jsonl.gz", - "model": { - "type": "ve_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 1024, - "image_hidden_size": 1024, - "image_num_attention_heads": 8, - 
"image_num_hidden_layers": 6, - "combined_hidden_size": 1024, - "combined_num_attention_heads": 8, - "pooled_output_dim": 1024, - "image_intermediate_size": 1024, - "image_attention_dropout": 0.1, - "image_hidden_dropout": 0.1, - "image_biattention_id": [0, 1, 2, 3, 4, 5], - "text_biattention_id": [6, 7, 8, 9, 10, 11], - "text_fixed_layer": 0, - "image_fixed_layer": 0, - "fusion_method": "mul" - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - "max_instances_in_memory": 1024 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - #"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - "trainer": { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-5, - "weight_decay": 0.01 - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - "num_steps_per_epoch": std.ceil(529527 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]), - "warmup_steps": std.ceil(self.num_steps_per_epoch / 2), - }, - "validation_metric": "+accuracy", - "num_epochs": 20, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus) - }, -} diff --git a/training_configs/vilbert_vqa.jsonnet b/training_configs/vilbert_vqa.jsonnet deleted file mode 100644 index 82b143fd2ff..00000000000 --- a/training_configs/vilbert_vqa.jsonnet +++ /dev/null @@ -1,122 +0,0 @@ -local model_name = "bert-base-uncased"; -local vocab_size = 30522; // for bert-*-uncased models -//local vocab_size = 28996; // for bert-*-cased models -local effective_batch_size = 128; -local gpu_batch_size = 128; -local num_gpus = 1; - -local construct_vocab = false; -local dataset = "balanced_real"; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": std.format( - "https://storage.googleapis.com/allennlp-public-data/vqav2/vilbert_vqa_%s.%s.vocab.tar.gz", - [dataset, model_name]) - }; - -{ - "dataset_reader": { - "type": "vqav2", - "image_dir": std.format("/mnt/tank/dirkg/data/vision/vqa/%s", dataset), - [if !construct_vocab then "feature_cache_dir"]: std.format("/mnt/tank/dirkg/data/vision/vqa/%s/feature_cache", dataset), - #"image_dir": std.format("/Users/dirkg/Documents/data/vision/vqa/%s", dataset), - #[if !construct_vocab then "feature_cache_dir"]: std.format("/Users/dirkg/Documents/data/vision/vqa/%s/feature_cache", dataset), - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, - "image_processing_batch_size": 32, - "answer_vocab": if construct_vocab then null else vocabulary, - "multiple_answers_per_question": !construct_vocab - }, - "validation_dataset_reader": self.dataset_reader { - "answer_vocab": null // make sure we don't skip unanswerable questions during validation - }, - "vocabulary": vocabulary, - "train_data_path": [std.format("%s_train", dataset), std.format("%s_val[1000:]", dataset)], - "validation_data_path": std.format("%s_val[:1000]", dataset), - "model": { - "type": "vqa_vilbert", - "text_embeddings": { - "vocab_size": vocab_size, - "embedding_size": 768, - "pad_token_id": 0, - 
"max_position_embeddings": 512, - "type_vocab_size": 2, - "dropout": 0.1 - }, - "image_embeddings": { - "feature_size": 1024, - "embedding_size": 1024 - }, - "encoder": { - # text - "hidden_size1": 768, - "num_hidden_layers1": 12, - "intermediate_size1": 3072, - "num_attention_heads1": 12, - "attention_dropout1": 0.1, - "hidden_dropout1": 0.1, - "biattention_id1": [6, 7, 8, 9, 10, 11], - "fixed_layer1": 0, - - # vision - "hidden_size2": 1024, - "num_hidden_layers2": 6, - "intermediate_size2": 1024, - "num_attention_heads2": 8, - "attention_dropout2": 0.1, - "hidden_dropout2": 0.1, - "biattention_id2": [0, 1, 2, 3, 4, 5], - "fixed_layer2": 0, - - "combined_num_attention_heads": 8, - "combined_hidden_size": 1024, - "activation": "gelu", - }, - "pooled_output_dim": 1024, - "fusion_method": "mul" - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - //[if !construct_vocab then "max_instances_in_memory"]: 1024 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - //"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - // Don't train if we're just constructing vocab. The results would be confusing. - [if !construct_vocab then "trainer"]: { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-4 - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - //"num_steps_per_epoch": std.ceil(0 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]), - "warmup_steps": 5000, - }, - "validation_metric": "+vqa_score", - "num_epochs": 50, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus) - }, - "random_seed": 876170670, - "numpy_seed": 876170670, - "pytorch_seed": 876170670, -} diff --git a/training_configs/vilbert_vqa_bert_large.jsonnet b/training_configs/vilbert_vqa_bert_large.jsonnet deleted file mode 100644 index 3e7e2bbbaca..00000000000 --- a/training_configs/vilbert_vqa_bert_large.jsonnet +++ /dev/null @@ -1,122 +0,0 @@ -local model_name = "bert-large-uncased"; -local vocab_size = 30522; // for bert-*-uncased models -//local vocab_size = 28996; // for bert-*-cased models -local effective_batch_size = 128; -local gpu_batch_size = 32; -local num_gpus = 1; - -local construct_vocab = false; -local dataset = "balanced_real"; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": std.format( - "https://storage.googleapis.com/allennlp-public-data/vqav2/vilbert_vqa_%s.%s.vocab.tar.gz", - [dataset, model_name]) - }; - -{ - "dataset_reader": { - "type": "vqav2", - "image_dir": std.format("/mnt/tank/dirkg/data/vision/vqa/%s", dataset), - [if !construct_vocab then "feature_cache_dir"]: std.format("/mnt/tank/dirkg/data/vision/vqa/%s/feature_cache", dataset), - #"image_dir": std.format("/Users/dirkg/Documents/data/vision/vqa/%s", dataset), - #[if !construct_vocab then "feature_cache_dir"]: std.format("/Users/dirkg/Documents/data/vision/vqa/%s/feature_cache", dataset), - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, - 
"image_processing_batch_size": 32, - "answer_vocab": if construct_vocab then null else vocabulary, - "multiple_answers_per_question": !construct_vocab - }, - "validation_dataset_reader": self.dataset_reader { - "answer_vocab": null // make sure we don't skip unanswerable questions during validation - }, - "vocabulary": vocabulary, - "train_data_path": [std.format("%s_train", dataset), std.format("%s_val[1000:]", dataset)], - "validation_data_path": std.format("%s_val[:1000]", dataset), - "model": { - "type": "vqa_vilbert", - "text_embeddings": { - "vocab_size": vocab_size, - "hidden_size": 1024, - "pad_token_id": 0, - "max_position_embeddings": 512, - "type_vocab_size": 2, - "dropout": 0.1 - }, - "image_embeddings": { - "feature_dim": 1024, - "hidden_dim": 1024 - }, - "encoder": { - # text - "hidden_size1": 1024, - "num_hidden_layers1": 24, - "intermediate_size1": 4096, - "num_attention_heads1": 16, - "attention_dropout1": 0.1, - "hidden_dropout1": 0.1, - "biattention_id1": [18, 19, 20, 21, 22, 23], - "fixed_layer1": 0, - - # vision - "hidden_size2": 1024, - "num_hidden_layers2": 6, - "intermediate_size2": 1024, - "num_attention_heads2": 8, - "attention_dropout2": 0.1, - "hidden_dropout2": 0.1, - "biattention_id2": [0, 1, 2, 3, 4, 5], - "fixed_layer2": 0, - - "combined_num_attention_heads": 8, - "combined_hidden_size": 1024, - "activation": "gelu", - }, - "pooled_output_dim": 1024, - "fusion_method": "mul" - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - //[if !construct_vocab then "max_instances_in_memory"]: 1024 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - //"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - // Don't train if we're just constructing vocab. The results would be confusing. 
- [if !construct_vocab then "trainer"]: { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-4 - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - //"num_steps_per_epoch": std.ceil(0 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]), - "warmup_steps": 5000, - }, - "validation_metric": "+vqa_score", - "num_epochs": 50, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus) - }, - "random_seed": 876170670, - "numpy_seed": 876170670, - "pytorch_seed": 876170670, -} diff --git a/training_configs/vilbert_vqa_pretrained.jsonnet b/training_configs/vilbert_vqa_pretrained.jsonnet deleted file mode 100644 index 22741b72f84..00000000000 --- a/training_configs/vilbert_vqa_pretrained.jsonnet +++ /dev/null @@ -1,113 +0,0 @@ -local model_name = "bert-base-uncased"; -local vocab_size = 30522; // for bert-*-uncased models -//local vocab_size = 28996; // for bert-*-cased models -local effective_batch_size = 128; -local gpu_batch_size = 128; -local num_gpus = 1; - -local construct_vocab = false; -local dataset = "balanced_real"; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": std.format( - "https://storage.googleapis.com/allennlp-public-data/vqav2/vilbert_vqa_%s.%s.vocab.tar.gz", - [dataset, model_name]) - }; - -{ - "dataset_reader": { - "type": "vqav2", - "image_dir": std.format("/mnt/tank/dirkg/data/vision/vqa/%s", dataset), - [if !construct_vocab then "feature_cache_dir"]: std.format("/mnt/tank/dirkg/data/vision/vqa/%s/feature_cache", dataset), - #"image_dir": std.format("/Users/dirkg/Documents/data/vision/vqa/%s", dataset), - #[if !construct_vocab then "feature_cache_dir"]: std.format("/Users/dirkg/Documents/data/vision/vqa/%s/feature_cache", dataset), - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, - "image_processing_batch_size": 16, - "answer_vocab": if construct_vocab then null else vocabulary, - "multiple_answers_per_question": !construct_vocab, - }, - "validation_dataset_reader": self.dataset_reader { - "answer_vocab": null // make sure we don't skip unanswerable questions during validation - }, - "vocabulary": vocabulary, - "train_data_path": [std.format("%s_train", dataset), std.format("%s_val[1000:]", dataset)], - "validation_data_path": std.format("%s_val[:1000]", dataset), - "model": { - "type": "vqa_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 1024, - "image_hidden_size": 1024, - "image_num_attention_heads": 8, - "image_num_hidden_layers": 6, - "combined_hidden_size": 1024, - "combined_num_attention_heads": 8, - "pooled_output_dim": 1024, - "image_intermediate_size": 1024, - "image_attention_dropout": 0.1, - "image_hidden_dropout": 0.1, - "image_biattention_id": [0, 1, 2, 3, 4, 5], - "text_biattention_id": [6, 7, 8, 9, 10, 11], - "text_fixed_layer": 0, - "image_fixed_layer": 0, - "fusion_method": "mul", - "ignore_text": false, # debug setting - "ignore_image": false, # debug setting - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - 
//[if !construct_vocab then "max_instances_in_memory"]: 10240 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - #"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - // Don't train if we're just constructing vocab. The results would be confusing. - [if !construct_vocab then "trainer"]: { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-5, - "correct_bias": true, - "weight_decay": 0.01, - "parameter_groups": [ - // [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], // can't use both at the same time - // smaller learning rate for the pretrained weights - [["^embeddings\\.", "^encoder.layers1\\.", "^t_pooler\\."], {"lr": 4e-6}] - ], - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - //"num_steps_per_epoch": std.ceil(0 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]), - "warmup_steps": 5000 - }, - "validation_metric": "+vqa_score", - "patience": 5, - "num_epochs": 40, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus), - "tensorboard_writer": { - "summary_interval": 10, - "should_log_learning_rate": true - }, - }, - "random_seed": 876170670, - "numpy_seed": 876170670, - "pytorch_seed": 876170670, -} diff --git a/training_configs/vilbert_vqa_pretrained_bert_large.jsonnet b/training_configs/vilbert_vqa_pretrained_bert_large.jsonnet deleted file mode 100644 index 5ea0b203cfe..00000000000 --- a/training_configs/vilbert_vqa_pretrained_bert_large.jsonnet +++ /dev/null @@ -1,113 +0,0 @@ -local model_name = "bert-large-uncased"; -local vocab_size = 30522; // for bert-*-uncased models -//local vocab_size = 28996; // for bert-*-cased models -local effective_batch_size = 128; -local gpu_batch_size = 32; -local num_gpus = 1; - -local construct_vocab = false; -local dataset = "balanced_real"; - -local vocabulary = if construct_vocab then { - // read the files to construct the vocab - "min_count": {"answers": 9} - } else { - // read the constructed vocab - "type": "from_files", - "directory": std.format( - "https://storage.googleapis.com/allennlp-public-data/vqav2/vilbert_vqa_%s.%s.vocab.tar.gz", - [dataset, model_name]) - }; - -{ - "dataset_reader": { - "type": "vqav2", - "image_dir": std.format("/mnt/tank/dirkg/data/vision/vqa/%s", dataset), - [if !construct_vocab then "feature_cache_dir"]: std.format("/mnt/tank/dirkg/data/vision/vqa/%s/feature_cache", dataset), - #"image_dir": std.format("/Users/dirkg/Documents/data/vision/vqa/%s", dataset), - #[if !construct_vocab then "feature_cache_dir"]: std.format("/Users/dirkg/Documents/data/vision/vqa/%s/feature_cache", dataset), - [if !construct_vocab then "image_loader"]: "torch", - [if !construct_vocab then "image_featurizer"]: "resnet_backbone", - [if !construct_vocab then "region_detector"]: "faster_rcnn", - "tokenizer": { - "type": "pretrained_transformer", - "model_name": model_name - }, - "token_indexers": { - "tokens": { - "type": "pretrained_transformer", - "model_name": model_name - } - }, - #"max_instances": 1000, - "image_processing_batch_size": 16, - "answer_vocab": if construct_vocab then null else vocabulary, - "multiple_answers_per_question": !construct_vocab - }, - "validation_dataset_reader": self.dataset_reader { - "answer_vocab": null // make sure we don't skip unanswerable questions during validation - }, - "vocabulary": vocabulary, - "train_data_path": [std.format("%s_train", dataset), std.format("%s_val[1000:]", dataset)], - 
"validation_data_path": std.format("%s_val[:1000]", dataset), - "model": { - "type": "vqa_vilbert_from_huggingface", - "model_name": model_name, - "image_feature_dim": 1024, - "image_hidden_size": 1024, - "image_num_attention_heads": 8, - "image_num_hidden_layers": 6, - "combined_hidden_size": 1024, - "combined_num_attention_heads": 8, - "pooled_output_dim": 1024, - "image_intermediate_size": 1024, - "image_attention_dropout": 0.1, - "image_hidden_dropout": 0.1, - "image_biattention_id": [0, 1, 2, 3, 4, 5], - "text_biattention_id": [18, 19, 20, 21, 22, 23], - "text_fixed_layer": 0, - "image_fixed_layer": 0, - "fusion_method": "mul", - "ignore_text": false, # debug setting - "ignore_image": false, # debug setting - }, - "data_loader": { - "batch_size": gpu_batch_size, - "shuffle": true, - //[if !construct_vocab then "max_instances_in_memory"]: 1024 - }, - [if num_gpus > 1 then "distributed"]: { - "cuda_devices": std.range(0, num_gpus - 1) - #"cuda_devices": std.repeat([-1], num_gpus) # Use this for debugging on CPU - }, - // Don't train if we're just constructing vocab. The results would be confusing. - [if !construct_vocab then "trainer"]: { - "optimizer": { - "type": "huggingface_adamw", - "lr": 4e-4, - "correct_bias": true, - "weight_decay": 0.01, - "parameter_groups": [ - // [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}], // can't use both at the same time - // smaller learning rate for the pretrained weights - [["^embeddings\\.", "^encoder.layers1\\.", "^t_pooler\\."], {"lr": 4e-5}] - ], - }, - "learning_rate_scheduler": { - "type": "linear_with_warmup", - //"num_steps_per_epoch": std.ceil(0 / $["data_loader"]["batch_size"] / $["trainer"]["num_gradient_accumulation_steps"]), - "warmup_steps": 5000 - }, - "validation_metric": "+vqa_score", - "patience": 5, - "num_epochs": 30, - "num_gradient_accumulation_steps": effective_batch_size / gpu_batch_size / std.max(1, num_gpus), - "tensorboard_writer": { - "summary_interval": 10, - "should_log_learning_rate": true - }, - }, - "random_seed": 42, - "numpy_seed": 42, - "pytorch_seed": 42, -}