diff --git a/Makefile b/Makefile index c3aa8c0d23f..5b85460f12f 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ MD_DOCS_ROOT = docs/ MD_DOCS_API_ROOT = $(MD_DOCS_ROOT)api/ MD_DOCS_SRC = $(filter-out $(SRC)/__main__.py %/__init__.py $(SRC)/version.py,$(shell find $(SRC) -type f -name '*.py' | grep -v -E 'tests/')) MD_DOCS = $(subst .py,.md,$(subst $(SRC)/,$(MD_DOCS_API_ROOT),$(MD_DOCS_SRC))) -MD_DOCS_CMD = python scripts/py2md.py +MD_DOCS_CMD = python allennlp/tools/py2md.py MD_DOCS_CONF = mkdocs.yml MD_DOCS_CONF_SRC = mkdocs-skeleton.yml MD_DOCS_TGT = site/ @@ -118,7 +118,7 @@ $(MD_DOCS_ROOT)%.md : %.md $(MD_DOCS_CONF) : $(MD_DOCS_CONF_SRC) $(MD_DOCS) python scripts/build_docs_config.py $@ $(MD_DOCS_CONF_SRC) $(MD_DOCS_ROOT) $(MD_DOCS_API_ROOT) -$(MD_DOCS_API_ROOT)%.md : $(SRC)/%.py scripts/py2md.py +$(MD_DOCS_API_ROOT)%.md : $(SRC)/%.py allennlp/tools/py2md.py mkdir -p $(shell dirname $@) $(MD_DOCS_CMD) $(subst /,.,$(subst .py,,$<)) --out $@ diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index c580c823f8d..087e99c0e3c 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -2,54 +2,8 @@ The `evaluate` subcommand can be used to evaluate a trained model against a dataset and report any metrics calculated by the model. - - $ allennlp evaluate --help - usage: allennlp evaluate [-h] [--output-file OUTPUT_FILE] - [--weights-file WEIGHTS_FILE] - [--cuda-device CUDA_DEVICE] [-o OVERRIDES] - [--batch-size BATCH_SIZE] - [--batch-weight-key BATCH_WEIGHT_KEY] - [--extend-vocab] - [--embedding-sources-mapping EMBEDDING_SOURCES_MAPPING] - [--include-package INCLUDE_PACKAGE] - archive_file input_file - - Evaluate the specified model + dataset - - positional arguments: - archive_file path to an archived trained model - input_file path to the file containing the evaluation data - - optional arguments: - -h, --help show this help message and exit - --output-file OUTPUT_FILE - path to output file - --weights-file WEIGHTS_FILE - a path that overrides which weights file to use - --cuda-device CUDA_DEVICE - id of GPU to use (if any) - -o OVERRIDES, --overrides OVERRIDES - a JSON structure used to override the experiment - configuration - --batch-size BATCH_SIZE - If non-empty, the batch size to use during evaluation. - --batch-weight-key BATCH_WEIGHT_KEY - If non-empty, name of metric used to weight the loss - on a per-batch basis. - --extend-vocab if specified, we will use the instances in your new - dataset to extend your vocabulary. If pretrained-file - was used to initialize embedding layers, you may also - need to pass --embedding-sources-mapping. - --embedding-sources-mapping EMBEDDING_SOURCES_MAPPING - a JSON dict defining mapping from embedding module - path to embedding pretrained-file used during - training. If not passed, and embedding needs to be - extended, we will try to use the original file paths - used during training. If they are not available we - will use random vectors for embedding extension. - --include-package INCLUDE_PACKAGE - additional packages to include """ + import argparse import json import logging diff --git a/allennlp/commands/find_learning_rate.py b/allennlp/commands/find_learning_rate.py index 6a086332b32..0f86e37caf7 100644 --- a/allennlp/commands/find_learning_rate.py +++ b/allennlp/commands/find_learning_rate.py @@ -2,43 +2,6 @@ The `find-lr` subcommand can be used to find a good learning rate for a model. It requires a configuration file and a directory in which to write the results. 
- - $ allennlp find-lr --help - usage: allennlp find-lr [-h] -s SERIALIZATION_DIR [-o OVERRIDES] - [--start-lr START_LR] [--end-lr END_LR] - [--num-batches NUM_BATCHES] - [--stopping-factor STOPPING_FACTOR] [--linear] [-f] - [--include-package INCLUDE_PACKAGE] - param_path - - Find a learning rate range where loss decreases quickly for the specified - model and dataset. - - positional arguments: - param_path path to parameter file describing the model to be - trained - - optional arguments: - -h, --help show this help message and exit - -s SERIALIZATION_DIR, --serialization-dir SERIALIZATION_DIR - The directory in which to save results. - -o OVERRIDES, --overrides OVERRIDES - a JSON structure used to override the experiment - configuration - --start-lr START_LR learning rate to start the search (default = 1e-05) - --end-lr END_LR learning rate up to which search is done (default = - 10) - --num-batches NUM_BATCHES - number of mini-batches to run learning rate finder - (default = 100) - --stopping-factor STOPPING_FACTOR - stop the search when the current loss exceeds the best - loss recorded by multiple of stopping factor - --linear increase learning rate linearly instead of exponential - increase - -f, --force overwrite the output directory if it exists - --include-package INCLUDE_PACKAGE - additional packages to include """ import argparse @@ -161,7 +124,7 @@ def find_learning_rate_model( # Parameters - params : [`Params`](../common/params.md#params) + params : `Params` A parameter object specifying an AllenNLP Experiment. serialization_dir : `str` The directory in which to save results. @@ -266,7 +229,7 @@ def search_learning_rate( # Parameters - trainer: [`GradientDescentTrainer`](../training/trainer.md#gradientdescenttrainer) + trainer: `GradientDescentTrainer` start_lr : `float` The learning rate to start the search. end_lr : `float` diff --git a/allennlp/commands/predict.py b/allennlp/commands/predict.py index 084b0797ff3..1dde655de40 100644 --- a/allennlp/commands/predict.py +++ b/allennlp/commands/predict.py @@ -2,51 +2,8 @@ The `predict` subcommand allows you to make bulk JSON-to-JSON or dataset to JSON predictions using a trained model and its [`Predictor`](../predictors/predictor.md#predictor) wrapper. - - $ allennlp predict --help - usage: allennlp predict [-h] [--output-file OUTPUT_FILE] - [--weights-file WEIGHTS_FILE] - [--batch-size BATCH_SIZE] [--silent] - [--cuda-device CUDA_DEVICE] [--use-dataset-reader] - [--dataset-reader-choice {train,validation}] - [-o OVERRIDES] [--predictor PREDICTOR] - [--include-package INCLUDE_PACKAGE] - archive_file input_file - - Run the specified model against a JSON-lines input file. - - positional arguments: - archive_file the archived model to make predictions with - input_file path to or url of the input file - - optional arguments: - -h, --help show this help message and exit - --output-file OUTPUT_FILE - path to output file - --weights-file WEIGHTS_FILE - a path that overrides which weights file to use - --batch-size BATCH_SIZE - The batch size to use for processing - --silent do not print output to stdout - --cuda-device CUDA_DEVICE - id of GPU to use (if any) - --use-dataset-reader Whether to use the dataset reader of the original - model to load Instances. The validation dataset reader - will be used if it exists, otherwise it will fall back - to the train dataset reader. This behavior can be - overridden with the --dataset-reader-choice flag. 
- --dataset-reader-choice {train,validation} - Indicates which model dataset reader to use if the - --use-dataset-reader flag is set. (default = - validation) - -o OVERRIDES, --overrides OVERRIDES - a JSON structure used to override the experiment - configuration - --predictor PREDICTOR - optionally specify a specific predictor to use - --include-package INCLUDE_PACKAGE - additional packages to include """ + from typing import List, Iterator, Optional import argparse import sys diff --git a/allennlp/commands/print_results.py b/allennlp/commands/print_results.py index 92d409affa7..25d960e638d 100644 --- a/allennlp/commands/print_results.py +++ b/allennlp/commands/print_results.py @@ -1,29 +1,8 @@ """ The `print-results` subcommand allows you to print results from multiple allennlp serialization directories to the console in a helpful csv format. - - $ allennlp print-results --help - usage: allennlp print-results [-h] [-k KEYS [KEYS ...]] [-m METRICS_FILENAME] - [--include-package INCLUDE_PACKAGE] - path - - Print results from allennlp training runs in a helpful CSV format. - - positional arguments: - path Path to recursively search for allennlp serialization - directories. - - optional arguments: - -h, --help show this help message and exit - -k KEYS [KEYS ...], --keys KEYS [KEYS ...] - Keys to print from metrics.json.Keys not present in - all metrics.json will result in "N/A" - -m METRICS_FILENAME, --metrics-filename METRICS_FILENAME - Name of the metrics file to inspect. (default = - metrics.json) - --include-package INCLUDE_PACKAGE - additional packages to include """ + import argparse import json import logging diff --git a/allennlp/commands/subcommand.py b/allennlp/commands/subcommand.py index b37ee6b9049..e7419dbb232 100644 --- a/allennlp/commands/subcommand.py +++ b/allennlp/commands/subcommand.py @@ -1,6 +1,7 @@ """ Base class for subcommands under `allennlp.run`. """ + import argparse from typing import Callable, Dict, Optional, Type, TypeVar diff --git a/allennlp/commands/test_install.py b/allennlp/commands/test_install.py index c15f8e086da..0dac3a905c0 100644 --- a/allennlp/commands/test_install.py +++ b/allennlp/commands/test_install.py @@ -1,20 +1,6 @@ """ The `test-install` subcommand verifies an installation by running the unit tests. - - $ allennlp test-install --help - usage: allennlp test-install [-h] [--run-all] [-k K] - [--include-package INCLUDE_PACKAGE] - - Test that installation works by running the unit tests. - - optional arguments: - -h, --help show this help message and exit - --run-all By default, we skip tests that are slow or download - large files. This flag will run all tests. - -k K Limit tests by setting pytest -k argument - --include-package INCLUDE_PACKAGE - additional packages to include """ import argparse diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 5fb597858d0..e22f7a294a7 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -2,38 +2,6 @@ The `train` subcommand can be used to train a model. It requires a configuration file and a directory in which to write the results. - - $ allennlp train --help - usage: allennlp train [-h] -s SERIALIZATION_DIR [-r] [-f] [-o OVERRIDES] - [--file-friendly-logging] [--node-rank NODE_RANK] - [--dry-run] [--include-package INCLUDE_PACKAGE] - param_path - - Train the specified model on the specified dataset. 
- - positional arguments: - param_path path to parameter file describing the model to be - trained - - optional arguments: - -h, --help show this help message and exit - -s SERIALIZATION_DIR, --serialization-dir SERIALIZATION_DIR - directory in which to save the model and its logs - -r, --recover recover training from the state in serialization_dir - -f, --force overwrite the output directory if it exists - -o OVERRIDES, --overrides OVERRIDES - a JSON structure used to override the experiment - configuration - --file-friendly-logging - outputs tqdm status on separate lines and slows tqdm - refresh rate - --node-rank NODE_RANK - rank of this node in the distributed setup (default = - 0) - --dry-run do not train a model, but create a vocabulary, show - dataset statistics and other training information - --include-package INCLUDE_PACKAGE - additional packages to include """ import argparse @@ -168,20 +136,20 @@ def train_model_from_file( [`train_model`](#train_model). overrides : `str` A JSON string that we will use to override values in the input parameter file. - file_friendly_logging : `bool`, optional (default=False) + file_friendly_logging : `bool`, optional (default=`False`) If `True`, we make our output more friendly to saved model files. We just pass this along to [`train_model`](#train_model). - recover : `bool`, optional (default=False) + recover : `bool`, optional (default=`False`) If `True`, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see `Model.from_archive`. - force : `bool`, optional (default=False) + force : `bool`, optional (default=`False`) If `True`, we will overwrite the serialization directory if it already exists. node_rank : `int`, optional Rank of the current node in distributed training include_package : `str`, optional In distributed mode, extra packages mentioned will be imported in trainer workers. - dry_run : `bool`, optional (default=False) + dry_run : `bool`, optional (default=`False`) Do not train a model, but create a vocabulary, show dataset statistics and other training information. @@ -225,22 +193,22 @@ def train_model( A parameter object specifying an AllenNLP Experiment. serialization_dir : `str` The directory in which to save results and logs. - file_friendly_logging : `bool`, optional (default=False) + file_friendly_logging : `bool`, optional (default=`False`) If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. - recover : `bool`, optional (default=False) + recover : `bool`, optional (default=`False`) If `True`, we will try to recover a training run from an existing serialization directory. This is only intended for use when something actually crashed during the middle of a run. For continuing training a model on new data, see `Model.from_archive`. - force : `bool`, optional (default=False) + force : `bool`, optional (default=`False`) If `True`, we will overwrite the serialization directory if it already exists. node_rank : `int`, optional Rank of the current node in distributed training include_package : `List[str]`, optional In distributed mode, extra packages mentioned will be imported in trainer workers. - batch_weight_key : `str`, optional (default="") + batch_weight_key : `str`, optional (default=`""`) If non-empty, name of metric used to weight the loss on a per-batch basis. 
- dry_run : `bool`, optional (default=False) + dry_run : `bool`, optional (default=`False`) Do not train a model, but create a vocabulary, show dataset statistics and other training information. @@ -366,23 +334,23 @@ def _train_worker( A parameter object specifying an AllenNLP Experiment. serialization_dir : `str` The directory in which to save results and logs. - file_friendly_logging : `bool`, optional (default=False) + file_friendly_logging : `bool`, optional (default=`False`) If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow down tqdm's output to only once every 10 seconds. include_package : `List[str]`, optional In distributed mode, since this function would have been spawned as a separate process, the extra imports need to be done again. NOTE: This does not have any effect in single GPU training. - batch_weight_key : `str`, optional (default="") + batch_weight_key : `str`, optional (default=`""`) If non-empty, name of metric used to weight the loss on a per-batch basis. - dry_run : `bool`, optional (default=False) + dry_run : `bool`, optional (default=`False`) Do not train a model, but create a vocabulary, show dataset statistics and other training information. node_rank : `int`, optional Rank of the node. - master_addr : `str`, optional (default="127.0.0.1") + master_addr : `str`, optional (default=`"127.0.0.1"`) Address of the master node for distributed training. - master_port : `str`, optional (default="29500") + master_port : `str`, optional (default=`"29500"`) Port of the master node for distributed training. world_size : `int`, optional The number of processes involved in distributed training. @@ -604,26 +572,26 @@ def from_partial_objects( trainer: `Lazy[Trainer]` The `Trainer` that actually implements the training loop. This is a lazy object because it depends on the model that's going to be trained. - vocabulary: `Lazy[Vocabulary]`, optional (default=None) + vocabulary: `Lazy[Vocabulary]`, optional (default=`None`) The `Vocabulary` that we will use to convert strings in the data to integer ids (and possibly set sizes of embedding matrices in the `Model`). By default we construct the vocabulary from the instances that we read. - datasets_for_vocab_creation: `List[str]`, optional (default=None) + datasets_for_vocab_creation: `List[str]`, optional (default=`None`) If you pass in more than one dataset but don't want to use all of them to construct a vocabulary, you can pass in this key to limit it. Valid entries in the list are "train", "validation" and "test". - validation_dataset_reader: `DatasetReader`, optional (default=None) + validation_dataset_reader: `DatasetReader`, optional (default=`None`) If given, we will use this dataset reader for the validation data instead of `dataset_reader`. - validation_data_path: `str`, optional (default=None) + validation_data_path: `str`, optional (default=`None`) If given, we will use this data for computing validation metrics and early stopping. - validation_data_loader: `Lazy[DataLoader]`, optional (default=None) + validation_data_loader: `Lazy[DataLoader]`, optional (default=`None`) If given, the data_loader we use to batch instances from the dataset reader at validation and test time. This is lazy because it takes a dataset in it's constructor. - test_data_path: `str`, optional (default=None) + test_data_path: `str`, optional (default=`None`) If given, we will use this as test data. This makes it available for vocab creation by default, but nothing else. 
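For readers skimming this parameter list, a minimal training-config sketch showing how several of these keys (`validation_data_path`, `test_data_path`, `data_loader`, `trainer`) typically appear is given below; the file paths and hyperparameter values are invented placeholders, not part of this patch.

```python
# Illustrative only: a pared-down AllenNLP experiment config expressed as a Python dict.
# Paths and hyperparameters are placeholders; a real config would also include a "model" block.
config = {
    "dataset_reader": {"type": "text_classification_json"},
    "train_data_path": "data/train.jsonl",
    "validation_data_path": "data/dev.jsonl",   # enables validation metrics / early stopping
    "test_data_path": "data/test.jsonl",        # available for vocab creation, nothing else
    "data_loader": {"batch_size": 32, "shuffle": True},
    "trainer": {"num_epochs": 5, "optimizer": {"type": "adam"}},
}
```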
- evaluate_on_test: `bool`, optional (default=False) + evaluate_on_test: `bool`, optional (default=`False`) If given, we will evaluate the final model on this data at the end of training. Note that we do not recommend using this for actual test data in every-day experimentation; you should only very rarely evaluate your model on actual test data. diff --git a/allennlp/common/file_utils.py b/allennlp/common/file_utils.py index c490527ab81..5612a0f88be 100644 --- a/allennlp/common/file_utils.py +++ b/allennlp/common/file_utils.py @@ -190,8 +190,8 @@ def session_with_backoff() -> requests.Session: We ran into an issue where http requests to s3 were timing out, possibly because we were making too many requests too quickly. This helper function returns a requests session that has retry-with-backoff - built in. - see stackoverflow.com/questions/23267409/how-to-implement-retry-mechanism-into-python-requests-library + built in. See + . """ session = requests.Session() retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504]) diff --git a/allennlp/common/params.py b/allennlp/common/params.py index 3407bd7d819..d5a8e609ebe 100644 --- a/allennlp/common/params.py +++ b/allennlp/common/params.py @@ -321,7 +321,7 @@ def pop_choice( the param dictionary is not in `choices`, we raise a `ConfigurationError`, because the user specified an invalid value in their parameter file. - default_to_first_choice: `bool`, optional (default=False) + default_to_first_choice: `bool`, optional (default = `False`) If this is `True`, we allow the `key` to not be present in the parameter dictionary. If the key is not present, we will use the return as the value the first @@ -330,7 +330,7 @@ def pop_choice( specify your model class when running an experiment, but you can feel free to use default settings for encoders if you want). - allow_class_names: `bool`, optional (default = True) + allow_class_names: `bool`, optional (default = `True`) If this is `True`, then we allow unknown choices that look like fully-qualified class names. This is to allow e.g. specifying a model type as my_library.my_model.MyModel @@ -358,11 +358,11 @@ def as_dict(self, quiet: bool = False, infer_type_and_cast: bool = False): # Parameters - quiet: `bool`, optional (default = False) + quiet: `bool`, optional (default = `False`) Whether to log the parameters before returning them as a dict. - infer_type_and_cast: `bool`, optional (default = False) + infer_type_and_cast: `bool`, optional (default = `False`) If True, we infer types and cast (e.g. things that look like floats to floats). """ diff --git a/allennlp/common/registrable.py b/allennlp/common/registrable.py index f01d53bafef..75a1ae47780 100644 --- a/allennlp/common/registrable.py +++ b/allennlp/common/registrable.py @@ -50,11 +50,11 @@ def register(cls: Type[T], name: str, constructor: str = None, exist_ok: bool = name : `str` The name to register the class under. - constructor : `str`, optional (default=None) + constructor : `str`, optional (default=`None`) The name of the method to use on the class to construct the object. If this is given, we will use this method (which must be a `@classmethod`) instead of the default constructor. - exist_ok : `bool`, optional (default=False) + exist_ok : `bool`, optional (default=`False`) If True, overwrites any existing models registered under `name`. Else, throws an error if a model is already registered under `name`. 
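As a quick illustration of the `register()` parameters documented above, here is a minimal, self-contained sketch of registering a class and looking it up by name; the class names are invented for this example.

```python
from allennlp.common import Registrable


class Muppet(Registrable):
    """A hypothetical base class, used only to illustrate registration."""


@Muppet.register("frog", exist_ok=False)  # exist_ok=True would allow overwriting "frog"
class Kermit(Muppet):
    def __init__(self, color: str = "green") -> None:
        self.color = color


# The registered name can later be resolved back to the class:
kermit_class = Muppet.by_name("frog")
kermit = kermit_class()
```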
diff --git a/allennlp/common/testing/model_test_case.py b/allennlp/common/testing/model_test_case.py index 0f676560345..fbbb1c4a6ca 100644 --- a/allennlp/common/testing/model_test_case.py +++ b/allennlp/common/testing/model_test_case.py @@ -63,31 +63,31 @@ def ensure_model_can_train_save_and_load( param_file : `str` Path to a training configuration file that we will use to train the model for this test. - tolerance : `float`, optional (default=1e-4) + tolerance : `float`, optional (default=`1e-4`) When comparing model predictions between the originally-trained model and the model after saving and loading, we will use this tolerance value (passed as `rtol` to `numpy.testing.assert_allclose`). - cuda_device : `int`, optional (default=-1) + cuda_device : `int`, optional (default=`-1`) The device to run the test on. - gradients_to_ignore : `Set[str]`, optional (default=None) + gradients_to_ignore : `Set[str]`, optional (default=`None`) This test runs a gradient check to make sure that we're actually computing gradients for all of the parameters in the model. If you really want to ignore certain parameters when doing that check, you can pass their names here. This is not recommended unless you're `really` sure you don't need to have non-zero gradients for those parameters (e.g., some of the beam search / state machine models have infrequently-used parameters that are hard to force the model to use in a small test). - overrides : `str`, optional (default = "") + overrides : `str`, optional (default = `""`) A JSON string that we will use to override values in the input parameter file. - metric_to_check: `str`, optional (default = None) + metric_to_check: `str`, optional (default = `None`) We may want to automatically perform a check that model reaches given metric when training (on validation set, if it is specified). It may be useful in CI, for example. You can pass any metric that is in your model returned metrics. - metric_terminal_value: `str`, optional (default = None) + metric_terminal_value: `str`, optional (default = `None`) When you set `metric_to_check`, you need to set the value this metric must converge to - metric_tolerance: `float`, optional (default=1e-4) + metric_tolerance: `float`, optional (default=`1e-4`) Tolerance to check you model metric against metric terminal value. One can expect some variance in model metrics when the training process is highly stochastic. - disable_dropout : `bool`, optional (default = True) + disable_dropout : `bool`, optional (default = `True`) If True we will set all dropout to 0 before checking gradients. (Otherwise, with small datasets, you may get zero gradients because of unlucky dropout.) """ @@ -268,7 +268,7 @@ def ensure_batch_predictions_are_consistent(self, keys_to_ignore: Iterable[str] # Parameters - keys_to_ignore : `Iterable[str]`, optional (default=()) + keys_to_ignore : `Iterable[str]`, optional (default=`()`) Names of metrics that should not be taken into account, e.g. "batch_weight". """ self.model.eval() diff --git a/allennlp/common/util.py b/allennlp/common/util.py index c44f56f387f..349c98f6d43 100644 --- a/allennlp/common/util.py +++ b/allennlp/common/util.py @@ -151,25 +151,25 @@ def pad_sequence_to_length( # Parameters - sequence : List + sequence : `List` A list of objects to be padded. - desired_length : int + desired_length : `int` Maximum length of each sequence. Longer sequences are truncated to this length, and shorter ones are padded to it. 
- default_value: Callable, default=lambda: 0 + default_value: `Callable`, optional (default=`lambda: 0`) Callable that outputs a default value (of any type) to use as padding values. This is a lambda to avoid using the same object when the default value is more complex, like a list. - padding_on_right : bool, default=True + padding_on_right : `bool`, optional (default=`True`) When we add padding tokens (or truncate the sequence), should we do it on the right or the left? # Returns - padded_sequence : List + padded_sequence : `List` """ # Truncates the sequence to the desired length. if padding_on_right: @@ -227,7 +227,7 @@ def prepare_environment(params: Params): # Parameters - params: Params object or dict, required. + params: `Params` A `Params` object or dict holding the json parameters. """ seed = params.pop_int("random_seed", 13370) @@ -473,13 +473,13 @@ def is_master( # Parameters - global_rank : int ( default = None ) + global_rank : `int` ( default = `None` ) Global rank of the process if in a distributed process group. If not given, rank is obtained using `torch.distributed.get_rank()` - world_size : int ( default = None ) + world_size : `int` ( default = `None` ) Number of processes in the distributed group. If not given, this is obtained using `torch.distributed.get_world_size()` - num_procs_per_node: int ( default = None ), + num_procs_per_node: `int` ( default = `None` ) Number of GPU processes running per node """ distributed = dist.is_available() and dist.is_initialized() diff --git a/allennlp/data/dataset_readers/dataset_reader.py b/allennlp/data/dataset_readers/dataset_reader.py index 22f2265ea98..fead0ffc7a5 100644 --- a/allennlp/data/dataset_readers/dataset_reader.py +++ b/allennlp/data/dataset_readers/dataset_reader.py @@ -112,17 +112,17 @@ class DatasetReader(Registrable): # Parameters - lazy : `bool`, optional (default=False) + lazy : `bool`, optional (default=`False`) If this is true, `instances()` will return an object whose `__iter__` method reloads the dataset each time it's called. Otherwise, `instances()` returns a list. - cache_directory : `str`, optional (default=None) + cache_directory : `str`, optional (default=`None`) If given, we will use this directory to store a cache of already-processed `Instances` in every file passed to :func:`read`, serialized (by default, though you can override this) as one string-formatted `Instance` per line. If the cache file for a given `file_path` exists, we read the `Instances` from the cache instead of re-processing the data (using :func:`_instances_from_cache_file`). If the cache file does _not_ exist, we will _create_ it on our first pass through the data (using :func:`_instances_to_cache_file`). - max_instances : `int`, optional (default=None) + max_instances : `int`, optional (default=`None`) If given, will stop reading after this many instances. This is a useful setting for debugging. IMPORTANT CAVEAT: It is the _caller's_ responsibility to make sure that this directory is diff --git a/allennlp/data/dataset_readers/dataset_utils/span_utils.py b/allennlp/data/dataset_readers/dataset_utils/span_utils.py index d7422fe49d9..2f6212a5454 100644 --- a/allennlp/data/dataset_readers/dataset_utils/span_utils.py +++ b/allennlp/data/dataset_readers/dataset_utils/span_utils.py @@ -43,15 +43,15 @@ def enumerate_spans( sentence : `List[T]`, required. The sentence to generate spans for. The type is generic, as this function can be used with strings, or Spacy `Tokens` or other sequences. 
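Stepping back to `pad_sequence_to_length` from `allennlp/common/util.py` (documented a few hunks above), a short usage sketch may help; the input lists below are arbitrary examples.

```python
from allennlp.common.util import pad_sequence_to_length

pad_sequence_to_length([1, 2, 3], desired_length=5)                       # [1, 2, 3, 0, 0]
pad_sequence_to_length([1, 2, 3, 4, 5, 6], desired_length=4)              # [1, 2, 3, 4]
pad_sequence_to_length([1, 2], desired_length=4, padding_on_right=False)  # [0, 0, 1, 2]
pad_sequence_to_length([[1]], desired_length=3, default_value=list)       # [[1], [], []]
```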
- offset : `int`, optional (default = 0) + offset : `int`, optional (default = `0`) A numeric offset to add to all span start and end indices. This is helpful if the sentence is part of a larger structure, such as a document, which the indices need to respect. - max_span_width : `int`, optional (default = None) + max_span_width : `int`, optional (default = `None`) The maximum length of spans which should be included. Defaults to len(sentence). - min_span_width : `int`, optional (default = 1) + min_span_width : `int`, optional (default = `1`) The minimum length of spans which should be included. Defaults to 1. - filter_function : `Callable[[List[T]], bool]`, optional (default = None) + filter_function : `Callable[[List[T]], bool]`, optional (default = `None`) A function mapping sequences of the passed type T to a boolean value. If `True`, the span is included in the returned spans from the sentence, otherwise it is excluded.. @@ -85,15 +85,15 @@ def bio_tags_to_spans( # Parameters - tag_sequence : List[str], required. + tag_sequence : `List[str]`, required. The integer class labels for a sequence. - classes_to_ignore : List[str], optional (default = None). + classes_to_ignore : `List[str]`, optional (default = `None`). A list of string class labels `excluding` the bio tag which should be ignored when extracting spans. # Returns - spans : List[TypedStringSpan] + spans : `List[TypedStringSpan]` The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)). Note that the label `does not` contain any BIO tag prefixes. """ @@ -157,15 +157,15 @@ def iob1_tags_to_spans( # Parameters - tag_sequence : List[str], required. + tag_sequence : `List[str]`, required. The integer class labels for a sequence. - classes_to_ignore : List[str], optional (default = None). + classes_to_ignore : `List[str]`, optional (default = `None`). A list of string class labels `excluding` the bio tag which should be ignored when extracting spans. # Returns - spans : List[TypedStringSpan] + spans : `List[TypedStringSpan]` The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)). Note that the label `does not` contain any BIO tag prefixes. """ @@ -237,7 +237,7 @@ def bioul_tags_to_spans( tag_sequence : `List[str]`, required. The tag sequence encoded in BIOUL, e.g. ["B-PER", "L-PER", "O"]. - classes_to_ignore : `List[str]`, optional (default = None). + classes_to_ignore : `List[str]`, optional (default = `None`). A list of string class labels `excluding` the bio tag which should be ignored when extracting spans. @@ -293,7 +293,7 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]: tag_sequence : `List[str]`, required. The tag sequence encoded in IOB1, e.g. ["I-PER", "I-PER", "O"]. - encoding : `str`, optional, (default = `IOB1`). + encoding : `str`, optional, (default = `"IOB1"`). The encoding type to convert from. Must be either "IOB1" or "BIO". # Returns @@ -398,15 +398,15 @@ def bmes_tags_to_spans( # Parameters - tag_sequence : List[str], required. + tag_sequence : `List[str]`, required. The integer class labels for a sequence. - classes_to_ignore : List[str], optional (default = None). + classes_to_ignore : `List[str]`, optional (default = `None`). A list of string class labels `excluding` the bio tag which should be ignored when extracting spans. # Returns - spans : List[TypedStringSpan] + spans : `List[TypedStringSpan]` The typed, extracted spans from the sequence, in the format (label, (span_start, span_end)). 
Note that the label `does not` contain any BIO tag prefixes. """ diff --git a/allennlp/data/dataset_readers/interleaving_dataset_reader.py b/allennlp/data/dataset_readers/interleaving_dataset_reader.py index 212ab3b275c..547100c3219 100644 --- a/allennlp/data/dataset_readers/interleaving_dataset_reader.py +++ b/allennlp/data/dataset_readers/interleaving_dataset_reader.py @@ -27,9 +27,9 @@ class InterleavingDatasetReader(DatasetReader): readers : `Dict[str, DatasetReader]` The dataset readers to wrap. The keys of this dictionary will be used as the values in the MetadataField indicating provenance. - dataset_field_name : str, optional (default = "dataset") + dataset_field_name : `str`, optional (default = `"dataset"`) The name of the MetadataField indicating which dataset an instance came from. - scheme : str, optional (default = "round_robin") + scheme : `str`, optional (default = `"round_robin"`) Indicates how to interleave instances. Currently the two options are "round_robin", which repeatedly cycles through the datasets grabbing one instance from each; and "all_at_once", which yields all the instances from the first dataset, diff --git a/allennlp/data/dataset_readers/text_classification_json.py b/allennlp/data/dataset_readers/text_classification_json.py index 0bac7305324..ce16e2d0ddf 100644 --- a/allennlp/data/dataset_readers/text_classification_json.py +++ b/allennlp/data/dataset_readers/text_classification_json.py @@ -104,16 +104,16 @@ def text_to_instance( text : `str`, required. The text to classify - label : `str`, optional, (default = None). + label : `str`, optional, (default = `None`). The label for this text. # Returns An `Instance` containing the following fields: - tokens : `TextField` - The tokens in the sentence or phrase. - label : `LabelField` - The label label of the sentence or phrase. + - tokens (`TextField`) : + The tokens in the sentence or phrase. + - label (`LabelField`) : + The label of the sentence or phrase. """ fields: Dict[str, Field] = {} diff --git a/allennlp/data/fields/adjacency_field.py b/allennlp/data/fields/adjacency_field.py index 570c315f933..524b4ad4f82 100644 --- a/allennlp/data/fields/adjacency_field.py +++ b/allennlp/data/fields/adjacency_field.py @@ -30,13 +30,13 @@ class AdjacencyField(Field[torch.Tensor]): sequence_field : `SequenceField` A field containing the sequence that this `AdjacencyField` is labeling. Most often, this is a `TextField`, for tagging edge relations between tokens in a sentence. - labels : `List[str]`, optional, default = None + labels : `List[str]`, optional, (default = `None`) Optional labels for the edges of the adjacency matrix. - label_namespace : `str`, optional (default='labels') + label_namespace : `str`, optional (default=`'labels'`) The namespace to use for converting tag strings into integers. We convert tag strings to integers for you, and this parameter tells the `Vocabulary` object which mapping from strings to integers to use (so that "O" as a tag doesn't get the same id as "O" as a word). - padding_value : `int`, (optional, default = -1) + padding_value : `int`, optional (default = `-1`) The value to use as padding.
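A brief sketch of constructing the `AdjacencyField` described above; the tokens and the edge label are invented for illustration.

```python
from allennlp.data.fields import AdjacencyField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

tokens = TextField([Token(t) for t in ["dogs", "chase", "cats"]],
                   {"tokens": SingleIdTokenIndexer()})
# A single directed edge from token 0 to token 2, with a made-up label.
edges = AdjacencyField(indices=[(0, 2)], sequence_field=tokens,
                       labels=["relation"], padding_value=-1)
```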
""" diff --git a/allennlp/data/fields/label_field.py b/allennlp/data/fields/label_field.py index fef5f20539d..dcad79a7b76 100644 --- a/allennlp/data/fields/label_field.py +++ b/allennlp/data/fields/label_field.py @@ -23,7 +23,7 @@ class LabelField(Field[torch.Tensor]): # Parameters label : `Union[str, int]` - label_namespace : `str`, optional (default="labels") + label_namespace : `str`, optional (default=`"labels"`) The namespace to use for converting label strings into integers. We map label strings to integers for you (e.g., "entailment" and "contradiction" get converted to 0, 1, ...), and this namespace tells the `Vocabulary` object which mapping from strings to integers @@ -31,7 +31,7 @@ class LabelField(Field[torch.Tensor]): word). If you have multiple different label fields in your data, you should make sure you use different namespaces for each one, always using the suffix "labels" (e.g., "passage_labels" and "question_labels"). - skip_indexing : `bool`, optional (default=False) + skip_indexing : `bool`, optional (default=`False`) If your labels are 0-indexed integers, you can pass in this flag, and we'll skip the indexing step. If this is `False` and your labels are not strings, this throws a `ConfigurationError`. """ diff --git a/allennlp/data/fields/multilabel_field.py b/allennlp/data/fields/multilabel_field.py index 5402fbfe189..69fcb011109 100644 --- a/allennlp/data/fields/multilabel_field.py +++ b/allennlp/data/fields/multilabel_field.py @@ -26,7 +26,7 @@ class MultiLabelField(Field[torch.Tensor]): # Parameters labels : `Sequence[Union[str, int]]` - label_namespace : `str`, optional (default="labels") + label_namespace : `str`, optional (default=`"labels"`) The namespace to use for converting label strings into integers. We map label strings to integers for you (e.g., "entailment" and "contradiction" get converted to 0, 1, ...), and this namespace tells the `Vocabulary` object which mapping from strings to integers @@ -34,10 +34,10 @@ class MultiLabelField(Field[torch.Tensor]): word). If you have multiple different label fields in your data, you should make sure you use different namespaces for each one, always using the suffix "labels" (e.g., "passage_labels" and "question_labels"). - skip_indexing : `bool`, optional (default=False) + skip_indexing : `bool`, optional (default=`False`) If your labels are 0-indexed integers, you can pass in this flag, and we'll skip the indexing step. If this is `False` and your labels are not strings, this throws a `ConfigurationError`. - num_labels : `int`, optional (default=None) + num_labels : `int`, optional (default=`None`) If `skip_indexing=True`, the total number of possible labels should be provided, which is required to decide the size of the output tensor. `num_labels` should equal largest label id + 1. If `skip_indexing=False`, `num_labels` is not required. diff --git a/allennlp/data/fields/sequence_label_field.py b/allennlp/data/fields/sequence_label_field.py index b97c55143cb..aed08854bf5 100644 --- a/allennlp/data/fields/sequence_label_field.py +++ b/allennlp/data/fields/sequence_label_field.py @@ -33,7 +33,7 @@ class SequenceLabelField(Field[torch.Tensor]): sequence_field : `SequenceField` A field containing the sequence that this `SequenceLabelField` is labeling. Most often, this is a `TextField`, for tagging individual tokens in a sentence. - label_namespace : `str`, optional (default='labels') + label_namespace : `str`, optional (default=`'labels'`) The namespace to use for converting tag strings into integers. 
We convert tag strings to integers for you, and this parameter tells the `Vocabulary` object which mapping from strings to integers to use (so that "O" as a tag doesn't get the same id as "O" as a word). diff --git a/allennlp/data/samplers/bucket_batch_sampler.py b/allennlp/data/samplers/bucket_batch_sampler.py index c0e3ca9189d..4f53aaa6deb 100644 --- a/allennlp/data/samplers/bucket_batch_sampler.py +++ b/allennlp/data/samplers/bucket_batch_sampler.py @@ -30,10 +30,10 @@ class BucketBatchSampler(BatchSampler): # Parameters - data_source: `data.Dataset`, required, + data_source: `data.Dataset`, required The pytorch `Dataset` of allennlp Instances to bucket. - batch_size : `int`, required. + batch_size : `int`, required The size of each batch of instances yielded when calling the dataloader. sorting_keys : `List[str]`, optional @@ -52,12 +52,12 @@ class BucketBatchSampler(BatchSampler): call `Instance.get_padding_lengths()` to see a list of all keys used in your data. You should give one or more of those as the sorting keys here. - padding_noise : `float`, optional (default=.1) + padding_noise : `float`, optional (default=`.1`) When sorting by padding length, we add a bit of noise to the lengths, so that the sorting isn't deterministic. This parameter determines how much noise we add, as a percentage of the actual padding value for each instance. - drop_last : `bool`, (default = False) + drop_last : `bool`, (default = `False`) If `True`, the sampler will drop the last batch if its size would be less than batch_size`. @@ -129,7 +129,7 @@ def _guess_sorting_keys(self, instances: Iterable[Instance], num_instances: int instances : `Iterable[Instance]`, required. The dataset to guess sorting keys for. - num_instances : `int`, optional (default = 10) + num_instances : `int`, optional (default = `10`) The number of instances to use to guess sorting keys. Typically the default value is completely sufficient, but if your instances are not homogeneous, you might need more. diff --git a/allennlp/data/samplers/samplers.py b/allennlp/data/samplers/samplers.py index a545de24235..db6aaa8ec8c 100644 --- a/allennlp/data/samplers/samplers.py +++ b/allennlp/data/samplers/samplers.py @@ -58,9 +58,9 @@ class RandomSampler(data.RandomSampler, Sampler): Registered as a `Sampler` with name "random". # Parameters - data_source: `Dataset`, reqired + data_source: `Dataset`, required The dataset to sample from. - replacement : `bool`, optional(default = False) + replacement : `bool`, optional (default = `False`) Samples are drawn with replacement if `True`. num_samples: `int` (default = `len(dataset)`) The number of samples to draw. This argument diff --git a/allennlp/data/token_indexers/pretrained_transformer_indexer.py b/allennlp/data/token_indexers/pretrained_transformer_indexer.py index b2e194238a3..b5e58aae1f5 100644 --- a/allennlp/data/token_indexers/pretrained_transformer_indexer.py +++ b/allennlp/data/token_indexers/pretrained_transformer_indexer.py @@ -33,7 +33,7 @@ class PretrainedTransformerIndexer(TokenIndexer): We use a somewhat confusing default value of `tags` so that we do not add padding or UNK tokens to this namespace, which would break on loading because we wouldn't find our default OOV token. - max_length : `int`, optional (default = None) + max_length : `int`, optional (default = `None`) If not None, split the document into segments of this many tokens (including special tokens) before feeding into the embedder. 
The embedder embeds these segments independently and concatenate the results to get the original document representation. Should be set to diff --git a/allennlp/data/token_indexers/pretrained_transformer_mismatched_indexer.py b/allennlp/data/token_indexers/pretrained_transformer_mismatched_indexer.py index fe4e036c2fc..724dadbcd69 100644 --- a/allennlp/data/token_indexers/pretrained_transformer_mismatched_indexer.py +++ b/allennlp/data/token_indexers/pretrained_transformer_mismatched_indexer.py @@ -34,7 +34,7 @@ class PretrainedTransformerMismatchedIndexer(TokenIndexer): We use a somewhat confusing default value of `tags` so that we do not add padding or UNK tokens to this namespace, which would break on loading because we wouldn't find our default OOV token. - max_length : `int`, optional (default = None) + max_length : `int`, optional (default = `None`) If positive, split the document into segments of this many tokens (including special tokens) before feeding into the embedder. The embedder embeds these segments independently and concatenate the results to get the original document representation. Should be set to diff --git a/allennlp/data/token_indexers/single_id_token_indexer.py b/allennlp/data/token_indexers/single_id_token_indexer.py index c1abf4097d8..ea49de9d76d 100644 --- a/allennlp/data/token_indexers/single_id_token_indexer.py +++ b/allennlp/data/token_indexers/single_id_token_indexer.py @@ -20,7 +20,7 @@ class SingleIdTokenIndexer(TokenIndexer): # Parameters - namespace : `Optional[str]`, optional (default=`tokens`) + namespace : `Optional[str]`, optional (default=`"tokens"`) We will use this namespace in the :class:`Vocabulary` to map strings to indices. If you explicitly pass in `None` here, we will skip indexing and vocabulary lookups. This means that the `feature_name` you use must correspond to an integer value (like `text_id`, for @@ -32,7 +32,7 @@ class SingleIdTokenIndexer(TokenIndexer): These are prepended to the tokens provided to `tokens_to_indices`. end_tokens : `List[str]`, optional (default=`None`) These are appended to the tokens provided to `tokens_to_indices`. - feature_name : `str`, optional (default=`text`) + feature_name : `str`, optional (default=`"text"`) We will use the :class:`Token` attribute with this name as input. This is potentially useful, e.g., for using NER tags instead of (or in addition to) surface forms as your inputs (passing `ent_type_` here would do that). If you use a non-default value here, you almost diff --git a/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py b/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py index ab5c065fb37..c3db4a13338 100644 --- a/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py +++ b/allennlp/data/tokenizers/pretrained_transformer_tokenizer.py @@ -35,16 +35,16 @@ class PretrainedTransformerTokenizer(Tokenizer): model_name : `str` The name of the pretrained wordpiece tokenizer to use. - add_special_tokens : `bool`, optional, (default=True) + add_special_tokens : `bool`, optional, (default=`True`) If set to `True`, the sequences will be encoded with the special tokens relative to their model. - max_length : `int`, optional (default=None) + max_length : `int`, optional (default=`None`) If set to a number, will limit the total sequence returned so that it has a maximum length. 
If there are overflowing tokens, those will be added to the returned dictionary - stride : `int`, optional (default=0) + stride : `int`, optional (default=`0`) If set to a number along with max_length, the overflowing tokens returned will contain some tokens from the main sequence returned. The value of this argument defines the number of additional tokens. - truncation_strategy : `str`, optional (default='longest_first') + truncation_strategy : `str`, optional (default=`'longest_first'`) String selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences) diff --git a/allennlp/data/tokenizers/spacy_tokenizer.py b/allennlp/data/tokenizers/spacy_tokenizer.py index 7d47887a14f..6435b6ce273 100644 --- a/allennlp/data/tokenizers/spacy_tokenizer.py +++ b/allennlp/data/tokenizers/spacy_tokenizer.py @@ -26,26 +26,26 @@ class SpacyTokenizer(Tokenizer): # Parameters - language : `str`, optional, (default="en_core_web_sm") + language : `str`, optional, (default=`"en_core_web_sm"`) Spacy model name. - pos_tags : `bool`, optional, (default=False) + pos_tags : `bool`, optional, (default=`False`) If `True`, performs POS tagging with spacy model on the tokens. Generally used in conjunction with :class:`~allennlp.data.token_indexers.pos_tag_indexer.PosTagIndexer`. - parse : `bool`, optional, (default=False) + parse : `bool`, optional, (default=`False`) If `True`, performs dependency parsing with spacy model on the tokens. Generally used in conjunction with :class:`~allennlp.data.token_indexers.pos_tag_indexer.DepLabelIndexer`. - ner : `bool`, optional, (default=False) + ner : `bool`, optional, (default=`False`) If `True`, performs dependency parsing with spacy model on the tokens. Generally used in conjunction with :class:`~allennlp.data.token_indexers.ner_tag_indexer.NerTagIndexer`. - keep_spacy_tokens : `bool`, optional, (default=False) + keep_spacy_tokens : `bool`, optional, (default=`False`) If `True`, will preserve spacy token objects, We copy spacy tokens into our own class by default instead because spacy Cython Tokens can't be pickled. - split_on_spaces : `bool`, optional, (default=False) + split_on_spaces : `bool`, optional, (default=`False`) If `True`, will split by spaces without performing tokenization. Used when your data is already tokenized, but you want to perform pos, ner or parsing on the tokens. - start_tokens : `Optional[List[str]]`, optional, (default=None) + start_tokens : `Optional[List[str]]`, optional, (default=`None`) If given, these tokens will be added to the beginning of every string we tokenize. - end_tokens : `Optional[List[str]]`, optional, (default=None) + end_tokens : `Optional[List[str]]`, optional, (default=`None`) If given, these tokens will be added to the end of every string we tokenize. """ diff --git a/allennlp/data/vocabulary.py b/allennlp/data/vocabulary.py index 1e3c62b43e7..2d3f78c092c 100644 --- a/allennlp/data/vocabulary.py +++ b/allennlp/data/vocabulary.py @@ -152,7 +152,7 @@ class Vocabulary(Registrable): words are in-vocabulary. If this is `None`, we just won't initialize the vocabulary with anything. - min_count : `Dict[str, int]`, optional (default=None) + min_count : `Dict[str, int]`, optional (default=`None`) When initializing the vocab from a counter, you can specify a minimum count, and every token with a count less than this will not be added to the dictionary. 
These minimum counts are `namespace-specific`, so you can specify different minimums for labels versus @@ -194,7 +194,7 @@ class Vocabulary(Registrable): most common words) to keep from pretrained embedding files, even for words not appearing in the data. - only_include_pretrained_words : `bool`, optional (default=False) + only_include_pretrained_words : `bool`, optional (default=`False`) This defines the strategy for using any pretrained embedding files which may have been specified in `pretrained_files`. If False, an inclusive strategy is used: and words which are in the `counter` and in the pretrained file are added to the `Vocabulary`, @@ -202,15 +202,15 @@ class Vocabulary(Registrable): exclusive strategy: words are only included in the Vocabulary if they are in the pretrained embedding file (their count must still be at least `min_count`). - tokens_to_add : `Dict[str, List[str]]`, optional (default=None) + tokens_to_add : `Dict[str, List[str]]`, optional (default=`None`) If given, this is a list of tokens to add to the vocabulary, keyed by the namespace to add the tokens to. This is a way to be sure that certain items appear in your vocabulary, regardless of any other vocabulary computation. - padding_token : `str`, optional (default=DEFAULT_PADDING_TOKEN) + padding_token : `str`, optional (default=`DEFAULT_PADDING_TOKEN`) If given, this the string used for padding. - oov_token : `str`, optional (default=DEFAULT_OOV_TOKEN) + oov_token : `str`, optional (default=`DEFAULT_OOV_TOKEN`) If given, this the string used for the out of vocabulary (OOVs) tokens. """ @@ -410,17 +410,17 @@ def set_from_file( line, with nothing else in the line. The index we assign to the token is the line number in the file (1-indexed if `is_padded`, 0-indexed otherwise). Note that this file should contain the OOV token string! - is_padded : `bool`, optional (default=True) + is_padded : `bool`, optional (default=`True`) Is this vocabulary padded? For token / word / character vocabularies, this should be `True`; while for tag or label vocabularies, this should typically be `False`. If `True`, we add a padding token with index 0, and we enforce that the `oov_token` is present in the file. - oov_token : `str`, optional (default=DEFAULT_OOV_TOKEN) + oov_token : `str`, optional (default=`DEFAULT_OOV_TOKEN`) What token does this vocabulary use to represent out-of-vocabulary characters? This must show up as a line in the vocabulary file. When we find it, we replace `oov_token` with `self._oov_token`, because we only use one OOV token across namespaces. - namespace : `str`, optional (default="tokens") + namespace : `str`, optional (default=`"tokens"`) What namespace should we overwrite with this vocab file? """ if is_padded: diff --git a/allennlp/interpret/attackers/hotflip.py b/allennlp/interpret/attackers/hotflip.py index f93f7ef7dd4..85320423215 100644 --- a/allennlp/interpret/attackers/hotflip.py +++ b/allennlp/interpret/attackers/hotflip.py @@ -45,12 +45,12 @@ class Hotflip(Attacker): predictor : `Predictor` The model (inside a Predictor) that we're attacking. We use this to get gradients and predictions. 
- vocab_namespace : `str`, optional (default='tokens') + vocab_namespace : `str`, optional (default=`'tokens'`) We use this to know three things: (1) which tokens we should ignore when producing flips (we don't consider non-alphanumeric tokens); (2) what the string value is of the token that we produced, so we can show something human-readable to the user; and (3) if we need to construct a fake embedding matrix, we use the tokens in the vocabulary as flip candidates. - max_tokens : `int`, optional (default=5000) + max_tokens : `int`, optional (default=`5000`) This is only used when we need to construct a fake embedding matrix. That matrix can take a lot of memory when the vocab size is large. This parameter puts a cap on the number of tokens to use, so the fake embedding matrix doesn't take as much memory. @@ -174,17 +174,17 @@ def attack_from_json( inputs : `JsonDict` The model inputs, the same as what is passed to a `Predictor`. - input_field_to_attack : `str`, optional (default='tokens') + input_field_to_attack : `str`, optional (default=`'tokens'`) The field that has the tokens that we're going to be flipping. This must be a `TextField`. - grad_input_field : `str`, optional (default='grad_input_1') + grad_input_field : `str`, optional (default=`'grad_input_1'`) If there is more than one field that gets embedded in your model (e.g., a question and a passage, or a premise and a hypothesis), this tells us the key to use to get the correct gradients. This selects from the output of :func:`Predictor.get_gradients`. - ignore_tokens : `List[str]`, optional (default=DEFAULT_IGNORE_TOKENS) + ignore_tokens : `List[str]`, optional (default=`DEFAULT_IGNORE_TOKENS`) These tokens will not be flipped. The default list includes some simple punctuation, OOV and padding tokens, and common control tokens for BERT, etc. - target : `JsonDict`, optional (default=None) + target : `JsonDict`, optional (default=`None`) If given, this will be a `targeted` hotflip attack, where instead of just trying to change a model's prediction from what it current is predicting, we try to change it to a `specific` target value. This is a `JsonDict` because it needs to specify the diff --git a/allennlp/models/archival.py b/allennlp/models/archival.py index b119375c2b1..bf8e8de4f8e 100644 --- a/allennlp/models/archival.py +++ b/allennlp/models/archival.py @@ -58,7 +58,7 @@ def extract_module(self, path: str, freeze: bool = True) -> Module: path : `str`, required Path of target module to be loaded from the model. Eg. "_textfield_embedder.token_embedder_tokens" - freeze : `bool`, optional (default=True) + freeze : `bool`, optional (default=`True`) Whether to freeze the module parameters or not. """ @@ -98,9 +98,9 @@ def archive_model( serialization_dir : `str` The directory where the weights and vocabulary are written out. - weights : `str`, optional (default=_DEFAULT_WEIGHTS) + weights : `str`, optional (default=`_DEFAULT_WEIGHTS`) Which weights file to include in the archive. The default is `best.th`. - archive_path : `str`, optional, (default = None) + archive_path : `str`, optional, (default = `None`) A full path to serialize the model to. The default is "model.tar.gz" inside the serialization_dir. If you pass a directory here, we'll serialize the model to "model.tar.gz" inside the directory. @@ -141,7 +141,7 @@ def load_archive( archive_file : `str` The archive file to load the model from. 
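A minimal sketch of how `load_archive` is typically called follows; the archive path is a placeholder, not something this patch introduces.

```python
from allennlp.models.archival import load_archive

# Load a trained model archive onto the CPU; the path is a placeholder.
archive = load_archive("model.tar.gz", cuda_device=-1)
model = archive.model      # the trained Model
config = archive.config    # the Params the model was trained with
```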
- cuda_device : `int`, optional (default = -1) + cuda_device : `int`, optional (default = `-1`) If `cuda_device` is >= 0, the model will be loaded onto the corresponding GPU. Otherwise it will be loaded onto the CPU. opt_level : `str`, optional, (default = `None`) @@ -150,9 +150,9 @@ def load_archive( See the Apex [documentation](https://nvidia.github.io/apex/amp.html#opt-levels-and-properties) for more details. If `None`, defaults to the `opt_level` found in the model params. If `cuda_device==-1`, Amp is not used and this argument is ignored. - overrides : `str`, optional (default = "") + overrides : `str`, optional (default = `""`) JSON overrides to apply to the unarchived `Params` object. - weights_file : `str`, optional (default = None) + weights_file : `str`, optional (default = `None`) The weights file to use. If unspecified, weights.th in the archive_file will be used. """ # redirect to the cache, if necessary diff --git a/allennlp/models/basic_classifier.py b/allennlp/models/basic_classifier.py index 6f06af740d6..a4259803a6e 100644 --- a/allennlp/models/basic_classifier.py +++ b/allennlp/models/basic_classifier.py @@ -34,14 +34,14 @@ class BasicClassifier(Model): Required Seq2Vec encoder layer. If `seq2seq_encoder` is provided, this encoder will pool its output. Otherwise, this encoder will operate directly on the output of the `text_field_embedder`. - feedforward : `FeedForward`, optional, (default = None). + feedforward : `FeedForward`, optional, (default = `None`) An optional feedforward layer to apply after the seq2vec_encoder. dropout : `float`, optional (default = `None`) Dropout percentage to use. num_labels : `int`, optional (default = `None`) Number of labels to project to in classification layer. By default, the classification layer will project to the size of the vocabulary namespace corresponding to labels. - label_namespace : `str`, optional (default = "labels") + label_namespace : `str`, optional (default = `"labels"`) Vocabulary namespace corresponding to labels. By default, we use the "labels" namespace. initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`) If provided, will be used to initialize the model parameters. @@ -100,9 +100,9 @@ def forward( # type: ignore """ # Parameters - tokens : TextFieldTensors + tokens : `TextFieldTensors` From a `TextField` - label : torch.IntTensor, optional (default = None) + label : `torch.IntTensor`, optional (default = `None`) From a `LabelField` # Returns diff --git a/allennlp/models/model.py b/allennlp/models/model.py index f9c6496fe53..2b76bcbfe8e 100644 --- a/allennlp/models/model.py +++ b/allennlp/models/model.py @@ -117,7 +117,7 @@ def forward(self, input1, input2, targets=None): # Parameters - *inputs : + *inputs : `Any` Tensors comprising everything needed to perform a training update, `including` labels, which should be optional (i.e have a default value of `None`). At inference time, simply pass the relevant inputs, not including the labels. diff --git a/allennlp/models/simple_tagger.py b/allennlp/models/simple_tagger.py index bca1abe698a..43def309b24 100644 --- a/allennlp/models/simple_tagger.py +++ b/allennlp/models/simple_tagger.py @@ -45,7 +45,7 @@ class SimpleTagger(Model): label_namespace : `str`, optional (default=`labels`) This is needed to compute the SpanBasedF1Measure metric, if desired. Unless you did something unusual, the default value should be what you want. 
- verbose_metrics : `bool`, optional (default = False) + verbose_metrics : `bool`, optional (default = `False`) If true, metrics will be returned per label class in addition to the overall statistics. initializer : `InitializerApplicator`, optional (default=`InitializerApplicator()`) @@ -115,7 +115,7 @@ def forward( """ # Parameters - tokens : TextFieldTensors, required + tokens : `TextFieldTensors`, required The output of `TextField.as_array()`, which should typically be passed directly to a `TextFieldEmbedder`. This output is a dictionary mapping keys to `TokenIndexer` tensors. At its most basic, using a `SingleIdTokenIndexer` this is : `{"tokens": @@ -124,12 +124,12 @@ def forward( sequence. The dictionary is designed to be passed directly to a `TextFieldEmbedder`, which knows how to combine different word representations into a single vector per token in your input. - tags : torch.LongTensor, optional (default = None) + tags : `torch.LongTensor`, optional (default = `None`) A torch tensor representing the sequence of integer gold class labels of shape `(batch_size, num_tokens)`. - metadata : `List[Dict[str, Any]]`, optional, (default = None) + metadata : `List[Dict[str, Any]]`, optional, (default = `None`) metadata containing the original words in the sentence to be tagged under a 'words' key. - ignore_loss_on_o_tags : `bool`, optional (default = False) + ignore_loss_on_o_tags : `bool`, optional (default = `False`) If True, we compute the loss only for actual spans in `tags`, and not on `O` tokens. This is useful for computing gradients of the loss on a _single span_, for interpretation / attacking. diff --git a/allennlp/modules/attention/additive_attention.py b/allennlp/modules/attention/additive_attention.py index 1558cbc7b25..ca1497733ad 100644 --- a/allennlp/modules/attention/additive_attention.py +++ b/allennlp/modules/attention/additive_attention.py @@ -27,7 +27,7 @@ class AdditiveAttention(Attention): The dimension of the matrix, `y`, described above. This is `y.size()[-1]` - the length of the vector that will go into the similarity computation. We need this so we can build the weight matrix correctly. - normalize : `bool`, optional (default : `True`) + normalize : `bool`, optional (default = `True`) If true, we normalize the computed similarities with a softmax, to return a probability distribution for your attention. If false, this is just computing a similarity score. """ diff --git a/allennlp/modules/attention/attention.py b/allennlp/modules/attention/attention.py index abb88b60af2..1c525bf3bc4 100644 --- a/allennlp/modules/attention/attention.py +++ b/allennlp/modules/attention/attention.py @@ -29,7 +29,7 @@ class Attention(torch.nn.Module, Registrable): # Parameters - normalize : `bool`, optional (default : `True`) + normalize : `bool`, optional (default = `True`) If true, we normalize the computed similarities with a softmax, to return a probability distribution for your attention. If false, this is just computing a similarity score. """ diff --git a/allennlp/modules/attention/bilinear_attention.py b/allennlp/modules/attention/bilinear_attention.py index 84da81b6cff..265f6f605da 100644 --- a/allennlp/modules/attention/bilinear_attention.py +++ b/allennlp/modules/attention/bilinear_attention.py @@ -25,10 +25,10 @@ class BilinearAttention(Attention): The dimension of the matrix, `y`, described above. This is `y.size()[-1]` - the length of the vector that will go into the similarity computation. We need this so we can build the weight matrix correctly. 
- activation : `Activation`, optional (default=linear (i.e. no activation)) - An activation function applied after the `x^T W y + b` calculation. Default is no - activation. - normalize : `bool`, optional (default : `True`) + activation : `Activation`, optional (default=`linear`) + An activation function applied after the `x^T W y + b` calculation. Default is + linear, i.e. no activation. + normalize : `bool`, optional (default=`True`) If true, we normalize the computed similarities with a softmax, to return a probability distribution for your attention. If false, this is just computing a similarity score. """ diff --git a/allennlp/modules/attention/linear_attention.py b/allennlp/modules/attention/linear_attention.py index 9a55e4a9d5f..6ae86490a9a 100644 --- a/allennlp/modules/attention/linear_attention.py +++ b/allennlp/modules/attention/linear_attention.py @@ -38,12 +38,12 @@ class LinearAttention(Attention): The dimension of the second tensor, `y`, described above. This is `y.size()[-1]` - the length of the vector that will go into the similarity computation. We need this so we can build weight vectors correctly. - combination : `str`, optional (default="x,y") + combination : `str`, optional (default=`"x,y"`) Described above. - activation : `Activation`, optional (default=linear (i.e. no activation)) - An activation function applied after the `w^T * [x;y] + b` calculation. Default is no - activation. - normalize : `bool`, optional (default=True) + activation : `Activation`, optional (default=`linear`) + An activation function applied after the `w^T * [x;y] + b` calculation. Default is + linear, i.e. no activation. + normalize : `bool`, optional (default=`True`) """ def __init__( diff --git a/allennlp/modules/augmented_lstm.py b/allennlp/modules/augmented_lstm.py index 34f72c26474..9c3028b0e36 100644 --- a/allennlp/modules/augmented_lstm.py +++ b/allennlp/modules/augmented_lstm.py @@ -17,21 +17,24 @@ class AugmentedLSTMCell(torch.nn.Module): """ `AugmentedLSTMCell` implements a AugmentedLSTM cell. - Args: - embed_dim (int): The number of expected features in the input. - lstm_dim (int): Number of features in the hidden state of the LSTM. - Defaults to 32. - use_highway (bool): If `True` we append a highway network to the - outputs of the LSTM. - Defaults to True. - use_bias (bool): If `True` we use a bias in our LSTM calculations, otherwise - we don't. - - Attributes: - input_linearity (nn.Module): Fused weight matrix which - computes a linear function over the input. - state_linearity (nn.Module): Fused weight matrix which - computes a linear function over the states. + + # Parameters + + embed_dim : `int` + The number of expected features in the input. + lstm_dim : `int` + Number of features in the hidden state of the LSTM. + use_highway : `bool`, optional (default = `True`) + If `True` we append a highway network to the outputs of the LSTM. + use_bias : `bool`, optional (default = `True`) + If `True` we use a bias in our LSTM calculations, otherwise we don't. + + # Attributes + + input_linearity : `nn.Module` + Fused weight matrix which computes a linear function over the input. + state_linearity : `nn.Module` + Fused weight matrix which computes a linear function over the states. """ def __init__( @@ -85,19 +88,23 @@ def forward( variational_dropout_mask: Optional[torch.BoolTensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """ - Warning: DO NOT USE THIS LAYER DIRECTLY, INSTEAD USE the AugmentedLSTM class + !!! 
Warning + DO NOT USE THIS LAYER DIRECTLY, instead use the AugmentedLSTM class + + # Parameters - Args: - x (torch.Tensor): Input tensor of shape - (bsize x input_dim). - states (Tuple[torch.Tensor, torch.Tensor]): Tuple of tensors containing - the hidden state and the cell state of each element in - the batch. Each of these tensors have a dimension of - (bsize x nhid). Defaults to `None`. + x : `torch.Tensor` + Input tensor of shape (bsize x input_dim). + states : `Tuple[torch.Tensor, torch.Tensor]` + Tuple of tensors containing + the hidden state and the cell state of each element in + the batch. Each of these tensors has a dimension of + (bsize x nhid). Defaults to `None`. - Returns: - Tuple[torch.Tensor, torch.Tensor]: - Returned states. Shape of each state is (bsize x nhid). + # Returns + + `Tuple[torch.Tensor, torch.Tensor]` + Returned states. Shape of each state is (bsize x nhid). """ hidden_state, memory_state = states @@ -145,21 +152,28 @@ class AugmentedLstm(torch.nn.Module): appends an optional highway network to the output layer. Furthermore the dropout controls the level of variational dropout done. - Args: - input_size (int): The number of expected features in the input. - hidden_size (int): Number of features in the hidden state of the LSTM. - Defaults to 32. - go_forward (bool): Whether to compute features left to right (forward) - or right to left (backward). - recurrent_dropout_probability (float): Variational dropout probability - to use. Defaults to 0.0. - use_highway (bool): If `True` we append a highway network to the - outputs of the LSTM. - use_input_projection_bias (bool): If `True` we use a bias in - our LSTM calculations, otherwise we don't. - - Attributes: - cell (AugmentedLSTMCell): AugmentedLSTMCell that is applied at every timestep. + # Parameters + + input_size : `int` + The number of expected features in the input. + hidden_size : `int` + Number of features in the hidden state of the LSTM. + Defaults to 32. + go_forward : `bool` + Whether to compute features left to right (forward) + or right to left (backward). + recurrent_dropout_probability : `float` + Variational dropout probability to use. Defaults to 0.0. + use_highway : `bool` + If `True` we append a highway network to the outputs of the LSTM. + use_input_projection_bias : `bool` + If `True` we use a bias in our LSTM calculations, otherwise we don't. + + # Attributes + + cell : `AugmentedLSTMCell` + `AugmentedLSTMCell` that is applied at every timestep. + """ def __init__( @@ -193,17 +207,21 @@ def forward( Given an input batch of sequential data such as word embeddings, produces a single layer unidirectional AugmentedLSTM representation of the sequential input and new state tensors. - Args: - inputs (PackedSequence): `bsize` sequences of shape `(len, input_dim)` each, in PackedSequence format - states (Tuple[torch.Tensor, torch.Tensor]): Tuple of tensors containing the initial hidden state and - the cell state of each element in the batch. Each of these tensors have a dimension of - (1 x bsize x nhid). Defaults to `None`. + # Parameters + + inputs : `PackedSequence` + `bsize` sequences of shape `(len, input_dim)` each, in PackedSequence format + states : `Tuple[torch.Tensor, torch.Tensor]` + Tuple of tensors containing the initial hidden state and + the cell state of each element in the batch. Each of these tensors has a dimension of + (1 x bsize x nhid). Defaults to `None`. 
- Returns: - Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]: - AugmentedLSTM representation of input and the state of the LSTM `t = seq_len`. - Shape of representation is (bsize x seq_len x representation_dim). - Shape of each state is (1 x bsize x nhid). + # Returns + + `Tuple[PackedSequence, Tuple[torch.Tensor, torch.Tensor]]` + AugmentedLSTM representation of input and the state of the LSTM `t = seq_len`. + Shape of representation is (bsize x seq_len x representation_dim). + Shape of each state is (1 x bsize x nhid). """ if not isinstance(inputs, PackedSequence): @@ -281,31 +299,34 @@ class BiAugmentedLstm(torch.nn.Module): Furthermore the dropout controls the level of variational dropout done. # Parameters - input_size : `int`, required. - The dimension of the inputs to the LSTM. - hidden_size : `int`, required. - The dimension of the outputs of the LSTM. - num_layers (int): Number of recurrent layers. Eg. setting `num_layers=2` - would mean stacking two LSTMs together to form a stacked LSTM, - with the second LSTM taking in the outputs of the first LSTM and - computing the final result. Defaults to 1. - bias (bool): If `True` we use a bias in our LSTM calculations, otherwise - we don't. - recurrent_dropout_probability (float): Variational dropout probability to use. - Defaults to 0.0. - bidirectional (bool): If `True`, becomes a bidirectional LSTM. Defaults - to `True`. - to the outputs of the LSTM. - padding_value (float): Value for the padded elements. Defaults to 0.0. - use_highway : `bool`, optional (default = True) - Whether or not to use highway connections between layers. This effectively involves - reparameterising the normal output of an LSTM as:: - - gate = sigmoid(W_x1 * x_t + W_h * h_t) - output = gate * h_t + (1 - gate) * (W_x2 * x_t) + + input_size : `int`, required + The dimension of the inputs to the LSTM. + hidden_size : `int`, required. + The dimension of the outputs of the LSTM. + num_layers : `int` + Number of recurrent layers. Eg. setting `num_layers=2` + would mean stacking two LSTMs together to form a stacked LSTM, + with the second LSTM taking in the outputs of the first LSTM and + computing the final result. Defaults to 1. + bias : `bool` + If `True` we use a bias in our LSTM calculations, otherwise we don't. + recurrent_dropout_probability : `float`, optional (default = `0.0`) + Variational dropout probability to use. + bidirectional : `bool` + If `True`, becomes a bidirectional LSTM. Defaults to `True`. + padding_value : `float`, optional (default = `0.0`) + Value for the padded elements. Defaults to 0.0. + use_highway : `bool`, optional (default = `True`) + Whether or not to use highway connections between layers. This effectively involves + reparameterising the normal output of an LSTM as:: + + gate = sigmoid(W_x1 * x_t + W_h * h_t) + output = gate * h_t + (1 - gate) * (W_x2 * x_t) + # Returns - output_accumulator : PackedSequence + output_accumulator : `PackedSequence` The outputs of the LSTM for each timestep. A tensor of shape (batch_size, max_timesteps, hidden_size) where for a given batch element, all outputs past the sequence length for that batch are zero tensors. """ @@ -371,21 +392,24 @@ def forward( a AugmentedLSTM representation of the sequential input and new state tensors. - Args: - inputs : `PackedSequence`, required. - A tensor of shape (batch_size, num_timesteps, input_size) - to apply the LSTM over. 
- states (Tuple[torch.Tensor, torch.Tensor]): Tuple of tensors containing - the initial hidden state and the cell state of each element in - the batch. Each of these tensors have a dimension of - (bsize x num_layers x num_directions * nhid). Defaults to `None`. - - Returns: - Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - AgumentedLSTM representation of input and - the state of the LSTM `t = seq_len`. - Shape of representation is (bsize x seq_len x representation_dim). - Shape of each state is (bsize x num_layers * num_directions x nhid). + # Parameters + + inputs : `PackedSequence`, required. + A tensor of shape (batch_size, num_timesteps, input_size) + to apply the LSTM over. + states : `Tuple[torch.Tensor, torch.Tensor]` + Tuple of tensors containing + the initial hidden state and the cell state of each element in + the batch. Each of these tensors has a dimension of + (bsize x num_layers x num_directions * nhid). Defaults to `None`. + + # Returns + + `Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]` + AugmentedLSTM representation of input and + the state of the LSTM `t = seq_len`. + Shape of representation is (bsize x seq_len x representation_dim). + Shape of each state is (bsize x num_layers * num_directions x nhid). """ diff --git a/allennlp/modules/bimpm_matching.py b/allennlp/modules/bimpm_matching.py index 5e04a94f29d..6d75ede67bf 100644 --- a/allennlp/modules/bimpm_matching.py +++ b/allennlp/modules/bimpm_matching.py @@ -115,23 +115,23 @@ class BiMpmMatching(nn.Module, FromParams): # Parameters - hidden_dim : `int`, optional (default = 100) + hidden_dim : `int`, optional (default = `100`) The hidden dimension of the representations - num_perspectives : `int`, optional (default = 20) + num_perspectives : `int`, optional (default = `20`) The number of perspectives for matching - share_weights_between_directions : `bool`, optional (default = True) + share_weights_between_directions : `bool`, optional (default = `True`) If True, share weight between matching from sentence1 to sentence2 and from sentence2 to sentence1, useful for non-symmetric tasks - is_forward : `bool`, optional (default = None) + is_forward : `bool`, optional (default = `None`) Whether the matching is for forward sequence or backward sequence, useful in finding last token in full matching. It can not be None if with_full_match is True. - with_full_match : `bool`, optional (default = True) + with_full_match : `bool`, optional (default = `True`) If True, include full match - with_maxpool_match : `bool`, optional (default = True) + with_maxpool_match : `bool`, optional (default = `True`) If True, include max pool match - with_attentive_match : `bool`, optional (default = True) + with_attentive_match : `bool`, optional (default = `True`) If True, include attentive match - with_max_attentive_match : `bool`, optional (default = True) + with_max_attentive_match : `bool`, optional (default = `True`) If True, include max attentive match """ diff --git a/allennlp/modules/conditional_random_field.py b/allennlp/modules/conditional_random_field.py index e5f9c0b3055..452bc43792e 100644 --- a/allennlp/modules/conditional_random_field.py +++ b/allennlp/modules/conditional_random_field.py @@ -167,12 +167,12 @@ class ConditionalRandomField(torch.nn.Module): num_tags : `int`, required The number of tags. - constraints : `List[Tuple[int, int]]`, optional (default: None) + constraints : `List[Tuple[int, int]]`, optional (default = `None`) An optional list of allowed transitions (from_tag_id, to_tag_id). 
These are applied to `viterbi_tags()` but do not affect `forward()`. These should be derived from `allowed_transitions` so that the start and end transitions are handled correctly for your tag type. - include_start_end_transitions : `bool`, optional (default: True) + include_start_end_transitions : `bool`, optional (default = `True`) Whether to include the start and end transition parameters. """ diff --git a/allennlp/modules/elmo.py b/allennlp/modules/elmo.py index bfd48927afc..f48c2554966 100644 --- a/allennlp/modules/elmo.py +++ b/allennlp/modules/elmo.py @@ -63,26 +63,26 @@ class Elmo(torch.nn.Module, FromParams): character-convnet output, 1st lstm output, 2nd lstm output). requires_grad : `bool`, optional If True, compute gradient of ELMo parameters for fine tuning. - do_layer_norm : `bool`, optional, (default = False). + do_layer_norm : `bool`, optional, (default = `False`). Should we apply layer normalization (passed to `ScalarMix`)? - dropout : `float`, optional, (default = 0.5). + dropout : `float`, optional, (default = `0.5`). The dropout to be applied to the ELMo representations. - vocab_to_cache : `List[str]`, optional, (default = None). + vocab_to_cache : `List[str]`, optional, (default = `None`). A list of words to pre-compute and cache character convolutions for. If you use this option, Elmo expects that you pass word indices of shape (batch_size, timesteps) to forward, instead of character indices. If you use this option and pass a word which wasn't pre-cached, this will break. - keep_sentence_boundaries : `bool`, optional, (default = False) + keep_sentence_boundaries : `bool`, optional, (default = `False`) If True, the representation of the sentence boundary tokens are not removed. - scalar_mix_parameters : `List[float]`, optional, (default = None) + scalar_mix_parameters : `List[float]`, optional, (default = `None`) If not `None`, use these scalar mix parameters to weight the representations produced by different layers. These mixing weights are not updated during training. The mixing weights here should be the unnormalized (i.e., pre-softmax) weights. So, if you wanted to use only the 1st layer of a 2-layer ELMo, you can set this to [-9e10, 1, -9e10 ]. - module : `torch.nn.Module`, optional, (default = None). + module : `torch.nn.Module`, optional, (default = `None`). If provided, then use this module instead of the pre-trained ELMo biLM. If using this option, then pass `None` for both `options_file` and `weight_file`. The module must provide a public attribute @@ -269,9 +269,10 @@ class _ElmoCharacterEncoder(torch.nn.Module): ELMo JSON options file weight_file : `str` ELMo hdf5 weight file - requires_grad : `bool`, optional, (default = False). + requires_grad : `bool`, optional, (default = `False`). If True, compute gradient of ELMo parameters for fine tuning. + The relevant section of the options file is something like: ``` @@ -486,9 +487,9 @@ class _ElmoBiLm(torch.nn.Module): ELMo JSON options file weight_file : `str` ELMo hdf5 weight file - requires_grad : `bool`, optional, (default = False). + requires_grad : `bool`, optional, (default = `False`). If True, compute gradient of ELMo parameters for fine tuning. - vocab_to_cache : `List[str]`, optional, (default = None). + vocab_to_cache : `List[str]`, optional, (default = `None`). A list of words to pre-compute and cache character convolutions for. 
If you use this option, _ElmoBiLm expects that you pass word indices of shape (batch_size, timesteps) to forward, instead diff --git a/allennlp/modules/elmo_lstm.py b/allennlp/modules/elmo_lstm.py index bfd6cad87c3..230e40e63b4 100644 --- a/allennlp/modules/elmo_lstm.py +++ b/allennlp/modules/elmo_lstm.py @@ -47,12 +47,12 @@ class ElmoLstm(_EncoderBase): The number of bidirectional LSTMs to use. requires_grad : `bool`, optional If True, compute gradient of ELMo parameters for fine tuning. - recurrent_dropout_probability : `float`, optional (default = 0.0) + recurrent_dropout_probability : `float`, optional (default = `0.0`) The dropout probability to be used in a dropout scheme as stated in [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks][0]. - state_projection_clip_value : `float`, optional, (default = None) + state_projection_clip_value : `float`, optional, (default = `None`) The magnitude with which to clip the hidden_state after projecting it. - memory_cell_clip_value : `float`, optional, (default = None) + memory_cell_clip_value : `float`, optional, (default = `None`) The magnitude with which to clip the memory cell. """ @@ -176,7 +176,7 @@ def _lstm_forward( inputs : `PackedSequence`, required. A batch first `PackedSequence` to run the stacked LSTM over. - initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, (default = None) + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, (default = `None`) A tuple (state, memory) representing the initial hidden state and memory of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size) respectively. diff --git a/allennlp/modules/encoder_base.py b/allennlp/modules/encoder_base.py index 865309f9548..0c49f4b8c8f 100644 --- a/allennlp/modules/encoder_base.py +++ b/allennlp/modules/encoder_base.py @@ -57,16 +57,18 @@ def sort_and_run_forward( # Parameters - module : `Callable[[PackedSequence, Optional[RnnState]], - Tuple[Union[PackedSequence, torch.Tensor], RnnState]]`, required. - A function to run on the inputs. In most cases, this is a `torch.nn.Module`. + module : `Callable[RnnInputs, RnnOutputs]` + A function to run on the inputs, where + `RnnInputs: [PackedSequence, Optional[RnnState]]` and + `RnnOutputs: Tuple[Union[PackedSequence, torch.Tensor], RnnState]`. + In most cases, this is a `torch.nn.Module`. inputs : `torch.Tensor`, required. A tensor of shape `(batch_size, sequence_length, embedding_size)` representing the inputs to the Encoder. mask : `torch.BoolTensor`, required. A tensor of shape `(batch_size, sequence_length)`, representing masked and non-masked elements of the sequence for each element in the batch. - hidden_state : `Optional[RnnState]`, (default = None). + hidden_state : `Optional[RnnState]`, (default = `None`). A single tensor of shape (num_layers, batch_size, hidden_size) representing the state of an RNN with or a tuple of tensors of shapes (num_layers, batch_size, hidden_size) and diff --git a/allennlp/modules/feedforward.py b/allennlp/modules/feedforward.py index 352e1d060ed..cd1746f385a 100644 --- a/allennlp/modules/feedforward.py +++ b/allennlp/modules/feedforward.py @@ -29,27 +29,28 @@ class FeedForward(torch.nn.Module, FromParams): The activation function to use after each `Linear` layer. If this is a single function, we use it after all `Linear` layers. If it is a `List[Activation]`, `len(activations)` must be `num_layers`. Activation must have torch.nn.Module type. 
- dropout : `Union[float, List[float]]`, optional (default = 0.0) + dropout : `Union[float, List[float]]`, optional (default = `0.0`) If given, we will apply this amount of dropout after each layer. Semantics of `float` versus `List[float]` is the same as with other parameters. - Example: - ``` - >>> FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2) - FeedForward( - (_activations): ModuleList( - (0): ReLU() - (1): ReLU() - ) - (_linear_layers): ModuleList( - (0): Linear(in_features=124, out_features=64, bias=True) - (1): Linear(in_features=64, out_features=32, bias=True) - ) - (_dropout): ModuleList( - (0): Dropout(p=0.2, inplace=False) - (1): Dropout(p=0.2, inplace=False) - ) - ) + # Examples + + ```python + FeedForward(124, 2, [64, 32], torch.nn.ReLU(), 0.2) + #> FeedForward( + #> (_activations): ModuleList( + #> (0): ReLU() + #> (1): ReLU() + #> ) + #> (_linear_layers): ModuleList( + #> (0): Linear(in_features=124, out_features=64, bias=True) + #> (1): Linear(in_features=64, out_features=32, bias=True) + #> ) + #> (_dropout): ModuleList( + #> (0): Dropout(p=0.2, inplace=False) + #> (1): Dropout(p=0.2, inplace=False) + #> ) + #> ) ``` """ diff --git a/allennlp/modules/gated_sum.py b/allennlp/modules/gated_sum.py index b3a7df6dde7..8a353231aca 100644 --- a/allennlp/modules/gated_sum.py +++ b/allennlp/modules/gated_sum.py @@ -15,7 +15,7 @@ class GatedSum(torch.nn.Module): input_dim : `int`, required The dimensionality of the input. We assume the input have shape `(..., input_dim)`. - activation : `Activation`, optional (default = torch.nn.Sigmoid()) + activation : `Activation`, optional (default = `torch.nn.Sigmoid()`) The activation function to use. """ diff --git a/allennlp/modules/lstm_cell_with_projection.py b/allennlp/modules/lstm_cell_with_projection.py index 8c715ac20da..1890f72152f 100644 --- a/allennlp/modules/lstm_cell_with_projection.py +++ b/allennlp/modules/lstm_cell_with_projection.py @@ -28,18 +28,18 @@ class LstmCellWithProjection(torch.nn.Module): The dimension of the outputs of the LSTM. cell_size : `int`, required. The dimension of the memory cell used for the LSTM. - go_forward : `bool`, optional (default = True) + go_forward : `bool`, optional (default = `True`) The direction in which the LSTM is applied to the sequence. Forwards by default, or backwards if False. - recurrent_dropout_probability : `float`, optional (default = 0.0) + recurrent_dropout_probability : `float`, optional (default = `0.0`) The dropout probability to be used in a dropout scheme as stated in [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks] [0]. Implementation wise, this simply applies a fixed dropout mask per sequence to the recurrent connection of the LSTM. - state_projection_clip_value : `float`, optional, (default = None) + state_projection_clip_value : `float`, optional, (default = `None`) The magnitude with which to clip the hidden_state after projecting it. - memory_cell_clip_value : `float`, optional, (default = None) + memory_cell_clip_value : `float`, optional, (default = `None`) The magnitude with which to clip the memory cell. # Returns @@ -109,7 +109,7 @@ def forward( to apply the LSTM over. batch_lengths : `List[int]`, required. A list of length batch_size containing the lengths of the sequences in batch. 
- initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, (default = None) + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, (default = `None`) A tuple (state, memory) representing the initial hidden state and memory of the LSTM. The `state` has shape (1, batch_size, hidden_size) and the `memory` has shape (1, batch_size, cell_size). diff --git a/allennlp/modules/matrix_attention/bilinear_matrix_attention.py b/allennlp/modules/matrix_attention/bilinear_matrix_attention.py index ebce4f39170..e33807329f9 100644 --- a/allennlp/modules/matrix_attention/bilinear_matrix_attention.py +++ b/allennlp/modules/matrix_attention/bilinear_matrix_attention.py @@ -25,14 +25,14 @@ class BilinearMatrixAttention(MatrixAttention): The dimension of the matrix `Y`, described above. This is `Y.size()[-1]` - the length of the vector that will go into the similarity computation. We need this so we can build the weight matrix correctly. - activation : `Activation`, optional (default=linear (i.e. no activation)) - An activation function applied after the `X W Y^T + b` calculation. Default is no - activation. - use_input_biases : `bool`, optional (default = False) + activation : `Activation`, optional (default=`linear`) + An activation function applied after the `X W Y^T + b` calculation. Default is + linear, i.e. no activation. + use_input_biases : `bool`, optional (default = `False`) If True, we add biases to the inputs such that the final computation is equivalent to the original bilinear matrix multiplication plus a projection of both inputs. - label_dim : `int`, optional (default = 1) + label_dim : `int`, optional (default = `1`) The number of output classes. Typically in an attention setting this will be one, but this parameter allows this class to function as an equivalent to `torch.nn.Bilinear` for matrices, rather than vectors. diff --git a/allennlp/modules/matrix_attention/linear_matrix_attention.py b/allennlp/modules/matrix_attention/linear_matrix_attention.py index b03ebca2055..1184b848198 100644 --- a/allennlp/modules/matrix_attention/linear_matrix_attention.py +++ b/allennlp/modules/matrix_attention/linear_matrix_attention.py @@ -40,11 +40,11 @@ class LinearMatrixAttention(MatrixAttention): The dimension of the second tensor, `y`, described above. This is `y.size()[-1]` - the length of the vector that will go into the similarity computation. We need this so we can build weight vectors correctly. - combination : `str`, optional (default="x,y") + combination : `str`, optional (default=`"x,y"`) Described above. - activation : `Activation`, optional (default=linear (i.e. no activation)) - An activation function applied after the `w^T * [x;y] + b` calculation. Default is no - activation. + activation : `Activation`, optional (default=`linear`) + An activation function applied after the `w^T * [x;y] + b` calculation. Default is + linear, i.e. no activation. """ def __init__( diff --git a/allennlp/modules/maxout.py b/allennlp/modules/maxout.py index ffb8893a1de..79b86c97fe1 100644 --- a/allennlp/modules/maxout.py +++ b/allennlp/modules/maxout.py @@ -27,7 +27,7 @@ class Maxout(torch.nn.Module, FromParams): The size of max-pools. If this is a single `int`, we use it for all maxout layers. If it is a `Sequence[int]`, `len(pool_sizes)` must be `num_layers`. - dropout : `Union[float, Sequence[float]]`, optional (default = 0.0) + dropout : `Union[float, Sequence[float]]`, optional (default = `0.0`) If given, we will apply this amount of dropout after each layer. 
Semantics of `float` versus `Sequence[float]` is the same as with other parameters. """ diff --git a/allennlp/modules/sampled_softmax_loss.py b/allennlp/modules/sampled_softmax_loss.py index 7fb01ee631d..c500fd2ab66 100644 --- a/allennlp/modules/sampled_softmax_loss.py +++ b/allennlp/modules/sampled_softmax_loss.py @@ -64,13 +64,13 @@ class SampledSoftmaxLoss(torch.nn.Module): The dimension to softmax over num_samples, `int`, required During training take this many samples. Must be less than num_words. - sparse, `bool`, optional (default = False) + sparse, `bool`, optional (default = `False`) If this is true, we use a sparse embedding matrix. - unk_id, `int`, optional (default = None) + unk_id, `int`, optional (default = `None`) If provided, the id that represents unknown characters. - use_character_inputs, `bool`, optional (default = True) + use_character_inputs, `bool`, optional (default = `True`) Whether to use character inputs - use_fast_sampler, `bool`, optional (default = False) + use_fast_sampler, `bool`, optional (default = `False`) Whether to use the fast cython sampler. """ diff --git a/allennlp/modules/seq2seq_encoders/compose_encoder.py b/allennlp/modules/seq2seq_encoders/compose_encoder.py index 7142b51a3ec..2dbbb00fa74 100644 --- a/allennlp/modules/seq2seq_encoders/compose_encoder.py +++ b/allennlp/modules/seq2seq_encoders/compose_encoder.py @@ -50,7 +50,7 @@ def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch. inputs : `torch.Tensor`, required. A tensor of shape (batch_size, timesteps, input_dim) - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of shape (batch_size, timesteps). # Returns diff --git a/allennlp/modules/seq2seq_encoders/feedforward_encoder.py b/allennlp/modules/seq2seq_encoders/feedforward_encoder.py index ed42ddd7f42..973b339cccd 100644 --- a/allennlp/modules/seq2seq_encoders/feedforward_encoder.py +++ b/allennlp/modules/seq2seq_encoders/feedforward_encoder.py @@ -36,7 +36,7 @@ def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch. inputs : `torch.Tensor`, required. A tensor of shape (batch_size, timesteps, input_dim) - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of shape (batch_size, timesteps). # Returns diff --git a/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py b/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py index 09f05623ebe..398255a2aa5 100644 --- a/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py +++ b/allennlp/modules/seq2seq_encoders/gated_cnn_encoder.py @@ -146,9 +146,9 @@ class GatedCnnEncoder(Seq2SeqEncoder): The dimension of the inputs. layers : `Sequence[Sequence[Sequence[int]]]`, required The layer dimensions for each `ResidualBlock`. - dropout : `float`, optional (default = 0.0) + dropout : `float`, optional (default = `0.0`) The dropout for each `ResidualBlock`. - return_all_layers : `bool`, optional (default = False) + return_all_layers : `bool`, optional (default = `False`) Whether to return all layers or just the last layer. """ diff --git a/allennlp/modules/seq2seq_encoders/pass_through_encoder.py b/allennlp/modules/seq2seq_encoders/pass_through_encoder.py index 33b9bb979f5..6029c5c2bd2 100644 --- a/allennlp/modules/seq2seq_encoders/pass_through_encoder.py +++ b/allennlp/modules/seq2seq_encoders/pass_through_encoder.py @@ -37,7 +37,7 @@ def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch. 
inputs : `torch.Tensor`, required. A tensor of shape (batch_size, timesteps, input_dim) - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of shape (batch_size, timesteps). # Returns diff --git a/allennlp/modules/seq2seq_encoders/pytorch_transformer_wrapper.py b/allennlp/modules/seq2seq_encoders/pytorch_transformer_wrapper.py index 08fff1eb317..9752de641ad 100644 --- a/allennlp/modules/seq2seq_encoders/pytorch_transformer_wrapper.py +++ b/allennlp/modules/seq2seq_encoders/pytorch_transformer_wrapper.py @@ -30,12 +30,12 @@ class PytorchTransformer(Seq2SeqEncoder): The number of stacked self attention -> feedforward -> layer normalisation blocks. num_attention_heads : `int`, required. The number of attention heads to use per layer. - use_positional_encoding : `bool`, optional, (default = True) + use_positional_encoding : `bool`, optional, (default = `True`) Whether to add sinusoidal frequencies to the input tensor. This is strongly recommended, as without this feature, the self attention layers have no idea of absolute or relative position (as they are just computing pairwise similarity between vectors of elements), which can be important features for many tasks. - dropout_prob : `float`, optional, (default = 0.1) + dropout_prob : `float`, optional, (default = `0.1`) The dropout probability for the feedforward network. """ # noqa diff --git a/allennlp/modules/seq2vec_encoders/bert_pooler.py b/allennlp/modules/seq2vec_encoders/bert_pooler.py index 3bbc702552e..bcf45ffb30f 100644 --- a/allennlp/modules/seq2vec_encoders/bert_pooler.py +++ b/allennlp/modules/seq2vec_encoders/bert_pooler.py @@ -25,10 +25,10 @@ class BertPooler(Seq2VecEncoder): The pretrained BERT model to use. If this is a string, we will call `BertModel.from_pretrained(pretrained_model)` and use that. - requires_grad : `bool`, optional, (default = True) + requires_grad : `bool`, optional, (default = `True`) If True, the weights of the pooler will be updated during training. Otherwise they will not. - dropout : `float`, optional, (default = 0.0) + dropout : `float`, optional, (default = `0.0`) Amount of dropout to apply after pooling """ diff --git a/allennlp/modules/seq2vec_encoders/cls_pooler.py b/allennlp/modules/seq2vec_encoders/cls_pooler.py index a70a13ccb15..fe312241d95 100644 --- a/allennlp/modules/seq2vec_encoders/cls_pooler.py +++ b/allennlp/modules/seq2vec_encoders/cls_pooler.py @@ -16,13 +16,13 @@ class ClsPooler(Seq2VecEncoder): # Parameters - embedding_dim: int, optional + embedding_dim: `int`, optional This isn't needed for any computation that we do, but we sometimes rely on `get_input_dim` and `get_output_dim` to check parameter settings, or to instantiate final linear layers. In order to give the right values there, we need to know the embedding dimension. If you're using this with a transformer from the `transformers` library, this can often be found with `model.config.hidden_size`, if you're not sure. - cls_is_last_token: bool, optional + cls_is_last_token: `bool`, optional The [CLS] token is the first token for most of the pretrained transformer models. For some models such as XLNet, however, it is the last token, and we therefore need to select at the end. 
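Taken together, the hunks above converge on one docstring convention: a `# Parameters` section in which both the type annotation and the default value are wrapped in backticks, i.e. "optional (default = `0.0`)" rather than "optional (default = 0.0)". The sketch below shows that target layout on a hypothetical class; the class name and its parameters are invented purely for illustration and are not part of this changeset.

```python
class ExampleEncoder:
    """
    A hypothetical module, included only to illustrate the docstring
    layout that the hunks above standardize on.

    # Parameters

    embedding_dim : `int`, required
        The dimension of the incoming vectors.
    dropout : `float`, optional (default = `0.0`)
        Amount of dropout to apply after encoding.
    pooling : `str`, optional (default = `"mean"`)
        How to pool the sequence into a single vector.
    """

    def __init__(self, embedding_dim: int, dropout: float = 0.0, pooling: str = "mean") -> None:
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.pooling = pooling
```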
diff --git a/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py b/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py index b115b744f3b..95d96912aa3 100644 --- a/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py +++ b/allennlp/modules/seq2vec_encoders/cnn_highway_encoder.py @@ -30,9 +30,9 @@ class CnnHighwayEncoder(Seq2VecEncoder): The number of highway layers. projection_dim : `int`, required The output dimension of the projection layer. - activation : `str`, optional (default = 'relu') + activation : `str`, optional (default = `'relu'`) The activation function for the convolutional layers. - projection_location : `str`, optional (default = 'after_highway') + projection_location : `str`, optional (default = `'after_highway'`) Where to apply the projection layer. Valid values are 'after_highway', 'after_cnn', and None. """ @@ -106,10 +106,10 @@ def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor) -> Dict[str, tor # Parameters - inputs: + inputs: `torch.Tensor` Shape `(batch_size, num_characters, embedding_dim)` Character embeddings representing the current batch. - mask: + mask: `torch.BoolTensor` Shape `(batch_size, num_characters)` Currently unused. The mask for characters is implicit. See TokenCharactersEncoder.forward. diff --git a/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py b/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py index e9ddb1772bf..a2f941b222d 100644 --- a/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py +++ b/allennlp/modules/span_extractors/bidirectional_endpoint_span_extractor.py @@ -42,20 +42,20 @@ class BidirectionalEndpointSpanExtractor(SpanExtractor): input_dim : `int`, required The final dimension of the `sequence_tensor`. - forward_combination : `str`, optional (default = "y-x"). + forward_combination : `str`, optional (default = `"y-x"`). The method used to combine the `forward_start_embeddings` and `forward_end_embeddings` for the forward direction of the bidirectional representation. See above for a full description. - backward_combination : `str`, optional (default = "x-y"). + backward_combination : `str`, optional (default = `"x-y"`). The method used to combine the `backward_start_embeddings` and `backward_end_embeddings` for the backward direction of the bidirectional representation. See above for a full description. - num_width_embeddings : `int`, optional (default = None). + num_width_embeddings : `int`, optional (default = `None`). Specifies the number of buckets to use when representing span width features. - span_width_embedding_dim : `int`, optional (default = None). + span_width_embedding_dim : `int`, optional (default = `None`). The embedding size for the span_width features. - bucket_widths : `bool`, optional (default = False). + bucket_widths : `bool`, optional (default = `False`). Whether to bucket the span widths into log-space buckets. If `False`, the raw span widths are used. use_sentinels : `bool`, optional (default = `True`). diff --git a/allennlp/modules/span_extractors/endpoint_span_extractor.py b/allennlp/modules/span_extractors/endpoint_span_extractor.py index 2a46b7daf08..d2387759f08 100644 --- a/allennlp/modules/span_extractors/endpoint_span_extractor.py +++ b/allennlp/modules/span_extractors/endpoint_span_extractor.py @@ -30,15 +30,15 @@ class EndpointSpanExtractor(SpanExtractor): input_dim : `int`, required. The final dimension of the `sequence_tensor`. - combination : `str`, optional (default = "x,y"). 
+ combination : `str`, optional (default = `"x,y"`). The method used to combine the `start_embedding` and `end_embedding` representations. See above for a full description. - num_width_embeddings : `int`, optional (default = None). + num_width_embeddings : `int`, optional (default = `None`). Specifies the number of buckets to use when representing span width features. - span_width_embedding_dim : `int`, optional (default = None). + span_width_embedding_dim : `int`, optional (default = `None`). The embedding size for the span_width features. - bucket_widths : `bool`, optional (default = False). + bucket_widths : `bool`, optional (default = `False`). Whether to bucket the span widths into log-space buckets. If `False`, the raw span widths are used. use_exclusive_start_indices : `bool`, optional (default = `False`). diff --git a/allennlp/modules/stacked_alternating_lstm.py b/allennlp/modules/stacked_alternating_lstm.py index 482700e8645..fc9eb5a9ee3 100644 --- a/allennlp/modules/stacked_alternating_lstm.py +++ b/allennlp/modules/stacked_alternating_lstm.py @@ -29,10 +29,10 @@ class StackedAlternatingLstm(torch.nn.Module): The dimension of the outputs of the LSTM. num_layers : `int`, required The number of stacked LSTMs to use. - recurrent_dropout_probability : `float`, optional (default = 0.0) + recurrent_dropout_probability : `float`, optional (default = `0.0`) The dropout probability to be used in a dropout scheme as stated in [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks][1]. - use_input_projection_bias : `bool`, optional (default = True) + use_input_projection_bias : `bool`, optional (default = `True`) Whether or not to use a bias on the input projection layer. This is mainly here for backwards compatibility reasons and will be removed (and set to False) in future releases. @@ -87,7 +87,7 @@ def forward( inputs : `PackedSequence`, required. A batch first `PackedSequence` to run the stacked LSTM over. - initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None) + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, (default = `None`) A tuple (state, memory) representing the initial hidden state and memory of the LSTM. Each tensor has shape (1, batch_size, output_dimension). diff --git a/allennlp/modules/stacked_bidirectional_lstm.py b/allennlp/modules/stacked_bidirectional_lstm.py index 95c80a57fba..4ed74efe9f8 100644 --- a/allennlp/modules/stacked_bidirectional_lstm.py +++ b/allennlp/modules/stacked_bidirectional_lstm.py @@ -28,15 +28,15 @@ class StackedBidirectionalLstm(torch.nn.Module): The dimension of the outputs of the LSTM. num_layers : `int`, required The number of stacked Bidirectional LSTMs to use. - recurrent_dropout_probability : `float`, optional (default = 0.0) + recurrent_dropout_probability : `float`, optional (default = `0.0`) The recurrent dropout probability to be used in a dropout scheme as stated in [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks][0]. - layer_dropout_probability : `float`, optional (default = 0.0) + layer_dropout_probability : `float`, optional (default = `0.0`) The layer wise dropout probability to be used in a dropout scheme as stated in [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks][0]. - use_highway : `bool`, optional (default = True) + use_highway : `bool`, optional (default = `True`) Whether or not to use highway connections between layers. 
This effectively involves reparameterising the normal output of an LSTM as:: @@ -97,7 +97,7 @@ def forward( inputs : `PackedSequence`, required. A batch first `PackedSequence` to run the stacked LSTM over. - initial_state : Tuple[torch.Tensor, torch.Tensor], optional, (default = None) + initial_state : `Tuple[torch.Tensor, torch.Tensor]`, optional, (default = `None`) A tuple (state, memory) representing the initial hidden state and memory of the LSTM. Each tensor has shape (num_layers, batch_size, output_dimension * 2). diff --git a/allennlp/modules/text_field_embedders/text_field_embedder.py b/allennlp/modules/text_field_embedders/text_field_embedder.py index f01a4228ec0..fb9db2488c5 100644 --- a/allennlp/modules/text_field_embedders/text_field_embedder.py +++ b/allennlp/modules/text_field_embedders/text_field_embedder.py @@ -33,7 +33,7 @@ def forward( A dictionary that was the output of a call to `TextField.as_tensor`. Each tensor in here is assumed to have a shape roughly similar to `(batch_size, sequence_length)` (perhaps with an extra trailing dimension for the characters in each token). - num_wrapping_dims : `int`, optional (default=0) + num_wrapping_dims : `int`, optional (default=`0`) If you have a `ListField[TextField]` that created the `text_field_input`, you'll end up with tensors of shape `(batch_size, wrapping_dim1, wrapping_dim2, ..., sequence_length)`. This parameter tells us how many wrapping dimensions there are, so diff --git a/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py b/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py index 2083d7e4feb..ebc18522011 100644 --- a/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py +++ b/allennlp/modules/token_embedders/bag_of_word_counts_token_embedder.py @@ -22,7 +22,7 @@ class BagOfWordCountsTokenEmbedder(TokenEmbedder): # Parameters vocab : `Vocabulary` - vocab_namespace : `str`, optional (default = "tokens") + vocab_namespace : `str`, optional (default = `"tokens"`) namespace of vocabulary to embed projection_dim : `int`, optional (default = `None`) if specified, will project the resulting bag of words representation diff --git a/allennlp/modules/token_embedders/elmo_token_embedder.py b/allennlp/modules/token_embedders/elmo_token_embedder.py index 83ae83a6023..5c609b10a92 100644 --- a/allennlp/modules/token_embedders/elmo_token_embedder.py +++ b/allennlp/modules/token_embedders/elmo_token_embedder.py @@ -25,7 +25,7 @@ class ElmoTokenEmbedder(TokenEmbedder): An ELMo hdf5 weight file. do_layer_norm : `bool`, optional. Should we apply layer normalization (passed to `ScalarMix`)? - dropout : `float`, optional, (default = 0.5). + dropout : `float`, optional, (default = `0.5`). The dropout value to be applied to the ELMo representations. requires_grad : `bool`, optional If True, compute gradient of ELMo parameters for fine tuning. @@ -39,7 +39,7 @@ class ElmoTokenEmbedder(TokenEmbedder): indices of shape (batch_size, timesteps) to forward, instead of character indices. If you use this option and pass a word which wasn't pre-cached, this will break. - scalar_mix_parameters : `List[int]`, optional, (default=None) + scalar_mix_parameters : `List[int]`, optional, (default=`None`) If not `None`, use these scalar mix parameters to weight the representations produced by different layers. These mixing weights are not updated during training. 
The mixing weights here should be the unnormalized (i.e., pre-softmax) diff --git a/allennlp/modules/token_embedders/embedding.py b/allennlp/modules/token_embedders/embedding.py index f6b26eacce4..a64cae06016 100644 --- a/allennlp/modules/token_embedders/embedding.py +++ b/allennlp/modules/token_embedders/embedding.py @@ -50,36 +50,36 @@ class Embedding(TokenEmbedder): Size of the dictionary of embeddings (vocabulary size). embedding_dim : `int` The size of each embedding vector. - projection_dim : `int`, (optional, default=None) + projection_dim : `int`, optional (default=`None`) If given, we add a projection layer after the embedding layer. This really only makes sense if `trainable` is `False`. - weight : `torch.FloatTensor`, (optional, default=None) + weight : `torch.FloatTensor`, optional (default=`None`) A pre-initialised weight matrix for the embedding lookup, allowing the use of pretrained vectors. - padding_index : `int`, (optional, default=None) + padding_index : `int`, optional (default=`None`) If given, pads the output with zeros whenever it encounters the index. - trainable : `bool`, (optional, default=True) + trainable : `bool`, optional (default=`True`) Whether or not to optimize the embedding parameters. - max_norm : `float`, (optional, default=None) + max_norm : `float`, optional (default=`None`) If given, will renormalize the embeddings to always have a norm lesser than this - norm_type : `float`, (optional, default=2) + norm_type : `float`, optional (default=`2`) The p of the p-norm to compute for the max_norm option - scale_grad_by_freq : `bool`, (optional, default=False) + scale_grad_by_freq : `bool`, optional (default=`False`) If given, this will scale gradients by the frequency of the words in the mini-batch. - sparse : `bool`, (optional, default=False) + sparse : `bool`, optional (default=`False`) Whether or not the Pytorch backend should use a sparse representation of the embedding weight. - vocab_namespace : `str`, (optional, default=None) + vocab_namespace : `str`, optional (default=`None`) In case of fine-tuning/transfer learning, the model's embedding matrix needs to be extended according to the size of extended-vocabulary. To be able to know how much to extend the embedding-matrix, it's necessary to know which vocab_namspace was used to construct it in the original training. We store vocab_namespace used during the original training as an attribute, so that it can be retrieved during fine-tuning. - pretrained_file : `str`, (optional, default=None) + pretrained_file : `str`, optional (default=`None`) Path to a file of word vectors to initialize the embedding matrix. It can be the path to a local file or a URL of a (cached) remote file. Two formats are supported: * hdf5 file - containing an embedding matrix in the form of a torch.Tensor; * text file - an utf-8 encoded text file with space separated fields. - vocab : `Vocabulary` (optional, default = None) + vocab : `Vocabulary`, optional (default = `None`) Used to construct an embedding from a pretrained file. # Returns @@ -226,16 +226,16 @@ def extend_vocab( extended_vocab : `Vocabulary` Vocabulary extended from original vocabulary used to construct this `Embedding`. - vocab_namespace : `str`, (optional, default=None) + vocab_namespace : `str`, (optional, default=`None`) In case you know what vocab_namespace should be used for extension, you can pass it. If not passed, it will check if vocab_namespace used at the time of `Embedding` construction is available. 
If so, this namespace will be used or else extend_vocab will be a no-op. - extension_pretrained_file : `str`, (optional, default=None) + extension_pretrained_file : `str`, (optional, default=`None`) A file containing pretrained embeddings can be specified here. It can be the path to a local file or an URL of a (cached) remote file. Check format details in `from_params` of `Embedding` class. - model_path : `str`, (optional, default=None) + model_path : `str`, (optional, default=`None`) Path traversing the model attributes upto this embedding module. Eg. "_text_field_embedder.token_embedder_tokens". This is only useful to give a helpful error message when extend_vocab is implicitly called @@ -348,9 +348,9 @@ def _read_pretrained_embeddings_file( vocab : `Vocabulary`, required. A Vocabulary object. - namespace : `str`, (optional, default=tokens) + namespace : `str`, (optional, default=`"tokens"`) The namespace of the vocabulary to find pretrained embeddings for. - trainable : `bool`, (optional, default=True) + trainable : `bool`, (optional, default=`True`) Whether or not the embedding parameters should be optimized. # Returns diff --git a/allennlp/modules/token_embedders/pretrained_transformer_embedder.py b/allennlp/modules/token_embedders/pretrained_transformer_embedder.py index 68662f4ca6c..14dc5115f89 100644 --- a/allennlp/modules/token_embedders/pretrained_transformer_embedder.py +++ b/allennlp/modules/token_embedders/pretrained_transformer_embedder.py @@ -25,7 +25,7 @@ class PretrainedTransformerEmbedder(TokenEmbedder): model_name : `str` The name of the `transformers` model to use. Should be the same as the corresponding `PretrainedTransformerIndexer`. - max_length : `int`, optional (default = None) + max_length : `int`, optional (default = `None`) If positive, folds input token IDs into multiple segments of this length, pass them through the transformer model independently, and concatenate the final representations. Should be set to the same value as the `max_length` option on the @@ -69,25 +69,21 @@ def forward( """ # Parameters - token_ids: torch.LongTensor - Shape: [ - batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces - ]. + token_ids: `torch.LongTensor` + Shape: `[batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces]`. num_segment_concat_wordpieces is num_wordpieces plus special tokens inserted in the middle, e.g. the length of: "[CLS] A B C [SEP] [CLS] D E F [SEP]" (see indexer logic). - mask: torch.BoolTensor + mask: `torch.BoolTensor` Shape: [batch_size, num_wordpieces]. - type_ids: Optional[torch.LongTensor] - Shape: [ - batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces - ]. - segment_concat_mask: Optional[torch.BoolTensor] - Shape: [batch_size, num_segment_concat_wordpieces]. + type_ids: `Optional[torch.LongTensor]` + Shape: `[batch_size, num_wordpieces if max_length is None else num_segment_concat_wordpieces]`. + segment_concat_mask: `Optional[torch.BoolTensor]` + Shape: `[batch_size, num_segment_concat_wordpieces]`. # Returns `torch.Tensor` - Shape: [batch_size, num_wordpieces, embedding_size]. + Shape: `[batch_size, num_wordpieces, embedding_size]`. """ @@ -147,14 +143,14 @@ def _fold_long_sequences( # Parameters token_ids: `torch.LongTensor` - Shape: [batch_size, num_segment_concat_wordpieces]. + Shape: `[batch_size, num_segment_concat_wordpieces]`. num_segment_concat_wordpieces is num_wordpieces plus special tokens inserted in the middle, i.e. 
the length of: "[CLS] A B C [SEP] [CLS] D E F [SEP]" (see indexer logic). mask: `torch.BoolTensor` - Shape: [batch_size, num_segment_concat_wordpieces]. + Shape: `[batch_size, num_segment_concat_wordpieces]`. The mask for the concatenated segments of wordpieces. The same as `segment_concat_mask` in `forward()`. - type_ids: Optional[torch.LongTensor] + type_ids: `Optional[torch.LongTensor]` Shape: [batch_size, num_segment_concat_wordpieces]. # Returns: diff --git a/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py b/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py index 8e6c979c4b8..9e60bf58e04 100644 --- a/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py +++ b/allennlp/modules/token_embedders/pretrained_transformer_mismatched_embedder.py @@ -20,7 +20,7 @@ class PretrainedTransformerMismatchedEmbedder(TokenEmbedder): model_name : `str` The name of the `transformers` model to use. Should be the same as the corresponding `PretrainedTransformerMismatchedIndexer`. - max_length : `int`, optional (default = None) + max_length : `int`, optional (default = `None`) If positive, folds input token IDs into multiple segments of this length, pass them through the transformer model independently, and concatenate the final representations. Should be set to the same value as the `max_length` option on the @@ -49,20 +49,20 @@ def forward( """ # Parameters - token_ids: torch.LongTensor + token_ids: `torch.LongTensor` Shape: [batch_size, num_wordpieces] (for exception see `PretrainedTransformerEmbedder`). - mask: torch.BoolTensor + mask: `torch.BoolTensor` Shape: [batch_size, num_orig_tokens]. - offsets: torch.LongTensor + offsets: `torch.LongTensor` Shape: [batch_size, num_orig_tokens, 2]. Maps indices for the original tokens, i.e. those given as input to the indexer, to a span in token_ids. `token_ids[i][offsets[i][j][0]:offsets[i][j][1] + 1]` corresponds to the original j-th token from the i-th batch. - wordpiece_mask: torch.BoolTensor + wordpiece_mask: `torch.BoolTensor` Shape: [batch_size, num_wordpieces]. - type_ids: Optional[torch.LongTensor] + type_ids: `Optional[torch.LongTensor]` Shape: [batch_size, num_wordpieces]. - segment_concat_mask: Optional[torch.BoolTensor] + segment_concat_mask: `Optional[torch.BoolTensor]` See `PretrainedTransformerEmbedder`. # Returns diff --git a/allennlp/nn/beam_search.py b/allennlp/nn/beam_search.py index d9a6a56b163..b9d281fe677 100644 --- a/allennlp/nn/beam_search.py +++ b/allennlp/nn/beam_search.py @@ -20,12 +20,12 @@ class BeamSearch: end_index : `int` The index of the "stop" or "end" token in the target vocabulary. - max_steps : `int`, optional (default = 50) + max_steps : `int`, optional (default = `50`) The maximum number of decoding steps to take, i.e. the maximum length of the predicted sequences. - beam_size : `int`, optional (default = 10) + beam_size : `int`, optional (default = `10`) The width of the beam used. - per_node_beam_size : `int`, optional (default = beam_size) + per_node_beam_size : `int`, optional (default = `beam_size`) The maximum number of candidates to consider per node, at each step in the search. If not given, this just defaults to `beam_size`. 
Setting this parameter to a number smaller than `beam_size` may give better results, as it can introduce diff --git a/allennlp/nn/chu_liu_edmonds.py b/allennlp/nn/chu_liu_edmonds.py index 8219a74eb34..74d9726ddcc 100644 --- a/allennlp/nn/chu_liu_edmonds.py +++ b/allennlp/nn/chu_liu_edmonds.py @@ -23,7 +23,7 @@ def decode_mst( length : `int`, required. The length of this sequence, as the energy may have come from a padded batch. - has_labels : `bool`, optional, (default = True) + has_labels : `bool`, optional, (default = `True`) Whether the graph has labels or not. """ if has_labels and energy.ndim != 3: diff --git a/allennlp/nn/initializers.py b/allennlp/nn/initializers.py index 919a46c9ded..ed6bd5d4584 100644 --- a/allennlp/nn/initializers.py +++ b/allennlp/nn/initializers.py @@ -72,7 +72,7 @@ def uniform_unit_scaling(tensor: torch.Tensor, nonlinearity: str = "linear"): tensor : `torch.Tensor`, required. The tensor to initialise. - nonlinearity : `str`, optional (default = "linear") + nonlinearity : `str`, optional (default = `"linear"`) The non-linearity which is performed after the projection that this tensor is involved in. This must be the name of a function contained in the `torch.nn.functional` package. @@ -107,12 +107,12 @@ def block_orthogonal(tensor: torch.Tensor, split_sizes: List[int], gain: float = tensor : `torch.Tensor`, required. A tensor to initialize. - split_sizes : List[int], required. + split_sizes : `List[int]`, required. A list of length `tensor.ndim()` specifying the size of the blocks along that particular dimension. E.g. `[10, 20]` would result in the tensor being split into chunks of size 10 along the first dimension and 20 along the second. - gain : float, optional (default = 1.0) + gain : `float`, optional (default = `1.0`) The gain (scaling) applied to the orthogonal initialization. """ data = tensor.data @@ -375,7 +375,7 @@ class PretrainedModelInitializer(Initializer): weights_file_path : `str`, required The path to the weights file which has the pretrained model parameters. - parameter_name_overrides : `Dict[str, str]`, optional (default = None) + parameter_name_overrides : `Dict[str, str]`, optional (default = `None`) The mapping from the new parameter name to the name which should be used to index into the pretrained model parameters. If a parameter name is not specified, the initializer will use the parameter's default name as the key. @@ -440,12 +440,12 @@ class InitializerApplicator(FromParams): # Parameters - regexes : `List[Tuple[str, Initializer]]`, optional (default = []) + regexes : `List[Tuple[str, Initializer]]`, optional (default = `[]`) A list mapping parameter regexes to initializers. We will check each parameter against each regex in turn, and apply the initializer paired with the first matching regex, if any. - prevent_regexes: `List[str]`, optional (default=None) + prevent_regexes: `List[str]`, optional (default=`None`) Any parameter name matching one of these regexes will not be initialized, regardless of whether it matches one of the regexes passed in the `regexes` parameter. """ @@ -465,7 +465,7 @@ def __call__(self, module: torch.nn.Module) -> None: # Parameters - module : torch.nn.Module, required. + module : `torch.nn.Module`, required. The Pytorch module to apply the initializers to. 
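To make the block-splitting convention of `block_orthogonal` (documented earlier in this hunk) concrete, here is a minimal, purely illustrative sketch; the tensor shape and the LSTM framing are invented for the example.

```python
import torch
from allennlp.nn.initializers import block_orthogonal

# A (4 * 64) x 128 weight matrix viewed as four stacked 64 x 128 blocks,
# roughly how the gate weights of an LSTM are laid out.
weights = torch.empty(4 * 64, 128)
block_orthogonal(weights, split_sizes=[64, 128], gain=1.0)
# Each 64 x 128 block has now been orthogonally initialized on its own,
# rather than treating the full 256 x 128 matrix as a single unit.
```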
""" logger.info("Initializing parameters") diff --git a/allennlp/nn/regularizers/regularizer_applicator.py b/allennlp/nn/regularizers/regularizer_applicator.py index 0cc455897b2..5b6c6144c07 100644 --- a/allennlp/nn/regularizers/regularizer_applicator.py +++ b/allennlp/nn/regularizers/regularizer_applicator.py @@ -16,7 +16,7 @@ def __init__(self, regexes: List[Tuple[str, Regularizer]] = None) -> None: """ # Parameters - regexes : List[Tuple[str, Regularizer]], optional (default = None) + regexes : `List[Tuple[str, Regularizer]]`, optional (default = `None`) A sequence of pairs (regex, Regularizer), where each Regularizer applies to the parameters its regex matches (and that haven't previously been matched). @@ -27,7 +27,7 @@ def __call__(self, module: torch.nn.Module) -> torch.Tensor: """ # Parameters - module : torch.nn.Module, required + module : `torch.nn.Module`, required The module to regularize. """ accumulator = 0.0 diff --git a/allennlp/nn/util.py b/allennlp/nn/util.py index bc7f31a4c13..2934dc10c3c 100644 --- a/allennlp/nn/util.py +++ b/allennlp/nn/util.py @@ -145,9 +145,9 @@ def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor): # Parameters - tensor : torch.FloatTensor, required. + tensor : `torch.FloatTensor`, required. A batch first Pytorch tensor. - sequence_lengths : torch.LongTensor, required. + sequence_lengths : `torch.LongTensor`, required. A tensor representing the lengths of some dimension of the tensor which we want to sort by. @@ -221,9 +221,9 @@ def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.Tenso # Parameters - dropout_probability : float, required. + dropout_probability : `float`, required. Probability of dropping a dimension of the input. - tensor_for_masking : torch.Tensor, required. + tensor_for_masking : `torch.Tensor`, required. # Returns @@ -413,13 +413,13 @@ def viterbi_decode( # Parameters - tag_sequence : torch.Tensor, required. + tag_sequence : `torch.Tensor`, required. A tensor of shape (sequence_length, num_tags) representing scores for a set of tags over a given sequence. - transition_matrix : torch.Tensor, required. + transition_matrix : `torch.Tensor`, required. A tensor of shape (num_tags, num_tags) representing the binary potentials for transitioning between a given pair of tags. - tag_observations : Optional[List[int]], optional, (default = None) + tag_observations : `Optional[List[int]]`, optional, (default = `None`) A list of length `sequence_length` containing the class ids of observed elements in the sequence, with unobserved elements being set to -1. Note that it is possible to provide evidence which results in degenerate labelings if @@ -427,15 +427,15 @@ def viterbi_decode( other, or those transitions are extremely unlikely. In this situation we log a warning, but the responsibility for providing self-consistent evidence ultimately lies with the user. - allowed_start_transitions : torch.Tensor, optional, (default = None) + allowed_start_transitions : `torch.Tensor`, optional, (default = `None`) An optional tensor of shape (num_tags,) describing which tags the START token may transition *to*. If provided, additional transition constraints will be used for determining the start element of the sequence. - allowed_end_transitions : torch.Tensor, optional, (default = None) + allowed_end_transitions : `torch.Tensor`, optional, (default = `None`) An optional tensor of shape (num_tags,) describing which tags may transition *to* the end tag. 
If provided, additional transition constraints will be used for determining the end element of the sequence. - top_k : int, optional, (default = None) + top_k : `int`, optional, (default = `None`) Optional integer specifying how many of the top paths to return. For top_k>=1, returns a tuple of two lists: top_k_paths, top_k_scores, For top_k==None, returns a flattened tuple with just the top path and its score (not in lists, for backwards compatibility). @@ -716,20 +716,20 @@ def sequence_cross_entropy_with_logits( index of the true class for each corresponding step. weights : `Union[torch.FloatTensor, torch.BoolTensor]`, required. A `torch.FloatTensor` of size (batch, sequence_length) - average: str, optional (default = "batch") + average: `str`, optional (default = `"batch"`) If "batch", average the loss across the batches. If "token", average the loss across each item in the input. If `None`, return a vector of losses per batch element. - label_smoothing : `float`, optional (default = None) + label_smoothing : `float`, optional (default = `None`) Whether or not to apply label smoothing to the cross-entropy loss. For example, with a label smoothing value of 0.2, a 4 class classification target would look like `[0.05, 0.05, 0.85, 0.05]` if the 3rd class was the correct label. - gamma : `float`, optional (default = None) + gamma : `float`, optional (default = `None`) Focal loss[*] focusing parameter `gamma` to reduces the relative loss for well-classified examples and put more focus on hard. The greater value `gamma` is, the more focus on hard examples. - alpha : `float` or `List[float]`, optional (default = None) + alpha : `Union[float, List[float]]`, optional (default = `None`) Focal loss[*] weighting factor `alpha` to balance between classes. Can be used independently with `gamma`. If a single `float` is provided, it is assumed binary case using `alpha` and `1 - alpha` for positive and @@ -1131,11 +1131,11 @@ def logsumexp(tensor: torch.Tensor, dim: int = -1, keepdim: bool = False) -> tor # Parameters - tensor : torch.FloatTensor, required. + tensor : `torch.FloatTensor`, required. A tensor of arbitrary size. - dim : int, optional (default = -1) + dim : `int`, optional (default = `-1`) The dimension of the tensor to apply the logsumexp to. - keepdim: bool, optional (default = False) + keepdim: `bool`, optional (default = `False`) Whether to retain a dimension of size one at the dimension we reduce over. """ max_score, _ = tensor.max(dim, keepdim=keepdim) @@ -1233,7 +1233,7 @@ def batched_index_select( indices : `torch.LongTensor` A tensor of shape (batch_size, ...), where each element is an index into the `sequence_length` dimension of the `target` tensor. - flattened_indices : Optional[torch.Tensor], optional (default = None) + flattened_indices : `Optional[torch.Tensor]`, optional (default = `None`) An optional tensor representing the result of calling `flatten_and_batch_shift_indices` on `indices`. This is helpful in the case that the indices can be flattened once and cached for many batch lookups. @@ -1384,9 +1384,9 @@ def bucket_values( distances : `torch.Tensor`, required. A Tensor of any size, to be bucketed. - num_identity_buckets: int, optional (default = 4). + num_identity_buckets: `int`, optional (default = `4`). The number of identity buckets (those only holding a single value). - num_total_buckets : int, (default = 10) + num_total_buckets : `int`, (default = `10`) The total number of buckets to bucket values into. 
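A small, hedged usage sketch for `bucket_values`; the distance values below are arbitrary, and the exact bucket boundaries are determined by the implementation rather than by this example.

```python
import torch
from allennlp.nn.util import bucket_values

# Distance-like values: the smallest ones keep their own "identity" buckets,
# larger ones fall into progressively coarser, roughly logarithmic buckets.
distances = torch.tensor([0, 1, 2, 3, 5, 10, 50, 250])
buckets = bucket_values(distances, num_identity_buckets=4, num_total_buckets=10)
# `buckets` has the same shape as `distances`; entries should lie in
# [0, num_total_buckets).
```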
# Returns @@ -1430,9 +1430,11 @@ def add_sentence_boundary_token_ids( A tensor of shape `(batch_size, timesteps)` or `(batch_size, timesteps, dim)` mask : `torch.BoolTensor` A tensor of shape `(batch_size, timesteps)` - sentence_begin_token: Any (anything that can be broadcast in torch for assignment) + sentence_begin_token: `Any` + Can be anything that can be broadcast in torch for assignment. For 2D input, a scalar with the `` id. For 3D input, a tensor with length dim. - sentence_end_token: Any (anything that can be broadcast in torch for assignment) + sentence_end_token: `Any` + Can be anything that can be broadcast in torch for assignment. For 2D input, a scalar with the `` id. For 3D input, a tensor with length dim. # Returns @@ -1536,9 +1538,9 @@ def add_positional_features( tensor : `torch.Tensor` a Tensor with shape (batch_size, timesteps, hidden_dim). - min_timescale : `float`, optional (default = 1.0) + min_timescale : `float`, optional (default = `1.0`) The smallest timescale to use. - max_timescale : `float`, optional (default = 1.0e4) + max_timescale : `float`, optional (default = `1.0e4`) The largest timescale to use. # Returns diff --git a/allennlp/predictors/predictor.py b/allennlp/predictors/predictor.py index 3327954fa4a..53e7a5887c3 100644 --- a/allennlp/predictors/predictor.py +++ b/allennlp/predictors/predictor.py @@ -70,17 +70,17 @@ def get_gradients(self, instances: List[Instance]) -> Tuple[Dict[str, Any], Dict # Parameters - instances: List[Instance] + instances : `List[Instance]` # Returns - Tuple[Dict[str, Any], Dict[str, Any]] - The first item is a Dict of gradient entries for each input. - The keys have the form `{grad_input_1: ..., grad_input_2: ... }` - up to the number of inputs given. The second item is the model's output. + `Tuple[Dict[str, Any], Dict[str, Any]]` + The first item is a Dict of gradient entries for each input. + The keys have the form `{grad_input_1: ..., grad_input_2: ... }` + up to the number of inputs given. The second item is the model's output. + + # Notes - Notes - ----- Takes a `JsonDict` representing the inputs of the model and converts them to [`Instances`](../data/instance.md)), sends these through the model [`forward`](../models/model.md#forward) function after registering hooks on the embedding @@ -235,16 +235,16 @@ def from_path( archive_path : `str` The path to the archive. - predictor_name : `str`, optional (default=None) + predictor_name : `str`, optional (default=`None`) Name that the predictor is registered as, or None to use the predictor associated with the model. - cuda_device : `int`, optional (default=-1) + cuda_device : `int`, optional (default=`-1`) If `cuda_device` is >= 0, the model will be loaded onto the corresponding GPU. Otherwise it will be loaded onto the CPU. - dataset_reader_to_load : `str`, optional (default="validation") + dataset_reader_to_load : `str`, optional (default=`"validation"`) Which dataset reader to load from the archive, either "train" or "validation". - frozen : `bool`, optional (default=True) + frozen : `bool`, optional (default=`True`) If we should call `model.eval()` when building the predictor. # Returns diff --git a/allennlp/predictors/sentence_tagger.py b/allennlp/predictors/sentence_tagger.py index 77b05032611..0129a0ca21d 100644 --- a/allennlp/predictors/sentence_tagger.py +++ b/allennlp/predictors/sentence_tagger.py @@ -55,10 +55,15 @@ def predictions_to_labeled_instances( We then return a list of those Instances. 
For example: + + ```text Mary went to Seattle to visit Microsoft Research U-Per O O U-Loc O O B-Org L-Org + ``` We create three instances. + + ```text Mary went to Seattle to visit Microsoft Research U-Per O O O O O O O @@ -67,6 +72,7 @@ def predictions_to_labeled_instances( Mary went to Seattle to visit Microsoft Research O O O O O O B-Org L-Org + ``` We additionally add a flag to these instances to tell the model to only compute loss on non-O tags, so that we get gradients that are specific to the particular span prediction diff --git a/allennlp/tests/commands/docstring_help_test.py b/allennlp/tests/commands/docstring_help_test.py deleted file mode 100644 index 1cc1c5bcbf5..00000000000 --- a/allennlp/tests/commands/docstring_help_test.py +++ /dev/null @@ -1,57 +0,0 @@ -import argparse -import importlib -import io -import pkgutil -import re - -import allennlp -from allennlp.commands import create_parser -from allennlp.common.testing import AllenNlpTestCase - - -def _subcommand_help_output(subcommand: str) -> str: - parser = create_parser("allennlp") - for action in parser._actions: - if isinstance(action, argparse._SubParsersAction): - file = io.StringIO() - action._name_parser_map[subcommand].print_help(file) - file.seek(0) - return file.read() - raise LookupError( - "The main program parser does not contain a argparse._SubParsersAction object" - ) - - -class TestDocstringHelp(AllenNlpTestCase): - RE_DOCSTRING_CALL_SUBCOMMAND_HELP = re.compile(r"^\s*\$ (allennlp (\S+) --help)$", re.MULTILINE) - RE_STARTS_WITH_INDENTATION = re.compile(r"^ {4}", re.MULTILINE) - - def test_docstring_help(self): - parent_module = allennlp.commands - for module_info in pkgutil.iter_modules( - parent_module.__path__, parent_module.__name__ + "." - ): - module = importlib.import_module(module_info.name) - match = self.RE_DOCSTRING_CALL_SUBCOMMAND_HELP.search(module.__doc__) - if match: - expected_output = self.RE_STARTS_WITH_INDENTATION.sub( - "", module.__doc__[match.end(0) + 1 :] - ) - - str_call_subcommand_help = match.group(1) - subcommand = match.group(2) - actual_output = _subcommand_help_output(subcommand) - - assert expected_output == actual_output, ( - f"The documentation for the subcommand usage" - f" in the module {module_info.name}" - f" does not match the output of running" - f" `{str_call_subcommand_help}`." - f" Please update the docstring to match the" - f" output." - ) - else: - assert module_info.name in [parent_module.__name__ + ".subcommand"], ( - f"The documentation for the subcommand usage was not found within the docstring of" - f" the module {module_info.name}", - ) diff --git a/allennlp/tests/fixtures/__init__.py b/allennlp/tests/fixtures/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/allennlp/tests/fixtures/tools/__init__.py b/allennlp/tests/fixtures/tools/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/allennlp/tests/fixtures/tools/py2md/__init__.py b/allennlp/tests/fixtures/tools/py2md/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/allennlp/tests/fixtures/tools/py2md/basic_example.py b/allennlp/tests/fixtures/tools/py2md/basic_example.py new file mode 100644 index 00000000000..e68034f7a90 --- /dev/null +++ b/allennlp/tests/fixtures/tools/py2md/basic_example.py @@ -0,0 +1,118 @@ +""" +This is a docstring. +""" + +SOME_GLOBAL_VAR = "Ahhhh I'm a global var!!" +""" +This is a global var. +""" + + +def func_with_no_args(): + """ + This function has no args. 
+ """ + return None + + +def func_with_args(a: int, b: int, c: int = 3) -> int: + """ + This function has some args. + + # Parameters + + a : `int` + A number. + b : `int` + Another number. + c : `int`, optional (default = `3`) + Yet another number. + + # Returns + + `int` + The result of `a + b * c`. + """ + return a + b * c + + +class SomeClass: + """ + I'm a class! + + # Paramaters + + x : `float` + This attribute is called `x`. + """ + + some_class_level_variable = 1 + """ + This is how you document a class-level variable. + """ + + some_class_level_var_with_type: int = 1 + + def __init__(self) -> None: + self.x = 1.0 + + def _private_method(self) -> None: + """ + Private methods should not be included in documentation. + """ + pass + + def some_method(self) -> None: + """ + I'm a method! + + But I don't do anything. + + # Returns + + `None` + """ + return None + + def method_with_alternative_return_section(self) -> int: + """ + Another method. + + # Returns + + A completely arbitrary number. + """ + return 3 + + def method_with_alternative_return_section3(self) -> int: + """ + Another method. + + # Returns + + number : `int` + A completely arbitrary number. + """ + return 3 + + +class AnotherClassWithReallyLongConstructor: + def __init__( + self, + a_really_long_argument_name: int = 0, + another_long_name: float = 2, + these_variable_names_are_terrible: str = "yea I know", + **kwargs, + ) -> None: + self.a = a_really_long_argument_name + self.b = another_long_name + self.c = these_variable_names_are_terrible + self.other = kwargs + + +class _PrivateClass: + def public_method_on_private_class(self): + """ + This should not be documented since the class is private. + """ + pass diff --git a/allennlp/tests/fixtures/tools/py2md/basic_example_expected_output.md b/allennlp/tests/fixtures/tools/py2md/basic_example_expected_output.md new file mode 100644 index 00000000000..ec0a3e2f39d --- /dev/null +++ b/allennlp/tests/fixtures/tools/py2md/basic_example_expected_output.md @@ -0,0 +1,140 @@ +[ *allennlp**.tests**.fixtures**.tools**.py2md***.basic_example** ] + +--- + +This is a docstring. + + +## SOME\_GLOBAL\_VAR + +```python +SOME_GLOBAL_VAR = "Ahhhh I'm a global var!!" +``` + +This is a global var. + + +## func\_with\_no\_args + +```python +def func_with_no_args() +``` + +This function has no args. + + +## func\_with\_args + +```python +def func_with_args(a: int, b: int, c: int = 3) -> int +``` + +This function has some args. + +Parameters + + +- __a__ : `int`
+ A number. +- __b__ : `int`
+ Another number. +- __c__ : `int`, optional (default = `3`)
+ Yet another number. + +Returns + + +- `int`
+ The result of `a + b * c`. + + +## SomeClass Objects + +```python +class SomeClass(): + | def __init__(self) -> None +``` + +I'm a class! + +Paramaters + + +x : `float` + This attribute is called `x`. + + +### some\_class\_level\_variable + +```python +some_class_level_variable = 1 +``` + +This is how you document a class-level variable. + + +### some\_class\_level\_var\_with\_type + +```python +some_class_level_var_with_type = 1 +``` + + +### some\_method + +```python + | def some_method(self) -> None +``` + +I'm a method! + +But I don't do anything. + +Returns + + +- `None`
+ + +### method\_with\_alternative\_return\_section + +```python + | def method_with_alternative_return_section(self) -> int +``` + +Another method. + +Returns + + +- A completely arbitrary number.
+ + +### method\_with\_alternative\_return\_section3 + +```python + | def method_with_alternative_return_section3(self) -> int +``` + +Another method. + +Returns + + +- __number__ : `int`
+ A completely arbitrary number. + + +## AnotherClassWithReallyLongConstructor Objects + +```python +class AnotherClassWithReallyLongConstructor(): + | def __init__( + | self, + | a_really_long_argument_name: int = 0, + | another_long_name: float = 2, + | these_variable_names_are_terrible: str = "yea I know", + | **kwargs + | ) -> None +``` + diff --git a/allennlp/tests/tools/__init__.py b/allennlp/tests/tools/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/allennlp/tests/tools/py2md_test.py b/allennlp/tests/tools/py2md_test.py new file mode 100644 index 00000000000..02dca7c516d --- /dev/null +++ b/allennlp/tests/tools/py2md_test.py @@ -0,0 +1,56 @@ +from typing import Optional + +import pytest + +from allennlp.tools.py2md import py2md, Param, DocstringError +from allennlp.common.testing import AllenNlpTestCase + + +class TestPy2md(AllenNlpTestCase): + def test_basic_example(self, capsys): + py2md("allennlp.tests.fixtures.tools.py2md.basic_example") + captured = capsys.readouterr() + + with open(self.FIXTURES_ROOT / "tools" / "py2md" / "basic_example_expected_output.md") as f: + expected = f.read() + + assert captured.out.split("\n") == expected.split("\n") + + +@pytest.mark.parametrize( + "line_in, line_out", + [ + ( + "a : `int`, optional (default = `None`)", + "- __a__ : `int`, optional (default = `None`)
", + ), + ( + "foo : `Tuple[int, ...]`, optional (default = `()`)", + "- __foo__ : `Tuple[int, ...]`, optional (default = `()`)
", + ), + ("a : `int`, required", "- __a__ : `int`
"), + ("a : `int`", "- __a__ : `int`
"), + ("_a : `int`", "- __\\_a__ : `int`
"), + ("a_ : `int`", "- __a\\___ : `int`
"), + ], +) +def test_param_from_and_to_line(line_in: str, line_out: Optional[str]): + param = Param.from_line(line_in) + assert param is not None + assert param.to_line() == line_out + + +@pytest.mark.parametrize( + "line", + [ + "a : `int`, optional (default = None)", + "a : `int`, optional (default = `None)", + "a : `int`, optional (default = None`)", + "a : int", + "a : `int", + "a : int`", + ], +) +def test_param_from_bad_line_raises(line: str): + with pytest.raises(DocstringError): + Param.from_line(line) diff --git a/scripts/py2md.py b/allennlp/tools/py2md.py similarity index 55% rename from scripts/py2md.py rename to allennlp/tools/py2md.py index 2d3eb094d57..8654149f331 100755 --- a/scripts/py2md.py +++ b/allennlp/tools/py2md.py @@ -1,17 +1,22 @@ #!/usr/bin/env python """ -Turn docstring from a single module into a markdown file. +Turn docstrings from a single module into a markdown file. + +We do this with PydocMarkdown, using custom processors and renderers defined here. """ import argparse -from collections import deque +from collections import OrderedDict +from dataclasses import dataclass +from enum import Enum import logging from multiprocessing import Pool, cpu_count import os from pathlib import Path import re -from typing import Optional, Tuple +import sys +from typing import Optional, Tuple, List from nr.databind.core import Struct from nr.interface import implements, override @@ -23,6 +28,142 @@ logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("py2md") + + +class DocstringError(Exception): + pass + + +def emphasize(s: str) -> str: + # Need to escape underscores. + s = s.replace("_", "\\_") + return f"__{s}__" + + +class Section(Enum): + ARGUMENTS = "ARGUMENTS" + PARAMETERS = "PARAMETERS" + ATTRIBUTES = "ATTRIBUTES" + MEMBERS = "MEMBERS" + RETURNS = "RETURNS" + RAISES = "RAISES" + EXAMPLES = "EXAMPLES" + OTHER = "OTHER" + + @classmethod + def from_str(cls, section: str) -> "Section": + section = section.upper() + for member in cls: + if section == member.value: + return member + return cls.OTHER + + +REQUIRED_PARAM_RE = re.compile(r"^`([^`]+)`(, required\.?)?$") + +OPTIONAL_PARAM_RE = re.compile( + r"^`([^`]+)`,?\s+(optional,?\s)?\(\s?(optional,\s)?default\s?=\s?`([^`]+)`\s?\)\.?$" +) + +OPTIONAL_PARAM_NO_DEFAULT_RE = re.compile(r"^`([^`]+)`,?\s+optional\.?$") + + +@dataclass +class Param: + ident: str + ty: Optional[str] = None + required: bool = False + default: Optional[str] = None + + @classmethod + def from_line(cls, line: str) -> Optional["Param"]: + if ":" not in line: + return None + + ident, description = line.split(":", 1) + ident = ident.strip() + description = description.strip() + + if " " in ident: + return None + + maybe_match = REQUIRED_PARAM_RE.match(description) + if maybe_match: + ty = maybe_match.group(1) + return cls(ident=ident, ty=ty, required=True) + + maybe_match = OPTIONAL_PARAM_RE.match(description) + if maybe_match: + ty = maybe_match.group(1) + default = maybe_match.group(4) + return cls(ident=ident, ty=ty, required=False, default=default) + + maybe_match = OPTIONAL_PARAM_NO_DEFAULT_RE.match(description) + if maybe_match: + ty = maybe_match.group(1) + return cls(ident=ident, ty=ty, required=False) + + raise DocstringError( + f"Invalid parameter / attribute description: '{line}'\n" + "Make sure types are enclosed in backticks.\n" + "Required parameters should be documented like: '{ident} : `{type}`'\n" + "Optional parameters should be documented like: '{ident} : `{type}`, optional (default = `{expr}`)'\n" + ) + + def 
to_line(self) -> str: + line: str = f"- {emphasize(self.ident)} :" + if self.ty: + line += f" `{self.ty}`" + if not self.required: + line += ", optional" + if self.default: + line += f" (default = `{self.default}`)" + line += "
" + return line + + +# For now we handle attributes / members in the same way as parameters / arguments. +Attrib = Param + + +@dataclass +class RetVal: + description: Optional[str] = None + ident: Optional[str] = None + ty: Optional[str] = None + + @classmethod + def from_line(cls, line: str) -> "RetVal": + if ": " not in line: + return cls(description=line) + ident, ty = line.split(":", 1) + ident = ident.strip() + ty = ty.strip() + if ty and not ty.startswith("`"): + raise DocstringError(f"Type should be enclosed in backticks: '{line}'") + return cls(ident=ident, ty=ty) + + def to_line(self) -> str: + if self.description: + line = f"- {self.description}
" + elif self.ident: + line = f"- {emphasize(self.ident)}" + if self.ty: + line += f" : {self.ty}
" + else: + line += "
" + else: + raise DocstringError("RetVal must have either description or ident") + return line + + +@dataclass +class ProcessorState: + parameters: "OrderedDict[str, Param]" + current_section: Optional[Section] = None + codeblock_opened: bool = False + consecutive_blank_line_count: int = 0 @implements(Processor) @@ -40,48 +181,52 @@ def process(self, graph, resolver): def process_node(self, node): if not getattr(node, "docstring", None): return - lines = [] - codeblock_opened = False - current_section = None - consecutive_blank_line_count = 0 + + lines: List[str] = [] + state: ProcessorState = ProcessorState(parameters=OrderedDict()) + for line in node.docstring.split("\n"): + # Check if we're starting or ending a codeblock. if line.startswith("```"): - codeblock_opened = not codeblock_opened - if not codeblock_opened: + state.codeblock_opened = not state.codeblock_opened + + if not state.codeblock_opened: + # If we're not in a codeblock, we'll do some pre-processing. if not line.strip(): - consecutive_blank_line_count += 1 - # Two blank lines ends a section. - if consecutive_blank_line_count >= 2: - current_section = None + state.consecutive_blank_line_count += 1 + if state.consecutive_blank_line_count >= 2: + state.current_section = None else: - consecutive_blank_line_count = 0 - line, current_section = self._preprocess_line(line, current_section) + state.consecutive_blank_line_count = 0 + line = self._preprocess_line(line, state) + lines.append(line) + + # Now set the docstring to our preprocessed version of it. node.docstring = "\n".join(lines) - def _preprocess_line(self, line, current_section): + def _preprocess_line(self, line, state: ProcessorState) -> str: match = re.match(r"#+ (.*)$", line) if match: - current_section = match.group(1).strip().lower() - line = re.sub(r"#+ (.*)$", r"__\1__\n", line) + state.current_section = Section.from_str(match.group(1).strip()) + line = re.sub(r"#+ (.*)$", r"\1\n", line) else: if line and not line.startswith(" ") and not line.startswith("!!! "): - if ( - current_section - in ("arguments", "parameters", "attributes", "members", "returns") - and ":" in line - ): - ident, ty = line.split(":", 1) - if ty: - line = f"- __{ident}__ : {ty}
" - else: - line = f"- __{ident}__ :
" - elif current_section in ("returns", "raises"): - line = f"- {line}
" + if state.current_section in (Section.ARGUMENTS, Section.PARAMETERS,): + param = Param.from_line(line) + if param: + line = param.to_line() + elif state.current_section in (Section.ATTRIBUTES, Section.MEMBERS): + attrib = Attrib.from_line(line) + if attrib: + line = attrib.to_line() + elif state.current_section in (Section.RETURNS, Section.RAISES): + retval = RetVal.from_line(line) + line = retval.to_line() line = self._transform_cross_references(line) - return line, current_section + return line def _transform_cross_references(self, line: str) -> str: """ @@ -109,8 +254,6 @@ class AllenNlpFilterProcessor(Struct): Used to filter out nodes that we don't want to document. """ - SPECIAL_MEMBERS = ("__path__", "__annotations__", "__name__", "__all__", "__init__") - def process(self, graph, _resolver): graph.visit(self._process_node) @@ -118,9 +261,7 @@ def _process_node(self, node): def _check(node): if node.parent and node.parent.name.startswith("_"): return False - if node.name.startswith("_") and not node.name.endswith("_"): - return False - if node.name in self.SPECIAL_MEMBERS: + if node.name.startswith("_"): return False if node.name == "logger" and isinstance(node.parent, Module): return False @@ -217,7 +358,11 @@ def _render_object(self, fp, level, obj): fp.write("\n\n") -def py2md(module: str, out: Optional[str] = None) -> None: +def py2md(module: str, out: Optional[str] = None) -> bool: + """ + Returns `True` if module successfully processed, otherwise `False`. + """ + logger.debug("Processing %s", module) pydocmd = PydocMarkdown( loaders=[PythonLoader(modules=[module])], processors=[AllenNlpFilterProcessor(), AllenNlpDocstringProcessor()], @@ -236,12 +381,16 @@ def py2md(module: str, out: Optional[str] = None) -> None: os.makedirs(out_path.parent, exist_ok=True) pydocmd.load_modules() - pydocmd.process() + try: + pydocmd.process() + except DocstringError as err: + logger.exception("Failed to process %s.\n%s", module, err) + return False pydocmd.render() - logging.info("Processed %s", module) + return True -def _py2md_wrapper(x: Tuple[str, str]): +def _py2md_wrapper(x: Tuple[str, str]) -> bool: """ Used to wrap py2md since we can't pickle a lambda (needed for multiprocessing). """ @@ -269,18 +418,26 @@ def main(): if len(outputs) != len(opts.modules): raise ValueError("Number inputs and outputs should be the same.") n_threads = cpu_count() + errors: int = 0 if len(opts.modules) > n_threads and opts.out: # If writing to files, can process in parallel. chunk_size = max([1, int(len(outputs) / n_threads)]) - logging.info("Using %d threads", n_threads) + logger.info("Using %d threads", n_threads) with Pool(n_threads) as p: - deque(p.imap(_py2md_wrapper, zip(opts.modules, outputs), chunk_size), maxlen=0) + for result in p.imap(_py2md_wrapper, zip(opts.modules, outputs), chunk_size): + if not result: + errors += 1 else: # If writing to stdout, need to process sequentially. Otherwise the output # could get intertwined. 
for module, out in zip(opts.modules, outputs): - py2md(module, out) - logging.info("Processed %d modules", len(opts.modules)) + result = py2md(module, out) + if not result: + errors += 1 + logger.info("Processed %d modules", len(opts.modules)) + if errors: + logger.error("Found %d errors", errors) + sys.exit(1) if __name__ == "__main__": diff --git a/allennlp/training/checkpointer.py b/allennlp/training/checkpointer.py index b1dba73cb80..64b67fe63b5 100644 --- a/allennlp/training/checkpointer.py +++ b/allennlp/training/checkpointer.py @@ -26,16 +26,16 @@ class Checkpointer(Registrable): # Parameters - num_serialized_models_to_keep : `int`, optional (default=2) + num_serialized_models_to_keep : `int`, optional (default=`2`) Number of previous model checkpoints to retain. Default is to keep 2 checkpoints. A value of None or -1 means all checkpoints will be kept. - keep_serialized_model_every_num_seconds : `int`, optional (default=None) + keep_serialized_model_every_num_seconds : `int`, optional (default=`None`) If num_serialized_models_to_keep is not None, then occasionally it's useful to save models at a given interval in addition to the last num_serialized_models_to_keep. To do so, specify keep_serialized_model_every_num_seconds as the number of seconds between permanently saved checkpoints. Note that this option is only used if num_serialized_models_to_keep is not None, otherwise all checkpoints are kept. - model_save_interval : `float`, optional (default=None) + model_save_interval : `float`, optional (default=`None`) If provided, then serialize models every `model_save_interval` seconds within single epochs. In all cases, models are also saved at the end of every epoch if `serialization_dir` is provided. diff --git a/allennlp/training/learning_rate_schedulers/cosine.py b/allennlp/training/learning_rate_schedulers/cosine.py index f41d5d33398..542c381c30b 100644 --- a/allennlp/training/learning_rate_schedulers/cosine.py +++ b/allennlp/training/learning_rate_schedulers/cosine.py @@ -25,15 +25,15 @@ class CosineWithRestarts(LearningRateScheduler): optimizer : `torch.optim.Optimizer` t_initial : `int` The number of iterations (epochs) within the first cycle. - t_mul : `float`, optional (default=1) + t_mul : `float`, optional (default=`1`) Determines the number of iterations (epochs) in the i-th decay cycle, which is the length of the last cycle multiplied by `t_mul`. - eta_min : `float`, optional (default=0) + eta_min : `float`, optional (default=`0`) The minimum learning rate. - eta_mul : `float`, optional (default=1) + eta_mul : `float`, optional (default=`1`) Determines the initial learning rate for the i-th decay cycle, which is the last initial learning rate multiplied by `m_mul`. - last_epoch : `int`, optional (default=-1) + last_epoch : `int`, optional (default=`-1`) The index of the last epoch. This is used when restarting. """ diff --git a/allennlp/training/learning_rate_schedulers/noam.py b/allennlp/training/learning_rate_schedulers/noam.py index 451d52d7b77..99d9933cd86 100644 --- a/allennlp/training/learning_rate_schedulers/noam.py +++ b/allennlp/training/learning_rate_schedulers/noam.py @@ -20,7 +20,7 @@ class NoamLR(LearningRateScheduler): The hidden size parameter which dominates the number of parameters in your model. warmup_steps : `int`, required. The number of steps to linearly increase the learning rate. - factor : `float`, optional (default = 1.0). + factor : `float`, optional (default = `1.0`). The overall scale factor for the learning rate decay. 
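For intuition, the `NoamLR` schedule is conventionally the warmup-then-inverse-square-root schedule from "Attention Is All You Need"; the snippet below is an illustrative plain-Python sketch of that formula, not a copy of the class's implementation.

```python
def noam_learning_rate(step: int, model_size: int, warmup_steps: int, factor: float = 1.0) -> float:
    # Linear warmup for `warmup_steps` steps, then decay with the inverse
    # square root of the step number, scaled by `factor` and `model_size`.
    step = max(step, 1)
    return factor * model_size ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

# The rate peaks around `warmup_steps`, e.g.:
peak_lr = noam_learning_rate(4000, model_size=512, warmup_steps=4000)
```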
""" diff --git a/allennlp/training/learning_rate_schedulers/slanted_triangular.py b/allennlp/training/learning_rate_schedulers/slanted_triangular.py index 730c0f5c94b..57ed40915cf 100644 --- a/allennlp/training/learning_rate_schedulers/slanted_triangular.py +++ b/allennlp/training/learning_rate_schedulers/slanted_triangular.py @@ -33,16 +33,16 @@ class SlantedTriangular(LearningRateScheduler): The total number of epochs for which the model should be trained. num_steps_per_epoch : `int`, required. The number of steps (updates, batches) per training epoch. - cut_frac : `float`, optional (default = 0.1). + cut_frac : `float`, optional (default = `0.1`). The fraction of the steps to increase the learning rate. - ratio : `float`, optional (default = 32). + ratio : `float`, optional (default = `32`). The ratio of the smallest to the (largest) base learning rate. - gradual_unfreezing : `bool`, optional (default = False). + gradual_unfreezing : `bool`, optional (default = `False`). Whether gradual unfreezing should be used. - discriminative_fine_tuning : `bool`, optional (default = False). + discriminative_fine_tuning : `bool`, optional (default = `False`). Whether discriminative fine-tuning (different learning rates per layer) are used. - decay_factor : `float`, optional (default = 0.38). + decay_factor : `float`, optional (default = `0.38`). The decay factor by which the learning rate is reduced with discriminative fine-tuning when going a layer deeper. """ diff --git a/allennlp/training/metric_tracker.py b/allennlp/training/metric_tracker.py index 8d5c568788f..bb8289a98d2 100644 --- a/allennlp/training/metric_tracker.py +++ b/allennlp/training/metric_tracker.py @@ -16,15 +16,15 @@ class MetricTracker: # Parameters - patience : int, optional (default = None) + patience : `int`, optional (default = `None`) If provided, then `should_stop_early()` returns True if we go this many epochs without seeing a new best value. - metric_name : str, optional (default = None) + metric_name : `str`, optional (default = `None`) If provided, it's used to infer whether we expect the metric values to increase (if it starts with "+") or decrease (if it starts with "-"). It's an error if it doesn't start with one of those. If it's not provided, you should specify `should_decrease` instead. - should_decrease : str, optional (default = None) + should_decrease : `str`, optional (default = `None`) If `metric_name` isn't provided (in which case we can't infer `should_decrease`), then you have to specify it here. """ diff --git a/allennlp/training/metrics/attachment_scores.py b/allennlp/training/metrics/attachment_scores.py index f7f8c8167f2..21f6016b481 100644 --- a/allennlp/training/metrics/attachment_scores.py +++ b/allennlp/training/metrics/attachment_scores.py @@ -17,7 +17,7 @@ class AttachmentScores(Metric): # Parameters - ignore_classes : `List[int]`, optional (default = None) + ignore_classes : `List[int]`, optional (default = `None`) A list of label ids to ignore when computing metrics. """ @@ -50,7 +50,7 @@ def __call__( # type: ignore A tensor of the same shape as `predicted_indices`. gold_labels : `torch.Tensor`, required. A tensor of the same shape as `predicted_labels`. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of the same shape as `predicted_indices`. 
""" detached = self.detach_tensors( diff --git a/allennlp/training/metrics/auc.py b/allennlp/training/metrics/auc.py index 49678830159..82187540478 100644 --- a/allennlp/training/metrics/auc.py +++ b/allennlp/training/metrics/auc.py @@ -36,7 +36,7 @@ def __call__( A one-dimensional label tensor of shape (batch_size), with {1, 0} entries for positive and negative class. If it's not binary, `positive_label` should be passed in the initialization. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A one-dimensional label tensor of shape (batch_size). """ diff --git a/allennlp/training/metrics/bleu.py b/allennlp/training/metrics/bleu.py index 972293a5a9c..41d56daeab7 100644 --- a/allennlp/training/metrics/bleu.py +++ b/allennlp/training/metrics/bleu.py @@ -14,20 +14,19 @@ class BLEU(Metric): Bilingual Evaluation Understudy (BLEU). BLEU is a common metric used for evaluating the quality of machine translations - against a set of reference translations. See [Papineni et. al., - "BLEU: a method for automatic evaluation of machine translation", 2002] - (https://www.semanticscholar.org/paper/8ff93cfd37dced279134c9d642337a2085b31f59/). + against a set of reference translations. See + [Papineni et. al., "BLEU: a method for automatic evaluation of machine translation", 2002][1]. # Parameters - ngram_weights : `Iterable[float]`, optional (default = (0.25, 0.25, 0.25, 0.25)) + ngram_weights : `Iterable[float]`, optional (default = `(0.25, 0.25, 0.25, 0.25)`) Weights to assign to scores for each ngram size. - exclude_indices : `Set[int]`, optional (default = None) + exclude_indices : `Set[int]`, optional (default = `None`) Indices to exclude when calculating ngrams. This should usually include the indices of the start, end, and pad tokens. - Notes - ----- + # Notes + We chose to implement this from scratch instead of wrapping an existing implementation (such as `nltk.translate.bleu_score`) for a two reasons. First, so that we could pass tensors directly to this metric instead of first converting the tensors to lists of strings. @@ -37,6 +36,9 @@ class BLEU(Metric): This implementation only considers a reference set of size 1, i.e. a single gold target sequence for each predicted sequence. + + + [1]: https://www.semanticscholar.org/paper/8ff93cfd37dced279134c9d642337a2085b31f59/ """ def __init__( diff --git a/allennlp/training/metrics/boolean_accuracy.py b/allennlp/training/metrics/boolean_accuracy.py index 298c70ff80e..fe2d5f31ed9 100644 --- a/allennlp/training/metrics/boolean_accuracy.py +++ b/allennlp/training/metrics/boolean_accuracy.py @@ -40,7 +40,7 @@ def __call__( A tensor of predictions of shape (batch_size, ...). gold_labels : `torch.Tensor`, required. A tensor of the same shape as `predictions`. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of the same shape as `predictions`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/categorical_accuracy.py b/allennlp/training/metrics/categorical_accuracy.py index 1722ac1acf2..9decf9e8a1f 100644 --- a/allennlp/training/metrics/categorical_accuracy.py +++ b/allennlp/training/metrics/categorical_accuracy.py @@ -42,7 +42,7 @@ def __call__( gold_labels : `torch.Tensor`, required. A tensor of integer class label of shape (batch_size, ...). It must be the same shape as the `predictions` tensor without the `num_classes` dimension. 
- mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A masking tensor the same size as `gold_labels`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/covariance.py b/allennlp/training/metrics/covariance.py index e0fa7d45ed7..e1c592d9b6b 100644 --- a/allennlp/training/metrics/covariance.py +++ b/allennlp/training/metrics/covariance.py @@ -15,12 +15,12 @@ class Covariance(Metric): covariance is calculated between the vectors). This implementation is mostly modeled after the streaming_covariance function in Tensorflow. See: - https://github.com/tensorflow/tensorflow/blob/v1.10.1/tensorflow/contrib/metrics/python/ops/metric_ops.py#L3127 + The following is copied from the Tensorflow documentation: The algorithm used for this online computation is described in - https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online . + . Specifically, the formula used to combine two sample comoments is `C_AB = C_A + C_B + (E[x_A] - E[x_B]) * (E[y_A] - E[y_B]) * n_A * n_B / n_AB` The comoment for a single batch of data is simply `sum((x - E[x]) * (y - E[y]))`, optionally masked. @@ -45,7 +45,7 @@ def __call__( A tensor of predictions of shape (batch_size, ...). gold_labels : `torch.Tensor`, required. A tensor of the same shape as `predictions`. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of the same shape as `predictions`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/entropy.py b/allennlp/training/metrics/entropy.py index 033e677d389..25edff6daaf 100644 --- a/allennlp/training/metrics/entropy.py +++ b/allennlp/training/metrics/entropy.py @@ -23,7 +23,7 @@ def __call__( logits : `torch.Tensor`, required. A tensor of unnormalized log probabilities of shape (batch_size, ..., num_classes). - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A masking tensor of shape (batch_size, ...). """ logits, mask = self.detach_tensors(logits, mask) diff --git a/allennlp/training/metrics/evalb_bracketing_scorer.py b/allennlp/training/metrics/evalb_bracketing_scorer.py index 4457cb0d498..af7ce075e28 100644 --- a/allennlp/training/metrics/evalb_bracketing_scorer.py +++ b/allennlp/training/metrics/evalb_bracketing_scorer.py @@ -43,11 +43,11 @@ class EvalbBracketingScorer(Metric): evalb_directory_path : `str`, required. The directory containing the EVALB executable. - evalb_param_filename : `str`, optional (default = "COLLINS.prm") + evalb_param_filename : `str`, optional (default = `"COLLINS.prm"`) The relative name of the EVALB configuration file used when scoring the trees. By default, this uses the COLLINS.prm configuration file which comes with EVALB. This configuration ignores POS tags and some punctuation labels. - evalb_num_errors_to_kill : `int`, optional (default = "10") + evalb_num_errors_to_kill : `int`, optional (default = `"10"`) The number of errors to tolerate from EVALB before terminating evaluation. 
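Stepping back to the `Covariance` metric earlier in this hunk: the comoment-merging identity it quotes, `C_AB = C_A + C_B + (E[x_A] - E[x_B]) * (E[y_A] - E[y_B]) * n_A * n_B / n_AB`, can be sanity-checked with a few lines of plain PyTorch. This is an illustrative check on random data, not part of the metric itself.

```python
import torch

def comoment(x, y):
    # sum((x - E[x]) * (y - E[y])) for a single batch, as in the docstring above.
    return ((x - x.mean()) * (y - y.mean())).sum()

x_a, y_a = torch.randn(8), torch.randn(8)
x_b, y_b = torch.randn(5), torch.randn(5)
n_a, n_b = x_a.numel(), x_b.numel()
n_ab = n_a + n_b

c_ab = comoment(x_a, y_a) + comoment(x_b, y_b) + (
    (x_a.mean() - x_b.mean()) * (y_a.mean() - y_b.mean()) * n_a * n_b / n_ab
)
assert torch.allclose(c_ab, comoment(torch.cat([x_a, x_b]), torch.cat([y_a, y_b])), atol=1e-5)
```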
""" diff --git a/allennlp/training/metrics/fbeta_measure.py b/allennlp/training/metrics/fbeta_measure.py index 464bed58621..82f77ffae96 100644 --- a/allennlp/training/metrics/fbeta_measure.py +++ b/allennlp/training/metrics/fbeta_measure.py @@ -34,10 +34,10 @@ class FBetaMeasure(Metric): # Parameters - beta : `float`, optional (default = 1.0) + beta : `float`, optional (default = `1.0`) The strength of recall versus precision in the F-score. - average : string, [None (default), 'micro', 'macro', 'weighted'] + average : `str`, optional (default = `None`) If `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -104,7 +104,7 @@ def __call__( gold_labels : `torch.Tensor`, required. A tensor of integer class label of shape (batch_size, ...). It must be the same shape as the `predictions` tensor without the `num_classes` dimension. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A masking tensor the same size as `gold_labels`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/mean_absolute_error.py b/allennlp/training/metrics/mean_absolute_error.py index e464137aa37..945cea2c739 100644 --- a/allennlp/training/metrics/mean_absolute_error.py +++ b/allennlp/training/metrics/mean_absolute_error.py @@ -29,7 +29,7 @@ def __call__( A tensor of predictions of shape (batch_size, ...). gold_labels : `torch.Tensor`, required. A tensor of the same shape as `predictions`. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of the same shape as `predictions`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/metric.py b/allennlp/training/metrics/metric.py index e9dc72846d2..82853dd6f75 100644 --- a/allennlp/training/metrics/metric.py +++ b/allennlp/training/metrics/metric.py @@ -21,7 +21,7 @@ def __call__( A tensor of predictions. gold_labels : `torch.Tensor`, required. A tensor corresponding to some gold label to evaluate against. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A mask can be passed, in order to deal with metrics which are computed over potentially padded elements, such as sequence labels. """ diff --git a/allennlp/training/metrics/pearson_correlation.py b/allennlp/training/metrics/pearson_correlation.py index 5e4803acc26..dc5b81388c1 100644 --- a/allennlp/training/metrics/pearson_correlation.py +++ b/allennlp/training/metrics/pearson_correlation.py @@ -19,7 +19,7 @@ class PearsonCorrelation(Metric): between the vectors). This implementation is mostly modeled after the streaming_pearson_correlation function in Tensorflow. See - https://github.com/tensorflow/tensorflow/blob/v1.10.1/tensorflow/contrib/metrics/python/ops/metric_ops.py#L3267 + . This metric delegates to the Covariance metric the tracking of three [co]variances: @@ -52,7 +52,7 @@ def __call__( A tensor of predictions of shape (batch_size, ...). gold_labels : `torch.Tensor`, required. A tensor of the same shape as `predictions`. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of the same shape as `predictions`. 
""" predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/sequence_accuracy.py b/allennlp/training/metrics/sequence_accuracy.py index d03fa55a6cf..6ebdc660b11 100644 --- a/allennlp/training/metrics/sequence_accuracy.py +++ b/allennlp/training/metrics/sequence_accuracy.py @@ -31,7 +31,7 @@ def __call__( A tensor of predictions of shape (batch_size, k, sequence_length). gold_labels : `torch.Tensor`, required. A tensor of integer class label of shape (batch_size, sequence_length). - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A masking tensor the same size as `gold_labels`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/span_based_f1_measure.py b/allennlp/training/metrics/span_based_f1_measure.py index 53ed4c8c58b..d16a9a285ff 100644 --- a/allennlp/training/metrics/span_based_f1_measure.py +++ b/allennlp/training/metrics/span_based_f1_measure.py @@ -53,7 +53,7 @@ def __init__( This metric assumes that a BIO format is used in which the labels are of the format: ["B-LABEL", "I-LABEL"]. - ignore_classes : List[str], optional. + ignore_classes : `List[str]`, optional. Span labels which will be ignored when computing span metrics. A "span label" is the part that comes after the BIO label, so it would be "ARG1" for the tag "B-ARG1". For example by passing: @@ -67,7 +67,7 @@ def __init__( This is helpful for instance, to avoid computing metrics for "V" spans in a BIO tagging scheme which are typically not included. - label_encoding : `str`, optional (default = "BIO") + label_encoding : `str`, optional (default = `"BIO"`) The encoding used to specify label span endpoints in the sequence. Valid options are "BIO", "IOB1", "BIOUL" or "BMES". @@ -115,9 +115,9 @@ def __call__( gold_labels : `torch.Tensor`, required. A tensor of integer class label of shape (batch_size, sequence_length). It must be the same shape as the `predictions` tensor without the `num_classes` dimension. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A masking tensor the same size as `gold_labels`. - prediction_map : `torch.Tensor`, optional (default = None). + prediction_map : `torch.Tensor`, optional (default = `None`). A tensor of size (batch_size, num_classes) which provides a mapping from the index of predictions to the indices of the label vocabulary. If provided, the output label at each timestep will be `vocabulary.get_index_to_token_vocabulary(prediction_map[batch, argmax(predictions[batch, t]))`, diff --git a/allennlp/training/metrics/spearman_correlation.py b/allennlp/training/metrics/spearman_correlation.py index 4b64fc86bae..0b45e91bdc1 100644 --- a/allennlp/training/metrics/spearman_correlation.py +++ b/allennlp/training/metrics/spearman_correlation.py @@ -16,7 +16,7 @@ class SpearmanCorrelation(Metric): implicitly flattened into vectors and the correlation is calculated between the vectors). - https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient + """ def __init__(self) -> None: @@ -37,7 +37,7 @@ def __call__( A tensor of predictions of shape (batch_size, ...). gold_labels : `torch.Tensor`, required. A tensor of the same shape as `predictions`. - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A tensor of the same shape as `predictions`. 
""" predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/metrics/unigram_recall.py b/allennlp/training/metrics/unigram_recall.py index 29e8489a46c..a3eecca3849 100644 --- a/allennlp/training/metrics/unigram_recall.py +++ b/allennlp/training/metrics/unigram_recall.py @@ -35,7 +35,7 @@ def __call__( A tensor of predictions of shape (batch_size, k, sequence_length). gold_labels : `torch.Tensor`, required. A tensor of integer class label of shape (batch_size, sequence_length). - mask : `torch.BoolTensor`, optional (default = None). + mask : `torch.BoolTensor`, optional (default = `None`). A masking tensor the same size as `gold_labels`. """ predictions, gold_labels, mask = self.detach_tensors(predictions, gold_labels, mask) diff --git a/allennlp/training/moving_average.py b/allennlp/training/moving_average.py index ff2c023eb6e..f862547141a 100644 --- a/allennlp/training/moving_average.py +++ b/allennlp/training/moving_average.py @@ -53,12 +53,12 @@ class ExponentialMovingAverage(MovingAverage): parameters : `Iterable[Tuple[str, Parameter]]`, required The parameters whose averages we'll be tracking. - decay : `float`, optional (default = 0.9999) + decay : `float`, optional (default = `0.9999`) The decay rate that will be used if `num_updates` is not passed (and that will be used as an upper bound if `num_updates` is passed). - numerator : `float`, optional (default = 1.0) + numerator : `float`, optional (default = `1.0`) The numerator used to compute the decay rate if `num_updates` is passed. - denominator : `float`, optional (default = 10.0) + denominator : `float`, optional (default = `10.0`) The denominator used to compute the decay rate if `num_updates` is passed. """ @@ -85,7 +85,7 @@ def apply(self, num_updates: Optional[int] = None) -> None: `min(decay, (numerator + num_updates) / (denominator + num_updates))` (This logic is based on the Tensorflow exponential moving average - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage) + ) """ if num_updates is not None: decay = min( diff --git a/allennlp/training/optimizers.py b/allennlp/training/optimizers.py index 73057736f81..162883da695 100644 --- a/allennlp/training/optimizers.py +++ b/allennlp/training/optimizers.py @@ -1,21 +1,20 @@ """ AllenNLP just uses -`PyTorch optimizers `_ , +[PyTorch optimizers](https://pytorch.org/docs/master/optim.html), with a thin wrapper to allow registering them and instantiating them `from_params`. 
The available optimizers are -* `"adadelta" `_ -* `"adagrad" `_ -* `"adam" `_ -* `"adamw" `_ -* `"huggingface_adamw" - `_ -* `"sparse_adam" `_ -* `"sgd" `_ -* `"rmsprop `_ -* `"adamax `_ -* `"averaged_sgd `_ +* [adadelta](https://pytorch.org/docs/master/optim.html#torch.optim.Adadelta) +* [adagrad](https://pytorch.org/docs/master/optim.html#torch.optim.Adagrad) +* [adam](https://pytorch.org/docs/master/optim.html#torch.optim.Adam) +* [adamw](https://pytorch.org/docs/master/optim.html#torch.optim.AdamW) +* [huggingface_adamw](https://huggingface.co/transformers/main_classes/optimizer_schedules.html#transformers.AdamW) +* [sparse_adam](https://pytorch.org/docs/master/optim.html#torch.optim.SparseAdam) +* [sgd](https://pytorch.org/docs/master/optim.html#torch.optim.SGD) +* [rmsprop](https://pytorch.org/docs/master/optim.html#torch.optim.RMSprop) +* [adamax](https://pytorch.org/docs/master/optim.html#torch.optim.Adamax) +* [averaged_sgd](https://pytorch.org/docs/master/optim.html#torch.optim.ASGD) """ import logging @@ -56,7 +55,7 @@ def make_parameter_groups( dict contains a "parameter group" and groups specific options, e.g., {'params': [list of parameters], 'lr': 1e-3, ...}. Any config option not specified in the additional options (e.g. for the default group) is inherited from the top level arguments given in the constructor. See: - https://pytorch.org/docs/0.3.0/optim.html?#per-parameter-options. See also our + . See also our `test_optimizer_parameter_groups` test for an example of how this works in this code. The dictionary's return type is labeled as `Any`, because it can be a `List[torch.nn.Parameter]` @@ -421,12 +420,12 @@ class DenseSparseAdam(Optimizer, torch.optim.Optimizer): params : `iterable` iterable of parameters to optimize or dicts defining parameter groups - lr : `float`, optional (default: 1e-3) + lr : `float`, optional (default = `1e-3`) The learning rate. - betas : `Tuple[float, float]`, optional (default: (0.9, 0.999)) + betas : `Tuple[float, float]`, optional (default = `(0.9, 0.999)`) coefficients used for computing running averages of gradient and its square. - eps : `float`, optional, (default: 1e-8) + eps : `float`, optional, (default = `1e-8`) A term added to the denominator to improve numerical stability. """ diff --git a/allennlp/training/tensorboard_writer.py b/allennlp/training/tensorboard_writer.py index d842c6d63e3..1cd39821100 100644 --- a/allennlp/training/tensorboard_writer.py +++ b/allennlp/training/tensorboard_writer.py @@ -21,11 +21,11 @@ class TensorboardWriter(FromParams): # Parameters - serialization_dir : str, optional (default = None) + serialization_dir : `str`, optional (default = `None`) If provided, this is where the Tensorboard logs will be written. - summary_interval : int, optional (default = 100) + summary_interval : `int`, optional (default = `100`) Most statistics will be written out only every this many batches. - histogram_interval : int, optional (default = None) + histogram_interval : `int`, optional (default = `None`) If provided, activation histograms will be written out every this many batches. If None, activation histograms will not be written out. When this parameter is specified, the following additional logging is enabled: @@ -42,12 +42,12 @@ class TensorboardWriter(FromParams): with tensors as values currently support activation logging. batch_size_interval : `int`, optional, (default = `None`) If defined, how often to log the average batch size. 
- should_log_parameter_statistics : bool, optional (default = True)
+ should_log_parameter_statistics : `bool`, optional (default = `True`)
Whether to log parameter statistics (mean and standard deviation of parameters and gradients).
- should_log_learning_rate : bool, optional (default = False)
+ should_log_learning_rate : `bool`, optional (default = `False`)
Whether to log (parameter-specific) learning rate.
- get_batch_num_total : Callable[[], int], optional (default = None)
+ get_batch_num_total : `Callable[[], int]`, optional (default = `None`)
A thunk that returns the number of batches so far. Most likely this will be a closure around
an instance variable in your `Trainer` class. Because of circular dependencies in constructing
this object and the `Trainer`, this is typically `None` when
diff --git a/allennlp/training/trainer.py b/allennlp/training/trainer.py
index 3f144306e79..edfdc6bb2ab 100644
--- a/allennlp/training/trainer.py
+++ b/allennlp/training/trainer.py
@@ -175,38 +175,38 @@ class GradientDescentTrainer(Trainer):
data_loader : `DataLoader`, required.
A pytorch `DataLoader` containing your `Dataset`, yielding padded indexed batches.
- patience : `Optional[int] > 0`, optional (default=None)
+ patience : `Optional[int] > 0`, optional (default=`None`)
Number of epochs to be patient before early stopping: the training is stopped
after `patience` epochs with no improvement. If given, it must be `> 0`.
If None, early stopping is disabled.
- validation_metric : `str`, optional (default="loss")
+ validation_metric : `str`, optional (default=`"loss"`)
Validation metric to measure for whether to stop training using patience
and whether to serialize an `is_best` model each epoch. The metric name
must be prepended with either "+" or "-", which specifies whether the metric
is an increasing or decreasing function.
- validation_data_loader : `DataLoader`, optional (default=None)
+ validation_data_loader : `DataLoader`, optional (default=`None`)
A `DataLoader` to use for the validation set. If `None`, then use the training `DataLoader`
with the validation data.
- num_epochs : `int`, optional (default = 20)
+ num_epochs : `int`, optional (default = `20`)
Number of training epochs.
- serialization_dir : str, optional (default=None)
+ serialization_dir : `str`, optional (default=`None`)
Path to directory for saving and loading model files. Models will not be saved if
this parameter is not passed.
- checkpointer : `Checkpointer`, optional (default=None)
+ checkpointer : `Checkpointer`, optional (default=`None`)
A `Checkpointer` is responsible for periodically saving model weights. If none is given
here, we will construct one with default parameters.
- cuda_device : `int`, optional (default = -1)
+ cuda_device : `int`, optional (default = `-1`)
An integer specifying the CUDA device(s) to use for this process. If -1, the CPU is used.
Data parallelism is controlled at the allennlp train level, so each trainer will have a single
GPU.
- grad_norm : `float`, optional, (default = None).
+ grad_norm : `float`, optional, (default = `None`).
If provided, gradient norms will be rescaled to have a maximum of this value.
grad_clipping : `float`, optional (default = `None`).
@@ -214,7 +214,7 @@ class GradientDescentTrainer(Trainer):
maximum of this value. If you are getting `NaNs` in your gradients during training
that are not solved by using `grad_norm`, you may need this.
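A quick illustration of the `grad_norm` / `grad_clipping` distinction described above, in plain PyTorch. This is only a sketch of the two standard utilities involved, not the trainer's actual implementation:

```python
import torch
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

model = torch.nn.Linear(10, 1)
model(torch.randn(4, 10)).sum().backward()

# grad_norm-style: rescale gradients so their combined norm is at most 5.0.
clip_grad_norm_(model.parameters(), max_norm=5.0)

# grad_clipping-style: clamp every individual gradient entry into [-1.0, 1.0].
clip_grad_value_(model.parameters(), clip_value=1.0)
```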
- learning_rate_scheduler : `LearningRateScheduler`, optional (default = None)
+ learning_rate_scheduler : `LearningRateScheduler`, optional (default = `None`)
If specified, the learning rate will be decayed with respect to
this schedule at the end of each epoch (or batch, if the scheduler implements
the `step_batch` method). If you use `torch.optim.lr_scheduler.ReduceLROnPlateau`,
@@ -222,7 +222,7 @@ class GradientDescentTrainer(Trainer):
To support updating the learning rate on every batch, this can optionally implement
`step_batch(batch_num_total)` which updates the learning rate given the batch number.
- momentum_scheduler : `MomentumScheduler`, optional (default = None)
+ momentum_scheduler : `MomentumScheduler`, optional (default = `None`)
If specified, the momentum will be updated at the end of each batch or epoch
according to the schedule.
@@ -230,7 +230,7 @@ class GradientDescentTrainer(Trainer):
If this is not provided, we will construct a `TensorboardWriter` with default
parameters and use that.
- moving_average : `MovingAverage`, optional, (default = None)
+ moving_average : `MovingAverage`, optional, (default = `None`)
If provided, we will maintain moving averages for all parameters. During training, we
employ a shadow variable for each parameter, which maintains the moving average. During
evaluation, we backup the original parameters and assign the moving averages to corresponding
@@ -238,26 +238,26 @@ class GradientDescentTrainer(Trainer):
parameters. This is necessary because we want the saved model to perform as well as the validated
model if we load it later. But this may cause problems if you restart the training from checkpoint.
- batch_callbacks : `List[BatchCallback]`, optional (default = None)
+ batch_callbacks : `List[BatchCallback]`, optional (default = `None`)
A list of callbacks that will be called at the end of every batch, during both train and
validation.
- epoch_callbacks : `List[EpochCallback]`, optional (default = None)
+ epoch_callbacks : `List[EpochCallback]`, optional (default = `None`)
A list of callbacks that will be called at the end of every epoch, and at the start of
training (with epoch = -1).
- distributed : `bool`, optional, (default = False)
+ distributed : `bool`, optional, (default = `False`)
If set, PyTorch's `DistributedDataParallel` is used to train the model in multiple GPUs. This
also requires `world_size` to be greater than 1.
- local_rank : `int`, optional, (default = 0)
+ local_rank : `int`, optional, (default = `0`)
This is the unique identifier of the `Trainer` in a distributed process group. The GPU
device id is used as the rank.
- world_size : `int`, (default = 1)
+ world_size : `int`, (default = `1`)
The number of `Trainer` workers participating in the distributed training.
- num_gradient_accumulation_steps : `int`, optional, (default = 1)
+ num_gradient_accumulation_steps : `int`, optional, (default = `1`)
Gradients are accumulated for the given number of steps before doing an optimizer step. This can
be useful to accommodate batches that are larger than the RAM size. Refer [Thomas Wolf's post][0]
for details on Gradient Accumulation.
diff --git a/allennlp/training/util.py b/allennlp/training/util.py
index c34697023f6..7d1e9cd9af1 100644
--- a/allennlp/training/util.py
+++ b/allennlp/training/util.py
@@ -322,10 +322,10 @@ def evaluate(
data_loader : `DataLoader`
The `DataLoader` that will iterate over the evaluation data (data loaders already contain
their data).
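The `num_gradient_accumulation_steps` parameter documented above is the usual gradient-accumulation trick; here is a generic PyTorch sketch of the idea, with a toy model and random data rather than the trainer's real loop:

```python
import torch

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.MSELoss()
num_gradient_accumulation_steps = 4

optimizer.zero_grad()
for step in range(16):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = loss_fn(model(x), y)
    # Scale so the accumulated gradient matches one large batch, then accumulate.
    (loss / num_gradient_accumulation_steps).backward()
    if (step + 1) % num_gradient_accumulation_steps == 0:
        optimizer.step()       # one parameter update per 4 "micro-batches"
        optimizer.zero_grad()
```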
- cuda_device : `int`, optional (default=-1)
+ cuda_device : `int`, optional (default=`-1`)
The cuda device to use for this evaluation. The model is assumed to already be using this
device; this parameter is only used for moving the input data to the correct device.
- batch_weight_key : `str`, optional (default=None)
+ batch_weight_key : `str`, optional (default=`None`)
If given, this is a key in the output dictionary for each batch that specifies how to weight
the loss for that batch. If this is not given, we use a weight of 1 for every batch.
"""
diff --git a/dev-requirements.txt b/dev-requirements.txt
index c5c8676d598..29b72214cc7 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -31,7 +31,7 @@ git+https://github.com/NVIDIA/apex.git@master
ruamel.yaml
# Generating markdown files from Python modules.
-git+https://github.com/NiklasRosenstein/pydoc-markdown.git@b469839ceb598df3d7e7126b81bff88dfd1343e3
+git+https://github.com/NiklasRosenstein/pydoc-markdown.git@f0bf8af1db4f11581c19d206d4ed1ab34b4854c1
markdown-include==0.5.1
# Package for the material theme for mkdocs
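Finally, to make the `batch_weight_key` behaviour in the `evaluate` hunk concrete: the reported loss is a weighted average of per-batch losses, with a default weight of 1 per batch. A toy sketch with fabricated output dicts (the key name `num_tokens` is made up for illustration):

```python
# Stand-ins for the per-batch output dictionaries produced during evaluation.
batch_outputs = [
    {"loss": 0.50, "num_tokens": 120},
    {"loss": 0.20, "num_tokens": 30},
]

batch_weight_key = "num_tokens"  # with no key, every batch would get weight 1.0

total_loss = total_weight = 0.0
for output in batch_outputs:
    weight = output[batch_weight_key] if batch_weight_key else 1.0
    total_loss += output["loss"] * weight
    total_weight += weight

print(total_loss / total_weight)  # 0.44 here, vs. 0.35 with uniform weights
```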