diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 270b0666..2422e3b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,7 +47,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install Python dependencies run: python -m pip install -r requirements.txt - - name: Test Python library + - name: Install LIT package + run: python -m pip install -e . + - name: Test LIT run: | python -m pip install pytest pytest -v diff --git a/lit_nlp/components/ablation_flip_int_test.py b/lit_nlp/components/ablation_flip_int_test.py index 7d4ee332..10cf9f9e 100644 --- a/lit_nlp/components/ablation_flip_int_test.py +++ b/lit_nlp/components/ablation_flip_int_test.py @@ -19,7 +19,7 @@ from absl.testing import absltest from lit_nlp.api import types from lit_nlp.components import ablation_flip -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import models as glue_models import numpy as np @@ -66,12 +66,12 @@ def setUp(self): self.classification_model = glue_models.SST2Model(BERT_TINY_PATH) self.classification_config = {ablation_flip.PREDICTION_KEY: 'probas'} - # Clasification model with the 'sentence' field marked as + # Classification model with the 'sentence' field marked as # non-required. self.classification_model_non_required_field = SST2ModelNonRequiredField( BERT_TINY_PATH) - # Clasification model with a counter to count number of predict calls. + # Classification model with a counter to count number of predict calls. # TODO(ataly): Consider setting up a Mock object to count number of # predict calls. self.classification_model_with_predict_counter = ( diff --git a/lit_nlp/components/hotflip_int_test.py b/lit_nlp/components/hotflip_int_test.py index b97eed41..c5d7758b 100644 --- a/lit_nlp/components/hotflip_int_test.py +++ b/lit_nlp/components/hotflip_int_test.py @@ -18,7 +18,7 @@ from absl.testing import parameterized from lit_nlp.components import hotflip # TODO(lit-dev): Move glue_models out of lit_nlp/examples -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import models as glue_models import numpy as np from lit_nlp.lib import file_cache diff --git a/lit_nlp/components/tcav_int_test.py b/lit_nlp/components/tcav_int_test.py index 592311e5..041a6a84 100644 --- a/lit_nlp/components/tcav_int_test.py +++ b/lit_nlp/components/tcav_int_test.py @@ -20,7 +20,7 @@ from absl.testing import parameterized from lit_nlp.api import dataset as lit_dataset from lit_nlp.components import tcav -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import models as glue_models from lit_nlp.lib import caching # for hash id fn from lit_nlp.lib import testing_utils diff --git a/lit_nlp/components/thresholder_int_test.py b/lit_nlp/components/thresholder_int_test.py index 3c0c9ca9..9775b79c 100644 --- a/lit_nlp/components/thresholder_int_test.py +++ b/lit_nlp/components/thresholder_int_test.py @@ -19,7 +19,7 @@ from lit_nlp.api import dataset as lit_dataset from lit_nlp.api import types as lit_types from lit_nlp.components import thresholder -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import models as glue_models from lit_nlp.lib import caching # for hash id fn diff --git a/lit_nlp/examples/blank_slate_demo.py b/lit_nlp/examples/blank_slate_demo.py index 32642a74..470f57a9 100644 --- a/lit_nlp/examples/blank_slate_demo.py +++ b/lit_nlp/examples/blank_slate_demo.py @@ -1,4 +1,4 @@ -r"""An blank demo ready to load models and datasets. 
+r"""A blank demo ready to load models and datasets. The currently supported models and datasets are: - classification model on SST-2, with the Stanford Sentiment Treebank dataset. @@ -30,9 +30,9 @@ from lit_nlp import dev_server from lit_nlp import server_flags from lit_nlp.examples.datasets import classification -from lit_nlp.examples.datasets import glue from lit_nlp.examples.datasets import lm -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import data as glue_data +from lit_nlp.examples.glue import models as glue_models from lit_nlp.examples.models import pretrained_lms from lit_nlp.examples.penguin import data as penguin_data from lit_nlp.examples.penguin import model as penguin_model @@ -99,9 +99,9 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: dataset_loaders: lit_app.DatasetLoadersMap = {} # glue demo dataset loaders. - dataset_loaders["sst2"] = (glue.SST2Data, glue.SST2Data.init_spec()) - dataset_loaders["stsb"] = (glue.STSBData, glue.STSBData.init_spec()) - dataset_loaders["mnli"] = (glue.MNLIData, glue.MNLIData.init_spec()) + dataset_loaders["sst2"] = (glue_data.SST2Data, glue_data.SST2Data.init_spec()) + dataset_loaders["stsb"] = (glue_data.STSBData, glue_data.STSBData.init_spec()) + dataset_loaders["mnli"] = (glue_data.MNLIData, glue_data.MNLIData.init_spec()) # penguin demo dataset loaders. dataset_loaders["penguin"] = ( @@ -111,8 +111,8 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: # lm demo dataset loaders. dataset_loaders["sst (lm)"] = ( - glue.SST2DataForLM, - glue.SST2DataForLM.init_spec(), + glue_data.SST2DataForLM, + glue_data.SST2DataForLM.init_spec(), ) dataset_loaders["imdb (lm)"] = ( classification.IMDBData, diff --git a/lit_nlp/examples/custom_module/potato_demo.py b/lit_nlp/examples/custom_module/potato_demo.py index bbaeb141..b9a50594 100644 --- a/lit_nlp/examples/custom_module/potato_demo.py +++ b/lit_nlp/examples/custom_module/potato_demo.py @@ -23,8 +23,8 @@ from lit_nlp import dev_server from lit_nlp import server_flags from lit_nlp.api import layout -from lit_nlp.examples.datasets import glue -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import data as glue_data +from lit_nlp.examples.glue import models as glue_models from lit_nlp.lib import file_cache # NOTE: additional flags defined in server_flags.py @@ -84,7 +84,7 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: model, extract_compressed_file=True) models = {"sst": glue_models.SST2Model(model)} - datasets = {"sst_dev": glue.SST2Data("validation")} + datasets = {"sst_dev": glue_data.SST2Data("validation")} # Start the LIT server. See server_flags.py for server options. lit_demo = dev_server.Server( diff --git a/lit_nlp/examples/datasets/glue.py b/lit_nlp/examples/glue/data.py similarity index 100% rename from lit_nlp/examples/datasets/glue.py rename to lit_nlp/examples/glue/data.py diff --git a/lit_nlp/examples/glue_demo.py b/lit_nlp/examples/glue/demo.py similarity index 87% rename from lit_nlp/examples/glue_demo.py rename to lit_nlp/examples/glue/demo.py index 83b0474f..8fe9d703 100644 --- a/lit_nlp/examples/glue_demo.py +++ b/lit_nlp/examples/glue/demo.py @@ -1,14 +1,15 @@ r"""Example demo loading a handful of GLUE models. 
For a quick-start set of models, run: - python -m lit_nlp.examples.glue_demo \ + blaze run -c opt --config=cuda examples/glue:demo -- \ --quickstart --port=5432 To run with the 'normal' defaults, including full-size BERT models: - python -m lit_nlp.examples.glue_demo --port=5432 + blaze run -c opt --config=cuda examples/glue:demo -- --port=5432 Then navigate to localhost:5432 to access the demo UI. """ + from collections.abc import Sequence import sys from typing import Optional @@ -19,8 +20,8 @@ from lit_nlp import app as lit_app from lit_nlp import dev_server from lit_nlp import server_flags -from lit_nlp.examples.datasets import glue -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import data as glue_data +from lit_nlp.examples.glue import models as glue_models # NOTE: additional flags defined in server_flags.py @@ -29,8 +30,10 @@ FLAGS.set_default("development_demo", True) _QUICKSTART = flags.DEFINE_bool( - "quickstart", False, - "Quick-start mode, loads smaller models and a subset of the full data.") + "quickstart", + False, + "Quick-start mode, loads smaller models and a subset of the full data.", +) _MODELS = flags.DEFINE_list( "models", @@ -50,9 +53,12 @@ ) _MAX_EXAMPLES = flags.DEFINE_integer( - "max_examples", None, "Maximum number of examples to load into LIT. " + "max_examples", + None, + "Maximum number of examples to load into LIT. " "Note: MNLI eval set is 10k examples, so will take a while to run and may " - "be slow on older machines. Set --max_examples=200 for a quick start.") + "be slow on older machines. Set --max_examples=200 for a quick start.", +) MODELS_BY_TASK = { "sst2": glue_models.SST2Model, @@ -123,24 +129,33 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: # split = 'validation' will also work, but this will cause TDFS to download # the entire dataset which can be very slow. split = "https://storage.googleapis.com/what-if-tool-resources/lit-data/sst2.validation.csv" - datasets["sst_dev"] = glue.SST2Data(split) - dataset_loaders["sst2"] = (glue.SST2Data, glue.SST2Data.init_spec()) + datasets["sst_dev"] = glue_data.SST2Data(split) + dataset_loaders["sst2"] = ( + glue_data.SST2Data, + glue_data.SST2Data.init_spec(), + ) if "stsb" in tasks_to_load: logging.info("Loading data for STS-B task.") # split = 'validation' will also work, but this will cause TDFS to download # the entire dataset which can be very slow. split = "https://storage.googleapis.com/what-if-tool-resources/lit-data/stsb.validation.csv" - datasets["stsb_dev"] = glue.STSBData(split) - dataset_loaders["stsb"] = (glue.STSBData, glue.STSBData.init_spec()) + datasets["stsb_dev"] = glue_data.STSBData(split) + dataset_loaders["stsb"] = ( + glue_data.STSBData, + glue_data.STSBData.init_spec(), + ) if "mnli" in tasks_to_load: logging.info("Loading data for MultiNLI task.") # split = 'validation_matched' will also work, but this will cause TDFS to # download the entire dataset which can be very slow. split = "https://storage.googleapis.com/what-if-tool-resources/lit-data/mnli.validation_matched.csv" - datasets["mnli_dev"] = glue.MNLIData(split) - dataset_loaders["mnli"] = (glue.MNLIData, glue.MNLIData.init_spec()) + datasets["mnli_dev"] = glue_data.MNLIData(split) + dataset_loaders["mnli"] = ( + glue_data.MNLIData, + glue_data.MNLIData.init_spec(), + ) # Truncate datasets if --max_examples is set. 
if _MAX_EXAMPLES.value is not None: diff --git a/lit_nlp/examples/models/glue_models_int_test.py b/lit_nlp/examples/glue/model_int_test.py similarity index 82% rename from lit_nlp/examples/models/glue_models_int_test.py rename to lit_nlp/examples/glue/model_int_test.py index fcac0d42..366a9428 100644 --- a/lit_nlp/examples/models/glue_models_int_test.py +++ b/lit_nlp/examples/glue/model_int_test.py @@ -1,8 +1,8 @@ -r"""Integration tests for lit_nlp.examples.models.glue_models. +r"""Integration tests for lit_nlp.examples.glue.models. Test locally with: -blaze test //third_party/py/lit_nlp/examples/models:integration_tests \ +blaze test //third_party/py/lit_nlp/examples/glue:integration_tests \ --guitar_cluster=LOCAL \ --test_output=streamed \ --guitar_detach @@ -11,7 +11,7 @@ from typing import Any from absl.testing import absltest from absl.testing import parameterized -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import models as glue_models from lit_nlp.lib import file_cache @@ -24,7 +24,8 @@ def __init__(self, *args: Any, **kwargs: Any): model_path = "https://storage.googleapis.com/what-if-tool-resources/lit-models/sst2_tiny.tar.gz" # pylint: disable=line-too-long if model_path.endswith(".tar.gz"): model_path = file_cache.cached_path( - model_path, extract_compressed_file=True) + model_path, extract_compressed_file=True + ) self.sst2_model = glue_models.SST2Model(model_path) @parameterized.named_parameters( @@ -48,17 +49,14 @@ def __init__(self, *args: Any, **kwargs: Any): # Common multiple cases dict( testcase_name="no_attention_or_embeddings", - config={ - "output_attention": False, - "output_embeddings": False - }, + config={"output_attention": False, "output_embeddings": False}, ), dict( testcase_name="no_attention_or_embeddings_or_gradients", config={ "compute_grads": False, "output_attention": False, - "output_embeddings": False + "output_embeddings": False, }, ), ) @@ -66,13 +64,14 @@ def test_sst2_model_predict(self, config: dict[str, bool]): # Configure model. if config: self.sst2_model.config = glue_models.GlueModelConfig( - # Include the SST-2 defaut config options + # Include the SST-2 default config options text_a_name="sentence", text_b_name=None, labels=["0", "1"], null_label_idx=0, # Add the output-affecting config options - **config) + **config + ) # Run prediction to ensure no failure. model_in = [{"sentence": "test sentence"}] @@ -83,5 +82,6 @@ def test_sst2_model_predict(self, config: dict[str, bool]): for key in self.sst2_model.output_spec().keys(): self.assertIn(key, model_out[0]) + if __name__ == "__main__": absltest.main() diff --git a/lit_nlp/examples/models/glue_models.py b/lit_nlp/examples/glue/models.py similarity index 84% rename from lit_nlp/examples/models/glue_models.py rename to lit_nlp/examples/glue/models.py index a70cb1f0..9f14fe83 100644 --- a/lit_nlp/examples/models/glue_models.py +++ b/lit_nlp/examples/glue/models.py @@ -1,4 +1,5 @@ """Wrapper for fine-tuned HuggingFace models in LIT.""" + # TODO(b/261736863): Update to PEP 585 typings, consider using f-strings, and # make common substrings into module CONSTANTS. 
@@ -28,6 +29,7 @@ @attr.s(auto_attribs=True, kw_only=True) class GlueModelConfig(object): """Config options for a GlueModel.""" + # Preprocessing options max_seq_length: int = 128 inference_batch_size: int = 32 @@ -79,7 +81,6 @@ class GlueModel(lit_model.BatchedModel): This is a full-featured implementation, which includes embeddings, attention, gradients, as well as support for the different input and output types above. - For a more minimal example, see ../simple_tf2_demo.py. """ def _verify_num_layers(self, hidden_states: Sequence[Any]): @@ -102,9 +103,7 @@ def is_regression(self) -> bool: # TODO(b/254110131): Move file_cache.cached_path() call inside this __init__ # function to reduce boilerplate in other locations (e.g., TCAV tests). - def __init__(self, - model_name_or_path="bert-base-uncased", - **config_kw): + def __init__(self, model_name_or_path="bert-base-uncased", **config_kw): self.config = GlueModelConfig(**config_kw) self._load_model(model_name_or_path) self._lock = threading.Lock() @@ -119,9 +118,11 @@ def _load_model(self, model_name_or_path): ) self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_name_or_path) + model_name_or_path + ) self.vocab = self.tokenizer.convert_ids_to_tokens( - range(len(self.tokenizer))) + range(len(self.tokenizer)) + ) model_config = transformers.AutoConfig.from_pretrained( model_name_or_path, num_labels=1 if self.is_regression else len(self.config.labels), @@ -131,12 +132,14 @@ def _load_model(self, model_name_or_path): self.model = model_utils.load_pretrained( transformers.TFAutoModelForSequenceClassification, model_name_or_path, - config=model_config) + config=model_config, + ) def _get_tokens(self, ex: JsonDict, field_name: str) -> list[str]: with self._lock: - return (ex.get("tokens_" + field_name) or - self.tokenizer.tokenize(ex[field_name])) + return ex.get("tokens_" + field_name) or self.tokenizer.tokenize( + ex[field_name] + ) def _preprocess(self, inputs: Iterable[JsonDict]) -> dict[str, tf.Tensor]: # Use pretokenized input if available. @@ -152,20 +155,26 @@ def _preprocess(self, inputs: Iterable[JsonDict]) -> dict[str, tf.Tensor]: self.tokenizer, tokens_a, tokens_b, - max_length=self.config.max_seq_length) + max_length=self.config.max_seq_length, + ) return encoded_input # pytype: disable=bad-return-type def _make_dataset(self, inputs: Iterable[JsonDict]) -> tf.data.Dataset: """Make a tf.data.Dataset from inputs in LIT format.""" encoded_input = self._preprocess(inputs) if self.is_regression: - labels = tf.constant([ex[self.config.label_name] for ex in inputs], - dtype=tf.float32) + labels = tf.constant( + [ex[self.config.label_name] for ex in inputs], dtype=tf.float32 + ) else: - labels = tf.constant([ - self.config.labels.index(ex[self.config.label_name]) for ex in inputs - ], - dtype=tf.int64) + indexes = [] + if self.config.labels is not None: + for ex in inputs: + indexes.append(self.config.labels.index(ex[self.config.label_name])) + labels = tf.constant( + indexes, + dtype=tf.int64, + ) # encoded_input is actually a transformers.BatchEncoding # object, which tf.data.Dataset doesn't like. Convert to a regular dict. 
return tf.data.Dataset.from_tensor_slices((dict(encoded_input), labels)) @@ -180,13 +189,18 @@ def train( keras_callbacks=None, ): """Run fine-tuning.""" - train_dataset = self._make_dataset(train_inputs).shuffle(128).batch( - batch_size).repeat(-1) + train_dataset = ( + self._make_dataset(train_inputs) + .shuffle(128) + .batch(batch_size) + .repeat(-1) + ) # Use larger batch for validation since inference is about 1/2 memory usage # of backprop. eval_batch_size = 2 * batch_size validation_dataset = self._make_dataset(validation_inputs).batch( - eval_batch_size) + eval_batch_size + ) # Prepare model for training. opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08) @@ -207,7 +221,8 @@ def train( validation_data=validation_dataset, validation_steps=validation_steps, callbacks=keras_callbacks, - verbose=2) + verbose=2, + ) return history def save(self, path: str): @@ -248,7 +263,8 @@ def _postprocess(self, output: dict[str, Any]): """Per-example postprocessing, on NumPy output.""" ntok = output.pop("ntok") output["tokens"] = self.tokenizer.convert_ids_to_tokens( - output.pop("input_ids")[:ntok]) + output.pop("input_ids")[:ntok] + ) # Tokens for each segment, individually. slicer_a, slicer_b = self._segment_slicers(output["tokens"]) @@ -258,21 +274,25 @@ def _postprocess(self, output: dict[str, Any]): # Embeddings for each segment, individually. if self.config.output_embeddings: - output["input_embs_" + self.config.text_a_name] = ( - output["input_embs"][slicer_a]) + output["input_embs_" + self.config.text_a_name] = output["input_embs"][ + slicer_a + ] if self.config.text_b_name: - output["input_embs_" + self.config.text_b_name] = ( - output["input_embs"][slicer_b]) + output["input_embs_" + self.config.text_b_name] = output["input_embs"][ + slicer_b + ] # Gradients for each segment, individually. if self.config.compute_grads: # Gradients for the CLS token. output["cls_grad"] = output["input_emb_grad"][0] - output["token_grad_" + - self.config.text_a_name] = output["input_emb_grad"][slicer_a] + output["token_grad_" + self.config.text_a_name] = output[ + "input_emb_grad" + ][slicer_a] if self.config.text_b_name: - output["token_grad_" + - self.config.text_b_name] = output["input_emb_grad"][slicer_b] + output["token_grad_" + self.config.text_b_name] = output[ + "input_emb_grad" + ][slicer_b] # TODO(b/294613507): remove output[self.config.label_name] once TCAV # is updated. @@ -302,8 +322,9 @@ def _postprocess(self, output: dict[str, Any]): return output - def _scatter_embs(self, passed_input_embs, input_embs, batch_indices, - offsets): + def _scatter_embs( + self, passed_input_embs, input_embs, batch_indices, offsets + ): """Scatters custom passed embeddings into the default model embeddings. Args: @@ -331,9 +352,10 @@ def _scatter_embs(self, passed_input_embs, input_embs, batch_indices, # values that should be scattered in, i.e. one for each of the # (scatter_batch_size * num_tokens) word embeddings. scatter_indices = [] - for (batch_index, sentence_embs, offset) in zip(batch_indices, - filtered_embs, offsets): - for (token_index, _) in enumerate(sentence_embs): + for batch_index, sentence_embs, offset in zip( + batch_indices, filtered_embs, offsets + ): + for token_index, _ in enumerate(sentence_embs): scatter_indices.append([batch_index, token_index + offset]) # Scatters passed word embeddings into embeddings gathered from tokens. @@ -351,26 +373,36 @@ def scatter_all_embeddings(self, inputs, input_embs): The default model embeddings with scattered custom embeddings. 
""" # Gets batch indices of any word embeddings that were passed for text_a. - passed_input_embs_a = [ex.get("input_embs_" + self.config.text_a_name) - for ex in inputs] - batch_indices_a = [index for (index, emb) in enumerate( - passed_input_embs_a) if emb is not None] + passed_input_embs_a = [ + ex.get("input_embs_" + self.config.text_a_name) for ex in inputs + ] + batch_indices_a = [ + index + for (index, emb) in enumerate(passed_input_embs_a) + if emb is not None + ] # If word embeddings were passed in for text_a, scatter them into the # embeddings, gathered from the input ids. 1 is passed in as the offset # for each, since text_a starts at index 1, after the [CLS] token. if batch_indices_a: input_embs = self._scatter_embs( - passed_input_embs_a, input_embs, batch_indices_a, - offsets=np.ones(len(batch_indices_a), dtype=np.int64)) + passed_input_embs_a, + input_embs, + batch_indices_a, + offsets=np.ones(len(batch_indices_a), dtype=np.int64), + ) if self.config.text_b_name: # Gets batch indices of any word embeddings that were passed for text_b. - passed_input_embs_b = [ex.get("input_embs_" + self.config.text_b_name) - for ex in inputs] + passed_input_embs_b = [ + ex.get("input_embs_" + self.config.text_b_name) for ex in inputs + ] batch_indices_b = [ - index for (index, emb) in enumerate(passed_input_embs_b) - if emb is not None] + index + for (index, emb) in enumerate(passed_input_embs_b) + if emb is not None + ] # If word embeddings were also passed in for text_b, scatter them into the # embeddings gathered from the input ids. The offsets are the [lengths @@ -378,11 +410,15 @@ def scatter_all_embeddings(self, inputs, input_embs): # [CLS] [text_a tokens] [SEP]. (This assumes that text_b embeddings # will only be passed together with text_a embeddings.) if batch_indices_b: - lengths = np.array([len(embed) for embed in passed_input_embs_a - if embed is not None]) + lengths = np.array( + [len(embed) for embed in passed_input_embs_a if embed is not None] + ) input_embs = self._scatter_embs( - passed_input_embs_b, input_embs, batch_indices_b, - offsets=(lengths + 2)) + passed_input_embs_b, + input_embs, + batch_indices_b, + offsets=(lengths + 2), + ) return input_embs def get_target_scores(self, inputs: Iterable[JsonDict], scores): @@ -401,10 +437,12 @@ def get_target_scores(self, inputs: Iterable[JsonDict], scores): for (i, ex) in enumerate(inputs) ] # Convert the class names to indices if needed. - grad_idxs = [ - self.config.labels.index(label) if isinstance(label, str) else label - for label in grad_classes - ] + grad_idxs = [] + for label in grad_classes: + if isinstance(label, str) and self.config.labels is not None: + grad_idxs.append(self.config.labels.index(label)) + else: + grad_idxs.append(label) # list of tuples (batch idx, label idx) gather_indices = list(enumerate(grad_idxs)) # [batch_size] @@ -427,7 +465,8 @@ def predict_minibatch(self, inputs: Iterable[JsonDict]): # Use watch_accessed_variables to save memory by having the tape do nothing # if we don't need gradients. 
with tf.GradientTape( - watch_accessed_variables=self.config.compute_grads) as tape: + watch_accessed_variables=self.config.compute_grads + ) as tape: encoded_input = self._preprocess(inputs) # Gathers word embeddings from BERT model embedding layer using input ids @@ -451,7 +490,8 @@ def predict_minibatch(self, inputs: Iterable[JsonDict]): training=False, output_hidden_states=True, output_attentions=True, - return_dict=True) + return_dict=True, + ) batched_outputs = { "input_ids": encoded_input["input_ids"], @@ -466,21 +506,25 @@ def predict_minibatch(self, inputs: Iterable[JsonDict]): # [batch_size, num_tokens, 1] token_mask = tf.expand_dims( - tf.cast(encoded_input["attention_mask"], tf.float32), axis=2) + tf.cast(encoded_input["attention_mask"], tf.float32), axis=2 + ) # [batch_size, 1] denom = tf.reduce_sum(token_mask, axis=1) for i, layer_output in enumerate(out.hidden_states): # layer_output is [batch_size, num_tokens, emb_dim] # average over tokens to get [batch_size, emb_dim] - batched_outputs[f"layer_{i}/avg_emb"] = tf.reduce_sum( - layer_output * token_mask, axis=1) / denom + batched_outputs[f"layer_{i}/avg_emb"] = ( + tf.reduce_sum(layer_output * token_mask, axis=1) / denom + ) if self.config.output_attention: if len(out.attentions) != self.model.config.num_hidden_layers: - raise ValueError("Unexpected size of attentions. Should be the same " - "size as the number of hidden layers. Expected " - f"{self.model.config.num_hidden_layers}, got " - f"{len(out.attentions)}.") + raise ValueError( + "Unexpected size of attentions. Should be the same " + "size as the number of hidden layers. Expected " + f"{self.model.config.num_hidden_layers}, got " + f"{len(out.attentions)}." + ) for i, layer_attention in enumerate(out.attentions): batched_outputs[f"layer_{i+1}/attention"] = layer_attention @@ -511,7 +555,11 @@ def predict_minibatch(self, inputs: Iterable[JsonDict]): scalar_targets, input_embs ) - detached_outputs = {k: v.numpy() for k, v in batched_outputs.items()} + detached_outputs = { + k: v.numpy() + for k, v in batched_outputs.items() + if v is not None + } # Sequence of dicts, one per example. unbatched_outputs = utils.unbatch_preds(detached_outputs) return map(self._postprocess, unbatched_outputs) @@ -520,24 +568,28 @@ def input_spec(self) -> Spec: ret = {} ret[self.config.text_a_name] = lit_types.TextSegment() ret["tokens_" + self.config.text_a_name] = lit_types.Tokens( - parent=self.config.text_a_name, required=False) + parent=self.config.text_a_name, required=False + ) if self.config.text_b_name: ret[self.config.text_b_name] = lit_types.TextSegment() ret["tokens_" + self.config.text_b_name] = lit_types.Tokens( - parent=self.config.text_b_name, required=False) + parent=self.config.text_b_name, required=False + ) if self.is_regression: ret[self.config.label_name] = lit_types.Scalar(required=False) else: ret[self.config.label_name] = lit_types.CategoryLabel( - required=False, vocab=self.config.labels) + required=False, vocab=self.config.labels + ) if self.config.output_embeddings: # The input_embs_ fields are used for Integrated Gradients. 
text_a_embs = "input_embs_" + self.config.text_a_name ret[text_a_embs] = lit_types.TokenEmbeddings( - align="tokens", required=False) + align="tokens", required=False + ) if self.config.text_b_name: text_b_embs = "input_embs_" + self.config.text_b_name ret[text_b_embs] = lit_types.TokenEmbeddings( @@ -548,17 +600,20 @@ def input_spec(self) -> Spec: def output_spec(self) -> Spec: ret = {"tokens": lit_types.Tokens()} ret["tokens_" + self.config.text_a_name] = lit_types.Tokens( - parent=self.config.text_a_name) + parent=self.config.text_a_name + ) if self.config.text_b_name: ret["tokens_" + self.config.text_b_name] = lit_types.Tokens( - parent=self.config.text_b_name) + parent=self.config.text_b_name + ) if self.is_regression: ret["score"] = lit_types.RegressionScore(parent=self.config.label_name) else: ret["probas"] = lit_types.MulticlassPreds( parent=self.config.label_name, vocab=self.config.labels, - null_idx=self.config.null_label_idx) + null_idx=self.config.null_label_idx, + ) if self.config.output_embeddings: ret["cls_emb"] = lit_types.Embeddings() @@ -568,11 +623,13 @@ def output_spec(self) -> Spec: # The input_embs_ fields are used for Integrated Gradients. ret["input_embs_" + self.config.text_a_name] = lit_types.TokenEmbeddings( - align="tokens_" + self.config.text_a_name) + align="tokens_" + self.config.text_a_name + ) if self.config.text_b_name: text_b_embs = "input_embs_" + self.config.text_b_name - ret[text_b_embs] = lit_types.TokenEmbeddings(align="tokens_" + - self.config.text_b_name) + ret[text_b_embs] = lit_types.TokenEmbeddings( + align="tokens_" + self.config.text_b_name + ) # Gradients, if requested. if self.config.compute_grads: @@ -604,7 +661,8 @@ def output_spec(self) -> Spec: # Attention heads, one field for each layer. for i in range(self.model.config.num_hidden_layers): ret[f"layer_{i+1}/attention"] = lit_types.AttentionHeads( - align_in="tokens", align_out="tokens") + align_in="tokens", align_out="tokens" + ) return ret @@ -618,7 +676,8 @@ def __init__(self, *args, **kw): text_b_name=None, labels=["0", "1"], null_label_idx=0, - **kw) + **kw, + ) class MNLIModel(GlueModel): @@ -630,7 +689,8 @@ def __init__(self, *args, **kw): text_a_name="premise", text_b_name="hypothesis", labels=["entailment", "neutral", "contradiction"], - **kw) + **kw, + ) class STSBModel(GlueModel): @@ -642,7 +702,8 @@ def __init__(self, *args, **kw): text_a_name="sentence1", text_b_name="sentence2", labels=None, - **kw) + **kw, + ) def input_spec(self): ret = super().input_spec() diff --git a/lit_nlp/examples/models/glue_models_test.py b/lit_nlp/examples/glue/models_test.py similarity index 61% rename from lit_nlp/examples/models/glue_models_test.py rename to lit_nlp/examples/glue/models_test.py index 1ce1ddbf..68d9b61a 100644 --- a/lit_nlp/examples/models/glue_models_test.py +++ b/lit_nlp/examples/glue/models_test.py @@ -3,7 +3,7 @@ from absl.testing import absltest from absl.testing import parameterized import attr -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import models as glue_models import numpy as np @@ -60,10 +60,7 @@ class GlueModelsTest(parameterized.TestCase): # Common multiple cases dict( testcase_name="no_attention_or_embeddings", - config={ - "output_attention": False, - "output_embeddings": False - }, + config={"output_attention": False, "output_embeddings": False}, expect_attention=False, expect_embs=False, expect_grads=True, @@ -73,18 +70,23 @@ class GlueModelsTest(parameterized.TestCase): config={ "compute_grads": False, "output_attention": 
False, - "output_embeddings": False + "output_embeddings": False, }, expect_attention=False, expect_embs=False, expect_grads=False, ), ) - def test_spec_affecting_config_options(self, config: dict[str, bool], - expect_attention: bool, - expect_embs: bool, expect_grads: bool): + def test_spec_affecting_config_options( + self, + config: dict[str, bool], + expect_attention: bool, + expect_embs: bool, + expect_grads: bool, + ): model = GlueModelForTesting( - model_name_or_path="bert-base-uncased", **config) + model_name_or_path="bert-base-uncased", **config + ) input_spec = model.input_spec() output_spec = model.output_spec() @@ -139,27 +141,34 @@ def test_spec_affecting_config_options(self, config: dict[str, bool], def test_scatter_all_embeddings_single_input(self): glue_model = GlueModelForTesting( - model_name_or_path="bert-base-uncased", - text_a_name="sentence1") + model_name_or_path="bert-base-uncased", text_a_name="sentence1" + ) emb_size = 10 # We'll inject zeros for the embeddings of 'hi', # while special tokens get vectors of 1s. embs_a = np.zeros((1, emb_size)) input_embs = np.ones((1, 3, emb_size)) # Scatter embs_a into input_embs - result = glue_model.scatter_all_embeddings([{"sentence1": "hi", - "input_embs_sentence1": embs_a, - }], input_embs) - target = [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]] + result = glue_model.scatter_all_embeddings( + [{ + "sentence1": "hi", + "input_embs_sentence1": embs_a, + }], + input_embs, + ) + target = [[ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + ]] np.testing.assert_almost_equal(result, target) def test_scatter_all_embeddings_both_inputs(self): glue_model = GlueModelForTesting( model_name_or_path="bert-base-uncased", text_a_name="sentence1", - text_b_name="sentence2") + text_b_name="sentence2", + ) emb_size = 10 # Inject zeros at positions corresponding to real tokens # in each segment. Special tokens get vectors of 1s. 
@@ -167,74 +176,86 @@ def test_scatter_all_embeddings_both_inputs(self): embs_b = np.zeros((3, emb_size)) input_embs = np.ones((1, 7, emb_size)) # Scatter embs_a and embs_b into input_embs - result = glue_model.scatter_all_embeddings([{"sentence1": "hi", - "input_embs_sentence1": embs_a, - "sentence2": "how are you", - "input_embs_sentence2": embs_b - }], input_embs) - target = [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]] + result = glue_model.scatter_all_embeddings( + [{ + "sentence1": "hi", + "input_embs_sentence1": embs_a, + "sentence2": "how are you", + "input_embs_sentence2": embs_b, + }], + input_embs, + ) + target = [[ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + ]] np.testing.assert_almost_equal(result, target) def test_scatter_all_embeddings_multi_batch(self): glue_model = GlueModelForTesting( - model_name_or_path="bert-base-uncased", - text_a_name="sentence1") + model_name_or_path="bert-base-uncased", text_a_name="sentence1" + ) emb_size = 4 embs_a = np.zeros((1, emb_size)) embs_b = np.zeros((2, emb_size)) input_embs = np.ones((2, 4, emb_size)) # Scatter embs_a and embs_b into input_embs - result = glue_model.scatter_all_embeddings([{"sentence1": "hi", - "input_embs_sentence1": embs_a, - }, - {"sentence1": "hi there", - "input_embs_sentence1": embs_b, - }], input_embs) - target = [[[1, 1, 1, 1], - [0, 0, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]], - [[1, 1, 1, 1], - [0, 0, 0, 0], - [0, 0, 0, 0], - [1, 1, 1, 1]]] + result = glue_model.scatter_all_embeddings( + [ + { + "sentence1": "hi", + "input_embs_sentence1": embs_a, + }, + { + "sentence1": "hi there", + "input_embs_sentence1": embs_b, + }, + ], + input_embs, + ) + target = [ + [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1]], + ] np.testing.assert_almost_equal(result, target) # Scatter only embs_a into input_embs - result = glue_model.scatter_all_embeddings([{"sentence1": "hi", - "input_embs_sentence1": embs_a, - }, - {"sentence1": "hi there" - }], input_embs) - target = [[[1, 1, 1, 1], - [0, 0, 0, 0], - [1, 1, 1, 1], - [1, 1, 1, 1]], - [[1, 1, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1]]] + result = glue_model.scatter_all_embeddings( + [ + { + "sentence1": "hi", + "input_embs_sentence1": embs_a, + }, + {"sentence1": "hi there"}, + ], + input_embs, + ) + target = [ + [[1, 1, 1, 1], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], + ] np.testing.assert_almost_equal(result, target) # Scatter only embs_b into input_embs - result = glue_model.scatter_all_embeddings([{"sentence1": "hi"}, - {"sentence1": "hi there", - "input_embs_sentence1": embs_b, - }], input_embs) - target = [[[1, 1, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1], - [1, 1, 1, 1]], - [[1, 1, 1, 1], - [0, 0, 0, 0], - [0, 0, 0, 0], - [1, 1, 1, 1]]] + result = glue_model.scatter_all_embeddings( + [ + {"sentence1": "hi"}, + { + "sentence1": "hi there", + "input_embs_sentence1": embs_b, + }, + ], + input_embs, + ) + target = [ + [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], + [[1, 1, 1, 1], [0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1]], + ] 
np.testing.assert_almost_equal(result, target) diff --git a/lit_nlp/examples/lm_demo.py b/lit_nlp/examples/lm_demo.py index e6b3d422..5af2156b 100644 --- a/lit_nlp/examples/lm_demo.py +++ b/lit_nlp/examples/lm_demo.py @@ -18,15 +18,14 @@ from absl import app from absl import flags from absl import logging - from lit_nlp import app as lit_app from lit_nlp import dev_server from lit_nlp import server_flags from lit_nlp.api import layout from lit_nlp.components import word_replacer from lit_nlp.examples.datasets import classification -from lit_nlp.examples.datasets import glue from lit_nlp.examples.datasets import lm +from lit_nlp.examples.glue import data as glue_data from lit_nlp.examples.models import pretrained_lms # NOTE: additional flags defined in server_flags.py @@ -46,7 +45,8 @@ ) _TOP_K = flags.DEFINE_integer( - "top_k", 10, "Rank to which the output distribution is pruned.") + "top_k", 10, "Rank to which the output distribution is pruned." +) _MAX_EXAMPLES = flags.DEFINE_integer( "max_examples", @@ -130,7 +130,7 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: datasets = { # Single sentences from movie reviews (SST dev set). - "sst_dev": glue.SST2Data("validation").remap({"sentence": "text"}), + "sst_dev": glue_data.SST2Data("validation").remap({"sentence": "text"}), # Longer passages from movie reviews (IMDB dataset, test split). "imdb_train": classification.IMDBData("test"), # Empty dataset, if you just want to type sentences into the UI. @@ -138,7 +138,7 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: } dataset_loaders: lit_app.DatasetLoadersMap = { - "sst_dev": (glue.SST2DataForLM, glue.SST2DataForLM.init_spec()), + "sst_dev": (glue_data.SST2DataForLM, glue_data.SST2DataForLM.init_spec()), "imdb_train": ( classification.IMDBData, classification.IMDBData.init_spec(), @@ -154,14 +154,15 @@ def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: if _LOAD_BWB.value: # A few sentences from the Billion Word Benchmark (Chelba et al. 2013). datasets["bwb"] = lm.BillionWordBenchmark( - "train", max_examples=_MAX_EXAMPLES.value) + "train", max_examples=_MAX_EXAMPLES.value + ) dataset_loaders["bwb"] = ( lm.BillionWordBenchmark, lm.BillionWordBenchmark.init_spec(), ) for name in datasets: - datasets[name] = datasets[name].slice[:_MAX_EXAMPLES.value] + datasets[name] = datasets[name].slice[: _MAX_EXAMPLES.value] logging.info("Dataset: '%s' with %d examples", name, len(datasets[name])) generators = {"word_replacer": word_replacer.WordReplacer()} diff --git a/lit_nlp/examples/notebooks/LIT_components_example.ipynb b/lit_nlp/examples/notebooks/LIT_components_example.ipynb index 81da1a29..3a73924f 100644 --- a/lit_nlp/examples/notebooks/LIT_components_example.ipynb +++ b/lit_nlp/examples/notebooks/LIT_components_example.ipynb @@ -26,8 +26,8 @@ "import pandas as pd\n", "\n", "from lit_nlp import notebook\n", - "from lit_nlp.examples.datasets import glue\n", - "from lit_nlp.examples.models import glue_models\n", + "from lit_nlp.examples.glue import data\n", + "from lit_nlp.examples.glue import models\n", "\n", "# Hide INFO and lower logs. 
Comment this out for debugging.\n", "from absl import logging\n", @@ -57,7 +57,7 @@ }, "outputs": [], "source": [ - "sst_data = glue.SST2Data('validation')\n", + "sst_data = data.SST2Data('validation')\n", "sst_data.spec()" ] }, @@ -130,7 +130,7 @@ "!mkdir sst2_tiny\n", "!tar -xvf sst2_tiny.tar.gz -C sst2_tiny\n", "\n", - "sentiment_model = glue_models.SST2Model('./sst2_tiny')\n", + "sentiment_model = models.SST2Model('./sst2_tiny')\n", "sentiment_model.input_spec(), sentiment_model.output_spec()" ] }, diff --git a/lit_nlp/examples/notebooks/LIT_sentiment_classifier.ipynb b/lit_nlp/examples/notebooks/LIT_sentiment_classifier.ipynb index 516a6e3b..cab395fe 100644 --- a/lit_nlp/examples/notebooks/LIT_sentiment_classifier.ipynb +++ b/lit_nlp/examples/notebooks/LIT_sentiment_classifier.ipynb @@ -25,8 +25,8 @@ "outputs": [], "source": [ "from lit_nlp import notebook\n", - "from lit_nlp.examples.datasets import glue\n", - "from lit_nlp.examples.models import glue_models\n", + "from lit_nlp.examples.glue import data\n", + "from lit_nlp.examples.glue import models\n", "\n", "# Hide INFO and lower logs. Comment this out for debugging.\n", "from absl import logging\n", @@ -55,8 +55,8 @@ "outputs": [], "source": [ "# Create the LIT widget with the model and dataset to analyze.\n", - "datasets = {'sst_dev': glue.SST2Data('validation')}\n", - "models = {'sst_tiny': glue_models.SST2Model('./')}\n", + "datasets = {'sst_dev': data.SST2Data('validation')}\n", + "models = {'sst_tiny': models.SST2Model('./')}\n", "\n", "widget = notebook.LitWidget(models, datasets, port=8890)" ] diff --git a/lit_nlp/examples/quickstart_sst_demo.py b/lit_nlp/examples/quickstart_sst_demo.py deleted file mode 100644 index 0276d1b8..00000000 --- a/lit_nlp/examples/quickstart_sst_demo.py +++ /dev/null @@ -1,82 +0,0 @@ -r"""Quick-start demo for a sentiment analysis model. - -This demo fine-tunes a small Transformer (BERT-tiny) on the Stanford Sentiment -Treebank (SST-2), and starts a LIT server. - -To run locally: - python -m lit_nlp.examples.quickstart_sst_demo --port=5432 - -Training should take less than 5 minutes on a single GPU. Once you see the -ASCII-art LIT logo, navigate to localhost:5432 to access the demo UI. -""" - -from collections.abc import Sequence -import sys -import tempfile -from typing import Optional - -from absl import app -from absl import flags -from absl import logging - -from lit_nlp import dev_server -from lit_nlp import server_flags -from lit_nlp.examples.datasets import glue -from lit_nlp.examples.models import glue_models - -# NOTE: additional flags defined in server_flags.py - -FLAGS = flags.FLAGS - -FLAGS.set_default("development_demo", True) - -_ENCODER_NAME = flags.DEFINE_string( - "encoder_name", "google/bert_uncased_L-2_H-128_A-2", - "Encoder name to use for fine-tuning. See https://huggingface.co/models.") - -_MODEL_PATH = flags.DEFINE_string("model_path", None, - "Path to save trained model.") - - -def get_wsgi_app() -> Optional[dev_server.LitServerType]: - """Returns a LitApp instance for consumption by gunicorn.""" - FLAGS.set_default("server_type", "external") - FLAGS.set_default("demo_mode", True) - # Parse flags without calling app.run(main), to avoid conflict with - # gunicorn command line flags. 
- unused = flags.FLAGS(sys.argv, known_only=True) - if unused: - logging.info( - "quickstart_sst_demo:get_wsgi_app() called with unused " - "args: %s", unused) - return main([]) - - -def run_finetuning(train_path): - """Fine-tune a transformer model.""" - train_data = glue.SST2Data("train") - val_data = glue.SST2Data("validation") - model = glue_models.SST2Model(_ENCODER_NAME.value) - model.train(train_data.examples, validation_inputs=val_data.examples) - model.save(train_path) - - -def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - model_path = _MODEL_PATH.value or tempfile.mkdtemp() - logging.info("Working directory: %s", model_path) - run_finetuning(model_path) - - # Load our trained model. - models = {"sst": glue_models.SST2Model(model_path)} - datasets = {"sst_dev": glue.SST2Data("validation")} - - # Start the LIT server. See server_flags.py for server options. - lit_demo = dev_server.Server(models, datasets, **server_flags.get_flags()) - return lit_demo.serve() - - -if __name__ == "__main__": - app.run(main) diff --git a/lit_nlp/examples/simple_pytorch_demo.py b/lit_nlp/examples/simple_pytorch_demo.py deleted file mode 100644 index cde642db..00000000 --- a/lit_nlp/examples/simple_pytorch_demo.py +++ /dev/null @@ -1,197 +0,0 @@ -r"""Code example for a custom model, using PyTorch. - -This demo shows how to use a custom model with LIT, in just a few lines of code. -We'll use a transformers model, with a minimal amount of code to implement the -LIT API. Compared to models/glue_models.py, this has fewer features, but the -code is more readable. - -This demo is equivalent in functionality to simple_tf2_demo.py, but uses PyTorch -instead of TensorFlow 2. The models behave identically as far as LIT is -concerned, and the implementation is quite similar - to see changes, run: - git diff --no-index simple_tf2_demo.py simple_pytorch_demo.py - -The transformers library can load weights from either, -so you can use any saved model compatible with the underlying model class -(AutoModelForSequenceClassification). To train something for this demo, you can: -- Use quickstart_sst_demo.py, and set --model_path to somewhere durable -- Or: Use tools/glue_trainer.py -- Or: Use any fine-tuning code that works with transformers, such as -https://github.com/huggingface/transformers#quick-tour-of-the-fine-tuningusage-scripts - -To run locally: - python -m lit_nlp.examples.simple_pytorch_demo \ - --port=5432 --model_path=/path/to/saved/model - -Then navigate to localhost:5432 to access the demo UI. - -NOTE: this demo still uses TensorFlow Datasets (which depends on TensorFlow) to -load the data. However, the output of glue.SST2Data is just NumPy arrays and -plain Python data, and you can easily replace this with a different library or -directly loading from CSV. -""" - -from collections.abc import Sequence -import sys -from typing import Optional - -from absl import app -from absl import flags -from absl import logging - -from lit_nlp import dev_server -from lit_nlp import server_flags -from lit_nlp.api import model as lit_model -from lit_nlp.api import types as lit_types -# Use the regular GLUE data loaders, because these are very simple already. 
-from lit_nlp.examples.datasets import glue -from lit_nlp.lib import file_cache -from lit_nlp.lib import utils - -import torch -import transformers - -# NOTE: additional flags defined in server_flags.py - -FLAGS = flags.FLAGS - -FLAGS.set_default("development_demo", True) - -_MODEL_PATH = flags.DEFINE_string( - "model_path", - "https://storage.googleapis.com/what-if-tool-resources/lit-models/sst2_tiny.tar.gz", - "Path to trained model, in standard transformers format, e.g. as " - "saved by model.save_pretrained() and tokenizer.save_pretrained()" -) - -SequenceClassifierOutput = ( - transformers.modeling_outputs.SequenceClassifierOutput -) - - -def _from_pretrained(cls, *args, **kw): - """Load a transformers model in PyTorch, with fallback to TF2/Keras weights.""" - try: - return cls.from_pretrained(*args, **kw) - except OSError as e: - logging.warning("Caught OSError loading model: %s", e) - logging.warning( - "Re-trying to convert from TensorFlow checkpoint (from_tf=True)") - return cls.from_pretrained(*args, from_tf=True, **kw) - - -class SimpleSentimentModel(lit_model.BatchedModel): - """Simple sentiment analysis model.""" - - LABELS = ["0", "1"] # negative, positive - - def __init__(self, model_name_or_path): - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_name_or_path) - model_config = transformers.AutoConfig.from_pretrained( - model_name_or_path, - num_labels=2, - output_hidden_states=True, - output_attentions=True, - ) - # This is a just a regular PyTorch model. - self.model = _from_pretrained( - transformers.AutoModelForSequenceClassification, - model_name_or_path, - config=model_config) - self.model.eval() - - ## - # LIT API implementation - def max_minibatch_size(self): - # This tells lit_model.BatchedModel.predict() how to batch inputs to - # predict_minibatch(). - # Alternately, you can just override predict() and handle batching yourself. - return 32 - - def predict_minibatch(self, inputs): - # Preprocess to ids and masks, and make the input batch. - encoded_input = self.tokenizer.batch_encode_plus( - [ex["sentence"] for ex in inputs], - return_tensors="pt", - add_special_tokens=True, - max_length=128, - padding="longest", - truncation="longest_first") - - # Check and send to cuda (GPU) if available - if torch.cuda.is_available(): - self.model.cuda() - for tensor in encoded_input: - encoded_input[tensor] = encoded_input[tensor].cuda() - # Run a forward pass. - with torch.no_grad(): # remove this if you need gradients. - out: SequenceClassifierOutput = self.model(**encoded_input) - - # Post-process outputs. - batched_outputs = { - "probas": torch.nn.functional.softmax(out.logits, dim=-1), - "input_ids": encoded_input["input_ids"], - "ntok": torch.sum(encoded_input["attention_mask"], dim=1), - "cls_emb": out.hidden_states[-1][:, 0], # last layer, first token - } - # Return as NumPy for further processing. - detached_outputs = {k: v.cpu().numpy() for k, v in batched_outputs.items()} - # Unbatch outputs so we get one record per input example. 
- for output in utils.unbatch_preds(detached_outputs): - ntok = output.pop("ntok") - output["tokens"] = self.tokenizer.convert_ids_to_tokens( - output.pop("input_ids")[1:ntok - 1]) - yield output - - def input_spec(self) -> lit_types.Spec: - return { - "sentence": lit_types.TextSegment(), - "label": lit_types.CategoryLabel(vocab=self.LABELS, required=False) - } - - def output_spec(self) -> lit_types.Spec: - return { - "tokens": lit_types.Tokens(), - "probas": lit_types.MulticlassPreds(parent="label", vocab=self.LABELS, - null_idx=0), - "cls_emb": lit_types.Embeddings() - } - - -def get_wsgi_app() -> Optional[dev_server.LitServerType]: - """Returns a LitApp instance for consumption by gunicorn.""" - FLAGS.set_default("server_type", "external") - FLAGS.set_default("demo_mode", True) - # Parse flags without calling app.run(main), to avoid conflict with - # gunicorn command line flags. - unused = flags.FLAGS(sys.argv, known_only=True) - if unused: - logging.info( - "simple_pytorch_demo:get_wsgi_app() called with unused " - "args: %s", unused) - return main([]) - - -def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - # Normally path is a directory; if it's an archive file, download and - # extract to the transformers cache. - model_path = _MODEL_PATH.value - if model_path.endswith(".tar.gz"): - model_path = file_cache.cached_path( - model_path, extract_compressed_file=True) - - # Load the model we defined above. - models = {"sst": SimpleSentimentModel(model_path)} - # Load SST-2 validation set from TFDS. - datasets = {"sst_dev": glue.SST2Data("validation")} - - # Start the LIT server. See server_flags.py for server options. - lit_demo = dev_server.Server(models, datasets, **server_flags.get_flags()) - return lit_demo.serve() - - -if __name__ == "__main__": - app.run(main) diff --git a/lit_nlp/examples/simple_tf2_demo.py b/lit_nlp/examples/simple_tf2_demo.py deleted file mode 100644 index c606958b..00000000 --- a/lit_nlp/examples/simple_tf2_demo.py +++ /dev/null @@ -1,184 +0,0 @@ -r"""Code example for a custom model, using TensorFlow 2. - -This demo shows how to use a custom model with LIT, in just a few lines of code. -We'll use a transformers model, with a minimal amount of code to implement the -LIT API. Compared to models/glue_models.py, this has fewer features, but the -code is more readable. - -This demo is equivalent in functionality to simple_pytorch_demo.py, but uses -TensorFlow 2 instead of PyTorch. The models behave identically as far as LIT is -concerned, and the implementation is quite similar - to see changes, run: - git diff --no-index simple_pytorch_demo.py simple_tf2_demo.py - -This uses the same underlying model class -(transformers.TFAutoModelForSequenceClassification) as models/glue_models.py, so -you can load from the same weights. To train something for this demo, you can: -- Use quickstart_sst_demo.py, and set --model_path to somewhere durable -- Or: Use tools/glue_trainer.py -- Or: Use any fine-tuning code that works with transformers, such as -https://github.com/huggingface/transformers#quick-tour-of-the-fine-tuningusage-scripts - -To run locally: - python -m lit_nlp.examples.simple_tf2_demo \ - --port=5432 --model_path=/path/to/saved/model - -Then navigate to localhost:5432 to access the demo UI. 
-""" - -from collections.abc import Sequence -import sys -from typing import Optional - -from absl import app -from absl import flags -from absl import logging - -from lit_nlp import dev_server -from lit_nlp import server_flags -from lit_nlp.api import model as lit_model -from lit_nlp.api import types as lit_types -# Use the regular GLUE data loaders, because these are very simple already. -from lit_nlp.examples.datasets import glue -from lit_nlp.lib import file_cache -from lit_nlp.lib import utils - -import tensorflow as tf -import transformers - -# NOTE: additional flags defined in server_flags.py - -FLAGS = flags.FLAGS - -FLAGS.set_default("development_demo", True) - -_MODEL_PATH = flags.DEFINE_string( - "model_path", - "https://storage.googleapis.com/what-if-tool-resources/lit-models/sst2_tiny.tar.gz", - "Path to trained model, in standard transformers format, e.g. as " - "saved by model.save_pretrained() and tokenizer.save_pretrained()" -) - -TFSequenceClassifierOutput = ( - transformers.modeling_tf_outputs.TFSequenceClassifierOutput -) - - -def _from_pretrained(cls, *args, **kw): - """Load a transformers model in TF2, with fallback to PyTorch weights.""" - try: - return cls.from_pretrained(*args, **kw) - except OSError as e: - logging.warning("Caught OSError loading model: %s", e) - logging.warning( - "Re-trying to convert from PyTorch checkpoint (from_pt=True)") - return cls.from_pretrained(*args, from_pt=True, **kw) - - -class SimpleSentimentModel(lit_model.BatchedModel): - """Simple sentiment analysis model.""" - - LABELS = ["0", "1"] # negative, positive - - def __init__(self, model_name_or_path): - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_name_or_path) - model_config = transformers.AutoConfig.from_pretrained( - model_name_or_path, - num_labels=2, - output_hidden_states=True, - output_attentions=True, - ) - # This is a just a regular Keras model. - self.model = _from_pretrained( - transformers.TFAutoModelForSequenceClassification, - model_name_or_path, - config=model_config) - - ## - # LIT API implementation - def max_minibatch_size(self): - # This tells lit_model.BatchedModel.predict() how to batch inputs to - # predict_minibatch(). - # Alternately, you can just override predict() and handle batching yourself. - return 32 - - def predict_minibatch(self, inputs): - # Preprocess to ids and masks, and make the input batch. - encoded_input = self.tokenizer.batch_encode_plus( - [ex["sentence"] for ex in inputs], - return_tensors="tf", - add_special_tokens=True, - max_length=128, - padding="longest", - truncation="longest_first") - - # Run a forward pass. - out: TFSequenceClassifierOutput = self.model(encoded_input, training=False) - - # Post-process outputs. - batched_outputs = { - "probas": tf.nn.softmax(out.logits, axis=-1), - "input_ids": encoded_input["input_ids"], - "ntok": tf.reduce_sum(encoded_input["attention_mask"], axis=1), - "cls_emb": out.hidden_states[-1][:, 0], # last layer, first token - } - # Return as NumPy for further processing. - detached_outputs = {k: v.numpy() for k, v in batched_outputs.items()} - # Unbatch outputs so we get one record per input example. 
- for output in utils.unbatch_preds(detached_outputs): - ntok = output.pop("ntok") - output["tokens"] = self.tokenizer.convert_ids_to_tokens( - output.pop("input_ids")[1:ntok - 1]) - yield output - - def input_spec(self) -> lit_types.Spec: - return { - "sentence": lit_types.TextSegment(), - "label": lit_types.CategoryLabel(vocab=self.LABELS, required=False) - } - - def output_spec(self) -> lit_types.Spec: - return { - "tokens": lit_types.Tokens(), - "probas": lit_types.MulticlassPreds(parent="label", vocab=self.LABELS, - null_idx=0), - "cls_emb": lit_types.Embeddings() - } - - -def get_wsgi_app() -> Optional[dev_server.LitServerType]: - """Returns a LitApp instance for consumption by gunicorn.""" - FLAGS.set_default("server_type", "external") - FLAGS.set_default("demo_mode", True) - # Parse flags without calling app.run(main), to avoid conflict with - # gunicorn command line flags. - unused = flags.FLAGS(sys.argv, known_only=True) - if unused: - logging.info("simplet_tf2_demo:get_wsgi_app() called with unused args: %s", - unused) - return main([]) - - -def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - # Normally path is a directory; if it's an archive file, download and - # extract to the transformers cache. - model_path = _MODEL_PATH.value - if model_path.endswith(".tar.gz"): - model_path = file_cache.cached_path( - model_path, extract_compressed_file=True) - - # Load the model we defined above. - models = {"sst": SimpleSentimentModel(model_path)} - # Load SST-2 validation set from TFDS. - datasets = {"sst_dev": glue.SST2Data("validation")} - - # Start the LIT server. See server_flags.py for server options. - lit_demo = dev_server.Server(models, datasets, **server_flags.get_flags()) - return lit_demo.serve() - - -if __name__ == "__main__": - app.run(main) diff --git a/lit_nlp/examples/sst_pytorch_demo.py b/lit_nlp/examples/sst_pytorch_demo.py deleted file mode 100644 index dede8a61..00000000 --- a/lit_nlp/examples/sst_pytorch_demo.py +++ /dev/null @@ -1,237 +0,0 @@ -r"""Code example for a custom model, using PyTorch. - -This demo shows how to use a custom model with LIT, in just a few lines of code. -We'll use a transformers model, with a minimal amount of code to implement the -LIT API. Compared to models/glue_models.py, this has fewer features, but the -code is more readable. -This demo is similar in functionality to simple_tf2_demo.py, but uses PyTorch -instead of TensorFlow 2. -The transformers library can load weights from either, -so you can use any saved model compatible with the underlying model class -(AutoModelForSequenceClassification). To train something for this demo, you can: -- Use quickstart_sst_demo.py, and set --model_path to somewhere durable -- Or: Use tools/glue_trainer.py -- Or: Use any fine-tuning code that works with transformers, such as -https://github.com/huggingface/transformers#quick-tour-of-the-fine-tuningusage-scripts -To run locally: - python -m lit_nlp.examples.simple_pytorch_demo \ - --port=5432 --model_path=/path/to/saved/model -Then navigate to localhost:5432 to access the demo UI. -NOTE: this demo still uses TensorFlow Datasets (which depends on TensorFlow) to -load the data. However, the output of glue.SST2Data is just NumPy arrays and -plain Python data, and you can easily replace this with a different library or -directly loading from CSV. 
-""" - -from collections.abc import Sequence -import re -import sys -from typing import Optional - -from absl import app -from absl import flags -from absl import logging - -from lit_nlp import dev_server -from lit_nlp import server_flags -from lit_nlp.api import model as lit_model -from lit_nlp.api import types as lit_types -from lit_nlp.examples.datasets import glue -from lit_nlp.lib import file_cache -from lit_nlp.lib import utils -import torch -import transformers - -# NOTE: additional flags defined in server_flags.py - -FLAGS = flags.FLAGS - -FLAGS.set_default("development_demo", True) - -_MODEL_PATH = flags.DEFINE_string( - "model_path", None, - "Path to trained model, in standard transformers format, e.g. as " - "saved by model.save_pretrained() and tokenizer.save_pretrained()" -) - -SequenceClassifierOutput = ( - transformers.modeling_outputs.SequenceClassifierOutput -) - - -def _from_pretrained(cls, *args, **kw): - """Load a transformers model in PyTorch, with fallback to TF2/Keras weights.""" - try: - return cls.from_pretrained(*args, **kw) - except OSError as e: - logging.warning("Caught OSError loading model: %s", e) - logging.warning( - "Re-trying to convert from TensorFlow checkpoint (from_tf=True)") - return cls.from_pretrained(*args, from_tf=True, **kw) - - -class SimpleSentimentModel(lit_model.BatchedModel): - """Simple sentiment analysis model.""" - - LABELS = ["0", "1"] # negative, positive - compute_grads: bool = True # if True, compute and return gradients. - - def __init__(self, model_name_or_path): - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - model_name_or_path) - model_config = transformers.AutoConfig.from_pretrained( - model_name_or_path, - num_labels=2, - output_hidden_states=True, - output_attentions=True, - ) - # This is a just a regular PyTorch model. - self.model = _from_pretrained( - transformers.AutoModelForSequenceClassification, - model_name_or_path, - config=model_config) - self.model.eval() - - ## - # LIT API implementation - def max_minibatch_size(self): - # This tells lit_model.BatchedModel.predict() how to batch inputs to - # predict_minibatch(). - # Alternately, you can just override predict() and handle batching yourself. - return 32 - - def predict_minibatch(self, inputs): - - # Preprocess to ids and masks, and make the input batch. - encoded_input = self.tokenizer.batch_encode_plus( - [ex["sentence"] for ex in inputs], - return_tensors="pt", - add_special_tokens=True, - max_length=128, - padding="longest", - truncation="longest_first") - - # Check and send to cuda (GPU) if available - if torch.cuda.is_available(): - self.model.cuda() - for tensor in encoded_input: - encoded_input[tensor] = encoded_input[tensor].cuda() - - # Run a forward pass. - with torch.set_grad_enabled(self.compute_grads): - out: SequenceClassifierOutput = self.model(**encoded_input) - - # Post-process outputs. - batched_outputs = { - "probas": torch.nn.functional.softmax(out.logits, dim=-1), - "input_ids": encoded_input["input_ids"], - "ntok": torch.sum(encoded_input["attention_mask"], dim=1), - "cls_emb": out.hidden_states[-1][:, 0], # last layer, first token - } - - # Add attention layers to batched_outputs - assert len(out.attentions) == self.model.config.num_hidden_layers - for i, layer_attention in enumerate(out.attentions): - batched_outputs[f"layer_{i}/attention"] = layer_attention - - # Request gradients after the forward pass. - # Note: hidden_states[0] includes position and segment encodings, as well as - # subword embeddings. 
- if self.compute_grads: - # [batch_size, num_tokens, emb_dim] - scalar_pred_for_gradients = torch.max( - batched_outputs["probas"], dim=1, keepdim=False, out=None)[0] - batched_outputs["input_emb_grad"] = torch.autograd.grad( - scalar_pred_for_gradients, - out.hidden_states[0], - grad_outputs=torch.ones_like(scalar_pred_for_gradients))[0] - - # Post-process outputs. - # Return as NumPy for further processing. - detached_outputs = { - k: v.cpu().detach().numpy() for k, v in batched_outputs.items()} - - # Unbatch outputs so we get one record per input example. - for output in utils.unbatch_preds(detached_outputs): - ntok = output.pop("ntok") - output["tokens"] = self.tokenizer.convert_ids_to_tokens( - output.pop("input_ids")[:ntok]) - - # set token gradients - if self.compute_grads: - output["token_grad_sentence"] = output["input_emb_grad"][:ntok] - - # Process attention. - for key in output: - if not re.match(r"layer_(\d+)/attention", key): - continue - # Select only real tokens, since most of this matrix is padding. - # [num_heads, max_seq_length, max_seq_length] - # -> [num_heads, num_tokens, num_tokens] - output[key] = output[key][:, :ntok, :ntok].transpose((0, 2, 1)) - # Make a copy of this array to avoid memory leaks, since NumPy otherwise - # keeps a pointer around that prevents the source array from being GCed. - output[key] = output[key].copy() - yield output - - def input_spec(self) -> lit_types.Spec: - return { - "sentence": lit_types.TextSegment(), - "label": lit_types.CategoryLabel(vocab=self.LABELS, required=False) - } - - def output_spec(self) -> lit_types.Spec: - ret = { - "tokens": lit_types.Tokens(), - "probas": lit_types.MulticlassPreds(parent="label", vocab=self.LABELS, - null_idx=0), - "cls_emb": lit_types.Embeddings() - } - # Gradients, if requested. - if self.compute_grads: - ret["token_grad_sentence"] = lit_types.TokenGradients( - align="tokens") - - # Attention heads, one field for each layer. - for i in range(self.model.config.num_hidden_layers): - ret[f"layer_{i}/attention"] = lit_types.AttentionHeads( - align_in="tokens", align_out="tokens") - return ret - - -def get_wsgi_app() -> Optional[dev_server.LitServerType]: - """Returns a LitApp instance for consumption by gunicorn.""" - FLAGS.set_default("server_type", "external") - FLAGS.set_default("demo_mode", True) - # Parse flags without calling app.run(main), to avoid conflict with - # gunicorn command line flags. - unused = flags.FLAGS(sys.argv, known_only=True) - if unused: - logging.info("sst_pytorch_demo:get_wsgi_app() called with unused args: %s", - unused) - return main([]) - - -def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - # Normally path is a directory; if it's an archive file, download and - # extract to the transformers cache. - model_path = _MODEL_PATH.value - if model_path.endswith(".tar.gz"): - model_path = file_cache.cached_path( - model_path, extract_compressed_file=True) - - # Load the model we defined above. - models = {"sst": SimpleSentimentModel(model_path)} - # Load SST-2 validation set from TFDS. - datasets = {"sst_dev": glue.SST2Data("validation")} - - # Start the LIT server. See server_flags.py for server options. 
- lit_demo = dev_server.Server(models, datasets, **server_flags.get_flags()) - return lit_demo.serve() - - -if __name__ == "__main__": - app.run(main) diff --git a/lit_nlp/examples/tools/glue_trainer.py b/lit_nlp/examples/tools/glue_trainer.py index e217f8fb..6e57af58 100644 --- a/lit_nlp/examples/tools/glue_trainer.py +++ b/lit_nlp/examples/tools/glue_trainer.py @@ -25,8 +25,8 @@ from absl import flags from absl import logging -from lit_nlp.examples.datasets import glue -from lit_nlp.examples.models import glue_models +from lit_nlp.examples.glue import data as glue_data +from lit_nlp.examples.glue import models as glue_models from lit_nlp.lib import serialize import tensorflow as tf @@ -118,16 +118,16 @@ def main(argv: Sequence[str]) -> None: # TODO(lit-dev): add remaining GLUE tasks? These three cover all the major # features (single segment, two segment, classification, regression). if _TASK.value == "sst2": - train_data = glue.SST2Data("train") - val_data = glue.SST2Data("validation") + train_data = glue_data.SST2Data("train") + val_data = glue_data.SST2Data("validation") model = glue_models.SST2Model(_ENCODER_NAME.value) elif _TASK.value == "mnli": - train_data = glue.MNLIData("train") - val_data = glue.MNLIData("validation_matched") + train_data = glue_data.MNLIData("train") + val_data = glue_data.MNLIData("validation_matched") model = glue_models.MNLIModel(_ENCODER_NAME.value) elif _TASK.value == "stsb": - train_data = glue.STSBData("train") - val_data = glue.STSBData("validation") + train_data = glue_data.STSBData("train") + val_data = glue_data.STSBData("validation") model = glue_models.STSBModel(_ENCODER_NAME.value) else: raise ValueError(f"Unrecognized task name: '{_TASK.value:s}'") diff --git a/pyproject.toml b/pyproject.toml index 97e19939..13a7a967 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -103,3 +103,6 @@ test = [ [tool.setuptools.packages.find] include = ["lit_nlp*"] exclude = ["*_test"] + +[tool.pytest.ini_options] +addopts = "--import-mode=importlib" diff --git a/website/sphinx_src/components.md b/website/sphinx_src/components.md index 1e6f3410..e4ef2044 100644 --- a/website/sphinx_src/components.md +++ b/website/sphinx_src/components.md @@ -106,8 +106,6 @@ LIT supports many features for classification tasks, including common metrics, confusion matrices, and custom thresholding via the UI. Classification is implemented with the `MulticlassPreds` and `CategoryLabel` types. -* For a basic example on a binary sentiment task, see - [examples/simple_tf2_demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/simple_tf2_demo.py). * Models should define a `MulticlassPreds` field in their output spec with the `vocab=` attribute as the set of class labels, and for each example should return a vector of probabilities for each class. @@ -138,7 +136,7 @@ faceting, and scatterplots of scalar output. Regression is implemented with the which contains numerical targets, and the model's `RegressionScore` field should set `parent=` to the name of this field. * For an example, see the STS-B textual similarity task in - [examples/glue_demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue_demo.py). + [examples/glue/demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue/demo.py). ### Multi-label classification @@ -210,7 +208,7 @@ and otherwise to different parts of the input. [Dataset class documentation](./api.md#datasets) and the corresponding [Model](./api.md#models). 
 * For a more involved code example including per-token gradients, see
-  [examples/glue_demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue_demo.py).
+  [examples/glue/demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue/demo.py).
 
 ### Tabular data
 
@@ -358,7 +356,7 @@ An example spec would look like:
 
 For a more concrete example that also supports multiple segments with separate
 gradients, see our
-[BERT classifier demo model](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/models/glue_models.py),
+[BERT classifier demo model](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue/models.py),
 or contact the LIT team for assistance.
 
 ### LIME
diff --git a/website/sphinx_src/docker.md b/website/sphinx_src/docker.md
index dae9c90b..f83a6f14 100644
--- a/website/sphinx_src/docker.md
+++ b/website/sphinx_src/docker.md
@@ -1,6 +1,6 @@
 # Running LIT in a Docker container
 
-
+
 
 Users might want to deploy LIT onto servers for public-facing, long-running
 instances. This is how we host the LIT demos found on
@@ -23,12 +23,14 @@ the WSGI app to serve. The options provided to gunicorn for our use-case can be
 found in
 [`gunicorn_config.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/gunicorn_config.py).
 You can find a reference implementation in
-[`glue_demo.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue_demo.py) or
+[`glue/demo.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue/demo.py) or
 [`lm_demo.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/lm_demo.py).
 
-Use the following shell commands to build the default Docker image for LIT from
-the provided `Dockerfile`, and then run a container from that image. Comments
-are provided in-line to help explain what each step does.
+Use the following shell
+commands to build the
+default Docker image for LIT from the provided `Dockerfile`, and then run a
+container from that image. Comments are provided in-line to help explain what
+each step does.
 
 ```shell
 # Build the docker image using the -t argument to name the image. Remember to
diff --git a/website/sphinx_src/faq.md b/website/sphinx_src/faq.md
index 395dd90d..7ebc1ae1 100644
--- a/website/sphinx_src/faq.md
+++ b/website/sphinx_src/faq.md
@@ -1,6 +1,6 @@
 # Frequently Asked Questions
 
-
+
 
 
 
@@ -171,7 +171,5 @@ official training APIs. However, to facilitate code reuse you can easily add
 training methods to your model class. In fact, several of our demos do exactly
 this, using LIT's `Dataset` objects to manage training data along with standard
 training APIs (such as Keras' `model.fit()`). See
-[`quickstart_sst_demo.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/quickstart_sst_demo.py)
-and/or
-[`glue_models.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/models/glue_models.py)
+[`glue/models.py`](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/glue/models.py)
 for examples.
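
Note: the sketch below is not part of the diff; it is a minimal illustration of how downstream code would wire up the relocated GLUE dataset and model classes after this change, following the same server pattern as the deleted simple demos and the updated `glue_trainer.py`. The `"bert-base-uncased"` encoder name is illustrative only and is not taken from this change.

```python
# Minimal sketch of the post-rename import paths; assumes the lit_nlp package
# from this change is installed. The encoder checkpoint name is illustrative.
from collections.abc import Sequence
from typing import Optional

from absl import app

from lit_nlp import dev_server
from lit_nlp import server_flags
# Previously: lit_nlp.examples.datasets.glue and lit_nlp.examples.models.glue_models
from lit_nlp.examples.glue import data as glue_data
from lit_nlp.examples.glue import models as glue_models


def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]:
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  # Load an SST-2 model and the SST-2 validation split, as in the demos above.
  models = {"sst": glue_models.SST2Model("bert-base-uncased")}  # illustrative path
  datasets = {"sst_dev": glue_data.SST2Data("validation")}

  # Start the LIT server. See server_flags.py for server options.
  lit_demo = dev_server.Server(models, datasets, **server_flags.get_flags())
  return lit_demo.serve()


if __name__ == "__main__":
  app.run(main)
```

Pointing `SST2Model` at a fine-tuned SST-2 checkpoint (for example, one produced by `glue_trainer.py`) rather than a base encoder yields meaningful predictions in the UI.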