diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 1f060178..612cb219 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -46,7 +46,21 @@ jobs:
       - run: mix deps.compile
       - run: mix compile --warnings-as-errors
         if: ${{ matrix.lint }}
+      - name: Restore bumblebee cache
+        id: cache-bumblebee-restore
+        uses: actions/cache/restore@v3
+        with:
+          path: bumblebee_cache
+          key: ${{ runner.os }}-bumblebee-cache-${{ matrix.pair.elixir }}-${{ matrix.pair.otp }}
       - run: mix test
+        env:
+          BUMBLEBEE_CACHE_DIR: ${{ github.workspace }}/bumblebee_cache
+      - name: Save bumblebee cache
+        id: cache-bumblebee-save
+        uses: actions/cache/save@v3
+        with:
+          path: bumblebee_cache
+          key: ${{ steps.cache-bumblebee-restore.outputs.cache-primary-key }}
       - uses: technote-space/get-diff-action@v6
         with:
           PATTERNS: test/**/*_test.exs
diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex
index 136b2892..03d28ed1 100644
--- a/lib/bumblebee.ex
+++ b/lib/bumblebee.ex
@@ -124,10 +124,11 @@ defmodule Bumblebee do
       {Bumblebee.Text.Distilbert, :for_sequence_classification},
     "DistilBertForQuestionAnswering" => {Bumblebee.Text.Distilbert, :for_question_answering},
     "DistilBertForTokenClassification" => {Bumblebee.Text.Distilbert, :for_token_classification},
+    "DistilBertForMultipleChoice" => {Bumblebee.Text.Distilbert, :for_multiple_choice},
     "GPT2ForSequenceClassification" => {Bumblebee.Text.Gpt2, :for_sequence_classification},
     "GPT2ForTokenClassification" => {Bumblebee.Text.Gpt2, :for_token_classification},
     "GPT2LMHeadModel" => {Bumblebee.Text.Gpt2, :for_causal_language_modeling},
-    "GPT2Model" => {BumbleBee.Text.Gpt2, :base},
+    "GPT2Model" => {Bumblebee.Text.Gpt2, :base},
     "GPTBigCodeModel" => {Bumblebee.Text.GptBigCode, :base},
     "GPTBigCodeForCausalLM" => {Bumblebee.Text.GptBigCode, :for_causal_language_modeling},
     "GPTBigCodeForSequenceClassification" =>
@@ -137,8 +138,7 @@ defmodule Bumblebee do
     "GPTNeoXForCausalLM" => {Bumblebee.Text.GptNeoX, :for_causal_language_modeling},
     "GPTNeoXForSequenceClassification" => {Bumblebee.Text.GptNeoX, :for_sequence_classification},
     "GPTNeoXForTokenClassification" => {Bumblebee.Text.GptNeoX, :for_token_classification},
-    "LayoutLMForMaskedLanguageModeling" =>
-      {Bumblebee.Multimodal.LayoutLm, :for_masked_language_modeling},
+    "LayoutLMForMaskedLM" => {Bumblebee.Multimodal.LayoutLm, :for_masked_language_modeling},
     "LayoutLMForQuestionAnswering" => {Bumblebee.Multimodal.LayoutLm, :for_question_answering},
     "LayoutLMForSequenceClassification" =>
       {Bumblebee.Multimodal.LayoutLm, :for_sequence_classification},
diff --git a/lib/bumblebee/huggingface/hub.ex b/lib/bumblebee/huggingface/hub.ex
index 85028dec..bd2ce305 100644
--- a/lib/bumblebee/huggingface/hub.ex
+++ b/lib/bumblebee/huggingface/hub.ex
@@ -51,7 +51,7 @@ defmodule Bumblebee.HuggingFace.Hub do
   @spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()}
   def cached_download(url, opts \\ []) do
     cache_dir = opts[:cache_dir] || Bumblebee.cache_dir()
-    offline = opts[:offline] || bumblebee_offline?()
+    offline = Keyword.get(opts, :offline, bumblebee_offline?())
     auth_token = opts[:auth_token]
 
     dir = Path.join(cache_dir, "huggingface")
diff --git a/lib/bumblebee/multimodal/layout_lm.ex b/lib/bumblebee/multimodal/layout_lm.ex
index 0fea9dba..7a620238 100644
--- a/lib/bumblebee/multimodal/layout_lm.ex
+++ b/lib/bumblebee/multimodal/layout_lm.ex
@@ -255,12 +255,7 @@ defmodule Bumblebee.Multimodal.LayoutLm do
     outputs = core(inputs, spec)
 
     logits =
-      outputs.hidden_state
-      |> Axon.dropout(
-        rate: classifier_dropout_rate(spec),
-        name: "question_answering_head.dropout"
-      )
-      |> Axon.dense(2,
+      Axon.dense(outputs.hidden_state, 2,
         kernel_initializer: kernel_initializer(spec),
         name: "question_answering_head.output"
       )
@@ -542,7 +537,7 @@ defmodule Bumblebee.Multimodal.LayoutLm do
         "language_modeling_head.norm" => "cls.predictions.transform.LayerNorm",
         "language_modeling_head.output" => "cls.predictions.decoder",
         "language_modeling_head.bias" => "cls.predictions",
-        "sequence_classification_head.output" => "cls.seq_relationship",
+        "sequence_classification_head.output" => "classifier",
         "token_classification_head.output" => "classifier",
         "multiple_choice_head.output" => "classifier",
         "question_answering_head.output" => "qa_outputs"
diff --git a/lib/bumblebee/text/bart.ex b/lib/bumblebee/text/bart.ex
index 9e48a62f..b8188147 100644
--- a/lib/bumblebee/text/bart.ex
+++ b/lib/bumblebee/text/bart.ex
@@ -652,7 +652,7 @@ defmodule Bumblebee.Text.Bart do
   end
 
   defimpl Bumblebee.HuggingFace.Transformers.Model do
-    def params_mapping(_spec) do
+    def params_mapping(spec) do
       %{
         "encoder_embedder.token_embedding" => "model.encoder.embed_tokens",
         "encoder_embedder.position_embedding" => "model.encoder.embed_positions",
@@ -690,7 +690,11 @@ defmodule Bumblebee.Text.Bart do
         "decoder.blocks.{n}.ffn.intermediate" => "model.decoder.layers.{n}.fc1",
         "decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2",
         "decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm",
-        "language_modeling_head.output" => "model.shared",
+        "language_modeling_head.output" =>
+          case spec.architecture do
+            :for_causal_language_modeling -> "lm_head"
+            _other -> "model.shared"
+          end,
         "language_modeling_head.logits_bias" => %{
           "bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end}
         },
diff --git a/lib/bumblebee/text/bert.ex b/lib/bumblebee/text/bert.ex
index de0baf43..b8efeb41 100644
--- a/lib/bumblebee/text/bert.ex
+++ b/lib/bumblebee/text/bert.ex
@@ -272,12 +272,7 @@ defmodule Bumblebee.Text.Bert do
     outputs = core(inputs, spec)
 
     logits =
-      outputs.hidden_state
-      |> Axon.dropout(
-        rate: classifier_dropout_rate(spec),
-        name: "question_answering_head.dropout"
-      )
-      |> Axon.dense(2,
+      Axon.dense(outputs.hidden_state, 2,
         kernel_initializer: kernel_initializer(spec),
         name: "question_answering_head.output"
       )
diff --git a/lib/bumblebee/text/blip_text.ex b/lib/bumblebee/text/blip_text.ex
index e145daca..ba8de9ff 100644
--- a/lib/bumblebee/text/blip_text.ex
+++ b/lib/bumblebee/text/blip_text.ex
@@ -387,8 +387,8 @@ defmodule Bumblebee.Text.BlipText do
   end
 
   defimpl Bumblebee.HuggingFace.Transformers.Config do
-    # Support loading from the entire Clip configuration
-    def load(spec, %{"model_type" => "clip", "text_config" => data}) do
+    # Support loading from the entire Blip configuration
+    def load(spec, %{"model_type" => "blip", "text_config" => data}) do
       load(spec, data)
     end
 
@@ -419,7 +419,7 @@ defmodule Bumblebee.Text.BlipText do
     def params_mapping(spec) do
       prefix =
         case spec.architecture do
-          :base -> "text_encoder."
+          :base -> "text_model."
           :for_causal_language_modeling -> "text_decoder.bert."
         end
 
diff --git a/lib/bumblebee/text/distilbert.ex b/lib/bumblebee/text/distilbert.ex
index 918d1e98..fffd586a 100644
--- a/lib/bumblebee/text/distilbert.ex
+++ b/lib/bumblebee/text/distilbert.ex
@@ -52,10 +52,6 @@ defmodule Bumblebee.Text.Distilbert do
         doc:
           "the dropout rate for the classification head. If not specified, the value of `:dropout_rate` is used instead"
       ],
-      layer_norm_epsilon: [
-        default: 1.0e-12,
-        doc: "the epsilon used by the layer normalization layers"
-      ],
       initializer_scale: [
         default: 0.02,
         doc:
@@ -361,7 +357,7 @@ defmodule Bumblebee.Text.Distilbert do
       )
 
     Axon.add([inputs_embeddings, position_embeddings])
-    |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
+    |> Axon.layer_norm(epsilon: 1.0e-12, name: join(name, "norm"))
     |> Axon.dropout(rate: spec.dropout_rate, name: join(name, "dropout"))
   end
 
@@ -385,7 +381,7 @@ defmodule Bumblebee.Text.Distilbert do
       dropout_rate: spec.dropout_rate,
       attention_dropout_rate: spec.attention_dropout_rate,
       layer_norm: [
-        epsilon: spec.layer_norm_epsilon
+        epsilon: 1.0e-12
       ],
       ffn: [
         intermediate_size: spec.intermediate_size,
@@ -421,7 +417,7 @@ defmodule Bumblebee.Text.Distilbert do
       name: join(name, "dense")
     )
     |> Layers.activation(spec.activation, name: join(name, "activation"))
-    |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
+    |> Axon.layer_norm(epsilon: 1.0e-12, name: join(name, "norm"))
     # We reuse the kernel of input embeddings and add bias for each token
     |> Layers.dense_transposed(spec.vocab_size,
       kernel_initializer: kernel_initializer(spec),
@@ -446,15 +442,14 @@ defmodule Bumblebee.Text.Distilbert do
         convert!(data,
           vocab_size: {"vocab_size", number()},
           max_positions: {"max_position_embeddings", number()},
-          hidden_size: {"hidden_size", number()},
-          num_blocks: {"num_hidden_layers", number()},
-          num_attention_heads: {"num_attention_heads", number()},
-          intermediate_size: {"intermediate_size", number()},
-          activation: {"hidden_act", activation()},
-          dropout_rate: {"hidden_dropout_prob", number()},
-          attention_dropout_rate: {"attention_probs_dropout_prob", number()},
-          classifier_dropout_rate: {"classifier_dropout", optional(number())},
-          layer_norm_epsilon: {"layer_norm_eps", number()},
+          hidden_size: {"dim", number()},
+          num_blocks: {"n_layers", number()},
+          num_attention_heads: {"n_heads", number()},
+          intermediate_size: {"hidden_dim", number()},
+          activation: {"activation", activation()},
+          dropout_rate: {"dropout", number()},
+          attention_dropout_rate: {"attention_dropout", number()},
+          classifier_dropout_rate: {"seq_classif_dropout", optional(number())},
           initializer_scale: {"initializer_range", number()}
         ) ++ Shared.common_options_from_transformers(data, spec)
 
diff --git a/lib/bumblebee/text/llama.ex b/lib/bumblebee/text/llama.ex
index f82bfc57..0ac663ce 100644
--- a/lib/bumblebee/text/llama.ex
+++ b/lib/bumblebee/text/llama.ex
@@ -209,7 +209,8 @@ defmodule Bumblebee.Text.Llama do
     logits =
       Axon.dense(outputs.hidden_state, spec.num_labels,
         kernel_initializer: kernel_initializer(spec),
-        name: "sequence_classification_head.output"
+        name: "sequence_classification_head.output",
+        use_bias: false
       )
 
     pooled_logits =
diff --git a/lib/bumblebee/text/mbart.ex b/lib/bumblebee/text/mbart.ex
index 56797b62..bc0b2c6e 100644
--- a/lib/bumblebee/text/mbart.ex
+++ b/lib/bumblebee/text/mbart.ex
@@ -690,7 +690,7 @@ defmodule Bumblebee.Text.Mbart do
   end
 
   defimpl Bumblebee.HuggingFace.Transformers.Model do
-    def params_mapping(_spec) do
+    def params_mapping(spec) do
       %{
         "encoder_embedder.token_embedding" => "model.encoder.embed_tokens",
         "encoder_embedder.position_embedding" => "model.encoder.embed_positions",
@@ -730,7 +730,11 @@ defmodule Bumblebee.Text.Mbart do
         "decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2",
         "decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm",
         "decoder.norm" => "model.decoder.layer_norm",
-        "language_modeling_head.output" => "model.shared",
+        "language_modeling_head.output" =>
+          case spec.architecture do
+            :for_causal_language_modeling -> "lm_head"
+            _other -> "model.shared"
+          end,
         "language_modeling_head.logits_bias" => %{
           "bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end}
         },
diff --git a/lib/bumblebee/text/mistral.ex b/lib/bumblebee/text/mistral.ex
index 4c580d19..bea90bff 100644
--- a/lib/bumblebee/text/mistral.ex
+++ b/lib/bumblebee/text/mistral.ex
@@ -202,7 +202,8 @@ defmodule Bumblebee.Text.Mistral do
     logits =
       Axon.dense(outputs.hidden_state, spec.num_labels,
         kernel_initializer: kernel_initializer(spec),
-        name: "sequence_classification_head.output"
+        name: "sequence_classification_head.output",
+        use_bias: false
       )
 
     pooled_logits =
diff --git a/lib/bumblebee/text/roberta.ex b/lib/bumblebee/text/roberta.ex
index c5b91d85..2ec209ff 100644
--- a/lib/bumblebee/text/roberta.ex
+++ b/lib/bumblebee/text/roberta.ex
@@ -262,12 +262,7 @@ defmodule Bumblebee.Text.Roberta do
     outputs = core(inputs, spec)
 
     logits =
-      outputs.hidden_state
-      |> Axon.dropout(
-        rate: classifier_dropout_rate(spec),
-        name: "question_answering_head.dropout"
-      )
-      |> Axon.dense(2,
+      Axon.dense(outputs.hidden_state, 2,
         kernel_initializer: kernel_initializer(spec),
         name: "question_answering_head.output"
       )
diff --git a/lib/bumblebee/text/t5.ex b/lib/bumblebee/text/t5.ex
index f2741cbd..dba28f81 100644
--- a/lib/bumblebee/text/t5.ex
+++ b/lib/bumblebee/text/t5.ex
@@ -580,9 +580,8 @@ defmodule Bumblebee.Text.T5 do
   defimpl Bumblebee.HuggingFace.Transformers.Model do
     def params_mapping(spec) do
       %{
-        # encoder
-        "encoder_embedder.token_embedding" =>
-          if(spec.tie_word_embeddings, do: "shared", else: "encoder.embed_tokens"),
+        # Encoder and decoder embeddings are always shared
+        "encoder_embedder.token_embedding" => "shared",
         "encoder.blocks.{n}.self_attention_norm" => "encoder.block.{n}.layer.0.layer_norm",
         "encoder.blocks.{n}.self_attention.query" => "encoder.block.{n}.layer.0.SelfAttention.q",
         "encoder.blocks.{n}.self_attention.key" => "encoder.block.{n}.layer.0.SelfAttention.k",
@@ -599,9 +598,7 @@ defmodule Bumblebee.Text.T5 do
           ),
         "encoder.blocks.{n}.ffn.output" => "encoder.block.{n}.layer.1.DenseReluDense.wo",
         "encoder.output_norm" => "encoder.final_layer_norm",
-        # decoder
-        "decoder_embedder.token_embedding" =>
-          if(spec.tie_word_embeddings, do: "shared", else: "decoder.embed_tokens"),
+        "decoder_embedder.token_embedding" => "shared",
         "decoder.blocks.{n}.self_attention_norm" => "decoder.block.{n}.layer.0.layer_norm",
         "decoder.blocks.{n}.self_attention.query" => "decoder.block.{n}.layer.0.SelfAttention.q",
         "decoder.blocks.{n}.self_attention.key" => "decoder.block.{n}.layer.0.SelfAttention.k",
@@ -626,7 +623,6 @@ defmodule Bumblebee.Text.T5 do
           ),
         "decoder.blocks.{n}.ffn.output" => "decoder.block.{n}.layer.2.DenseReluDense.wo",
         "decoder.output_norm" => "decoder.final_layer_norm",
-        # language modeling
         "language_modeling_head.output" =>
           if(spec.tie_word_embeddings, do: "shared", else: "lm_head")
       }
diff --git a/lib/bumblebee/vision/resnet.ex b/lib/bumblebee/vision/resnet.ex
index 7a871c6f..ececad87 100644
--- a/lib/bumblebee/vision/resnet.ex
+++ b/lib/bumblebee/vision/resnet.ex
@@ -196,7 +196,7 @@ defmodule Bumblebee.Vision.ResNet do
         name: join(name, "blocks.0")
       )
 
-    for idx <- 1..(depth - 1), reduce: hidden_state do
+    for idx <- 1..(depth - 1)//1, reduce: hidden_state do
       hidden_state ->
         residual_block.(hidden_state, out_channels, out_channels,
           activation: spec.activation,
diff --git a/test/bumblebee/audio/speech_to_text_whisper_test.exs b/test/bumblebee/audio/speech_to_text_whisper_test.exs
index a8c35154..d9630c5a 100644
--- a/test/bumblebee/audio/speech_to_text_whisper_test.exs
+++ b/test/bumblebee/audio/speech_to_text_whisper_test.exs
@@ -3,211 +3,102 @@ defmodule Bumblebee.Audio.SpeechToTextWhisperTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
   @audio_dir Path.expand("../../fixtures/audio", __DIR__)
 
-  describe "integration" do
-    test "generates transcription" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+  test "generates transcription" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
 
-      serving =
-        Bumblebee.Audio.speech_to_text_whisper(
-          model_info,
-          featurizer,
-          tokenizer,
-          generation_config,
-          defn_options: [compiler: EXLA]
-        )
+    serving =
+      Bumblebee.Audio.speech_to_text_whisper(
+        model_info,
+        featurizer,
+        tokenizer,
+        generation_config,
+        defn_options: [compiler: EXLA]
+      )
 
-      audio =
-        Path.join(
-          @audio_dir,
-          "common_voice/a6c7706a220eeea7ee3687c1122fe7ac17962d2449d25b6db37cc41cdaace442683e11945b6f581e73941c3083cd4eecfafc938840459cd8c571dae7774ee687_pcm_f32le_16000.bin"
-        )
-        |> File.read!()
-        |> Nx.from_binary(:f32)
+    audio =
+      Path.join(
+        @audio_dir,
+        "common_voice/a6c7706a220eeea7ee3687c1122fe7ac17962d2449d25b6db37cc41cdaace442683e11945b6f581e73941c3083cd4eecfafc938840459cd8c571dae7774ee687_pcm_f32le_16000.bin"
+      )
+      |> File.read!()
+      |> Nx.from_binary(:f32)
 
-      assert Nx.Serving.run(serving, audio) == %{
-               chunks: [
-                 %{
-                   text: " Tower of strength.",
-                   start_timestamp_seconds: nil,
-                   end_timestamp_seconds: nil
-                 }
-               ]
-             }
-    end
-
-    test "supports compilation" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
-
-      serving =
-        Bumblebee.Audio.speech_to_text_whisper(
-          model_info,
-          featurizer,
-          tokenizer,
-          generation_config,
-          defn_options: [compiler: EXLA],
-          compile: [batch_size: 1]
-        )
-
-      audio =
-        Path.join(
-          @audio_dir,
-          "common_voice/a6c7706a220eeea7ee3687c1122fe7ac17962d2449d25b6db37cc41cdaace442683e11945b6f581e73941c3083cd4eecfafc938840459cd8c571dae7774ee687_pcm_f32le_16000.bin"
-        )
-        |> File.read!()
-        |> Nx.from_binary(:f32)
-
-      assert Nx.Serving.run(serving, audio) == %{
-               chunks: [
-                 %{
-                   text: " Tower of strength.",
-                   start_timestamp_seconds: nil,
-                   end_timestamp_seconds: nil
-                 }
-               ]
-             }
-    end
-
-    test "long-form transcription with chunking" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
-
-      serving =
-        Bumblebee.Audio.speech_to_text_whisper(
-          model_info,
-          featurizer,
-          tokenizer,
-          generation_config,
-          chunk_num_seconds: 30,
-          defn_options: [compiler: EXLA]
-        )
-
-      audio =
-        Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
-        |> File.read!()
-        |> Nx.from_binary(:f32)
-
-      assert Nx.Serving.run(serving, audio) == %{
-               chunks: [
-                 %{
-                   text:
-                     " An awakening from the book of Irish poetry part 1, read for LibriVox.org by Sonja. An awakening by Alice Pirlong. O spring will wake in the heart of me with the rapture of blown violets, when the green bud quickens on every tree to spring will wake in the heart of me, and queues of honey",
-                   start_timestamp_seconds: nil,
-                   end_timestamp_seconds: nil
-                 },
-                 %{
-                   text:
-                     " will reign on the lee, tangling the grasses in silver nets. Yes, spring will awaken the heart of me with the rapture of blown violets. End of an awakening, this recording is in the public domain.",
-                   start_timestamp_seconds: nil,
-                   end_timestamp_seconds: nil
-                 }
-               ]
-             }
-    end
-
-    test "long-form transcription with timestamps" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+    assert Nx.Serving.run(serving, audio) == %{
+             chunks: [
+               %{
+                 text: " Tower of strength.",
+                 start_timestamp_seconds: nil,
+                 end_timestamp_seconds: nil
+               }
+             ]
+           }
+  end
 
-      serving =
-        Bumblebee.Audio.speech_to_text_whisper(
-          model_info,
-          featurizer,
-          tokenizer,
-          generation_config,
-          chunk_num_seconds: 30,
-          defn_options: [compiler: EXLA],
-          timestamps: :segments
-        )
+  test "supports compilation" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
 
-      audio =
-        Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
-        |> File.read!()
-        |> Nx.from_binary(:f32)
+    serving =
+      Bumblebee.Audio.speech_to_text_whisper(
+        model_info,
+        featurizer,
+        tokenizer,
+        generation_config,
+        defn_options: [compiler: EXLA],
+        compile: [batch_size: 1]
+      )
 
-      assert Nx.Serving.run(serving, audio) == %{
-               chunks: [
-                 %{
-                   text:
-                     " An awakening from the book of Irish poetry part 1, read for LibriVox.org by Sonia.",
-                   start_timestamp_seconds: 0.0,
-                   end_timestamp_seconds: 7.0
-                 },
-                 %{
-                   text: " An awakening by Alice Pirlong.",
-                   start_timestamp_seconds: 7.0,
-                   end_timestamp_seconds: 11.0
-                 },
-                 %{
-                   text:
-                     " O spring will wake in the heart of me with the rapture of blown violets, when the green bud",
-                   start_timestamp_seconds: 11.0,
-                   end_timestamp_seconds: 18.12
-                 },
-                 %{
-                   text:
-                     " quickens on every tree to spring will wake in the heart of me, and queues of honey will reign on the lee,",
-                   start_timestamp_seconds: 18.12,
-                   end_timestamp_seconds: 25.92
-                 },
-                 %{
-                   text:
-                     " tangling the grasses in silver nets. Yes, spring will awaken the heart of me",
-                   start_timestamp_seconds: 25.92,
-                   end_timestamp_seconds: 32.48
-                 },
-                 %{
-                   text: " with the rapture of blown violets.",
-                   start_timestamp_seconds: 32.48,
-                   end_timestamp_seconds: 34.88
-                 },
-                 %{
-                   text: " End of an awakening, this recording is in the public domain.",
-                   start_timestamp_seconds: 36.96,
-                   end_timestamp_seconds: 40.72
-                 }
-               ]
-             }
-    end
+    audio =
+      Path.join(
+        @audio_dir,
+        "common_voice/a6c7706a220eeea7ee3687c1122fe7ac17962d2449d25b6db37cc41cdaace442683e11945b6f581e73941c3083cd4eecfafc938840459cd8c571dae7774ee687_pcm_f32le_16000.bin"
+      )
+      |> File.read!()
+      |> Nx.from_binary(:f32)
 
-    test "streaming without timestamps" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+    assert Nx.Serving.run(serving, audio) == %{
+             chunks: [
+               %{
+                 text: " Tower of strength.",
+                 start_timestamp_seconds: nil,
+                 end_timestamp_seconds: nil
+               }
+             ]
+           }
+  end
 
-      serving =
-        Bumblebee.Audio.speech_to_text_whisper(
-          model_info,
-          featurizer,
-          tokenizer,
-          generation_config,
-          chunk_num_seconds: 30,
-          defn_options: [compiler: EXLA],
-          stream: true
-        )
+  test "long-form transcription with chunking" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
 
-      audio =
-        Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
-        |> File.read!()
-        |> Nx.from_binary(:f32)
+    serving =
+      Bumblebee.Audio.speech_to_text_whisper(
+        model_info,
+        featurizer,
+        tokenizer,
+        generation_config,
+        chunk_num_seconds: 30,
+        defn_options: [compiler: EXLA]
+      )
 
-      stream = Nx.Serving.run(serving, audio)
+    audio =
+      Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
+      |> File.read!()
+      |> Nx.from_binary(:f32)
 
-      assert Enum.to_list(stream) == [
+    assert Nx.Serving.run(serving, audio) == %{
+             chunks: [
                %{
                  text:
                    " An awakening from the book of Irish poetry part 1, read for LibriVox.org by Sonja. An awakening by Alice Pirlong. O spring will wake in the heart of me with the rapture of blown violets, when the green bud quickens on every tree to spring will wake in the heart of me, and queues of honey",
@@ -221,34 +112,33 @@ defmodule Bumblebee.Audio.SpeechToTextWhisperTest do
                  end_timestamp_seconds: nil
                }
              ]
-    end
-
-    test "streaming with timestamps" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+           }
+  end
 
-      serving =
-        Bumblebee.Audio.speech_to_text_whisper(
-          model_info,
-          featurizer,
-          tokenizer,
-          generation_config,
-          chunk_num_seconds: 30,
-          defn_options: [compiler: EXLA],
-          timestamps: :segments,
-          stream: true
-        )
+  test "long-form transcription with timestamps" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
 
-      audio =
-        Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
-        |> File.read!()
-        |> Nx.from_binary(:f32)
+    serving =
+      Bumblebee.Audio.speech_to_text_whisper(
+        model_info,
+        featurizer,
+        tokenizer,
+        generation_config,
+        chunk_num_seconds: 30,
+        defn_options: [compiler: EXLA],
+        timestamps: :segments
+      )
 
-      stream = Nx.Serving.run(serving, audio)
+    audio =
+      Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
+      |> File.read!()
+      |> Nx.from_binary(:f32)
 
-      assert Enum.to_list(stream) == [
+    assert Nx.Serving.run(serving, audio) == %{
+             chunks: [
                %{
                  text:
                    " An awakening from the book of Irish poetry part 1, read for LibriVox.org by Sonia.",
@@ -289,6 +179,114 @@ defmodule Bumblebee.Audio.SpeechToTextWhisperTest do
                  end_timestamp_seconds: 40.72
                }
              ]
-    end
+           }
+  end
+
+  test "streaming without timestamps" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+
+    serving =
+      Bumblebee.Audio.speech_to_text_whisper(
+        model_info,
+        featurizer,
+        tokenizer,
+        generation_config,
+        chunk_num_seconds: 30,
+        defn_options: [compiler: EXLA],
+        stream: true
+      )
+
+    audio =
+      Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
+      |> File.read!()
+      |> Nx.from_binary(:f32)
+
+    stream = Nx.Serving.run(serving, audio)
+
+    assert Enum.to_list(stream) == [
+             %{
+               text:
+                 " An awakening from the book of Irish poetry part 1, read for LibriVox.org by Sonja. An awakening by Alice Pirlong. O spring will wake in the heart of me with the rapture of blown violets, when the green bud quickens on every tree to spring will wake in the heart of me, and queues of honey",
+               start_timestamp_seconds: nil,
+               end_timestamp_seconds: nil
+             },
+             %{
+               text:
+                 " will reign on the lee, tangling the grasses in silver nets. Yes, spring will awaken the heart of me with the rapture of blown violets. End of an awakening, this recording is in the public domain.",
+               start_timestamp_seconds: nil,
+               end_timestamp_seconds: nil
+             }
+           ]
+  end
+
+  test "streaming with timestamps" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
+
+    serving =
+      Bumblebee.Audio.speech_to_text_whisper(
+        model_info,
+        featurizer,
+        tokenizer,
+        generation_config,
+        chunk_num_seconds: 30,
+        defn_options: [compiler: EXLA],
+        timestamps: :segments,
+        stream: true
+      )
+
+    audio =
+      Path.join(@audio_dir, "librivox/46s_pcm_f32le_16000.bin")
+      |> File.read!()
+      |> Nx.from_binary(:f32)
+
+    stream = Nx.Serving.run(serving, audio)
+
+    assert Enum.to_list(stream) == [
+             %{
+               text:
+                 " An awakening from the book of Irish poetry part 1, read for LibriVox.org by Sonia.",
+               start_timestamp_seconds: 0.0,
+               end_timestamp_seconds: 7.0
+             },
+             %{
+               text: " An awakening by Alice Pirlong.",
+               start_timestamp_seconds: 7.0,
+               end_timestamp_seconds: 11.0
+             },
+             %{
+               text:
+                 " O spring will wake in the heart of me with the rapture of blown violets, when the green bud",
+               start_timestamp_seconds: 11.0,
+               end_timestamp_seconds: 18.12
+             },
+             %{
+               text:
+                 " quickens on every tree to spring will wake in the heart of me, and queues of honey will reign on the lee,",
+               start_timestamp_seconds: 18.12,
+               end_timestamp_seconds: 25.92
+             },
+             %{
+               text:
+                 " tangling the grasses in silver nets. Yes, spring will awaken the heart of me",
+               start_timestamp_seconds: 25.92,
+               end_timestamp_seconds: 32.48
+             },
+             %{
+               text: " with the rapture of blown violets.",
+               start_timestamp_seconds: 32.48,
+               end_timestamp_seconds: 34.88
+             },
+             %{
+               text: " End of an awakening, this recording is in the public domain.",
+               start_timestamp_seconds: 36.96,
+               end_timestamp_seconds: 40.72
+             }
+           ]
   end
 end
diff --git a/test/bumblebee/audio/whisper_featurizer_test.exs b/test/bumblebee/audio/whisper_featurizer_test.exs
index 28a22082..4e065f97 100644
--- a/test/bumblebee/audio/whisper_featurizer_test.exs
+++ b/test/bumblebee/audio/whisper_featurizer_test.exs
@@ -4,7 +4,7 @@ defmodule Bumblebee.Audio.WhisperFeaturizerTest do
   import Bumblebee.TestHelpers
 
   describe "integration" do
-    test "encoding model input" do
+    test "encodes audio" do
       assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
 
       assert %Bumblebee.Audio.WhisperFeaturizer{} = featurizer
diff --git a/test/bumblebee/audio/whisper_test.exs b/test/bumblebee/audio/whisper_test.exs
index 7152aa27..c1973e7d 100644
--- a/test/bumblebee/audio/whisper_test.exs
+++ b/test/bumblebee/audio/whisper_test.exs
@@ -1,109 +1,59 @@
 defmodule Bumblebee.Text.WhisperTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/whisper-tiny"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-WhisperModel"})
 
-      assert %Bumblebee.Audio.Whisper{architecture: :base} = spec
+    assert %Bumblebee.Audio.Whisper{architecture: :base} = spec
 
-      input_features = Nx.sin(Nx.iota({1, 3000, 80}, type: :f32))
-      decoder_input_ids = Nx.tensor([[50258, 50259, 50359, 50363]])
+    inputs = %{
+      "input_features" => Nx.sin(Nx.iota({1, 60, 80}, type: :f32)),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_features" => input_features,
-        "decoder_input_ids" => decoder_input_ids
-      }
+    outputs = Axon.predict(model, params, inputs)
 
-      outputs = Axon.predict(model, params, inputs)
+    assert Nx.shape(outputs.hidden_state) == {1, 8, 16}
 
-      assert Nx.shape(outputs.hidden_state) == {1, 4, 384}
-
-      assert_all_close(
-        outputs.hidden_state[[.., .., 1..3]],
-        Nx.tensor([
-          [
-            [9.1349, 0.5695, 8.7758],
-            [0.0160, -7.0785, 1.1313],
-            [6.1074, -2.0481, -1.5687],
-            [5.6247, -10.3924, 7.2008]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "base model with safetensors" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "openai/whisper-tiny"},
-                 architecture: :base,
-                 params_filename: "model.safetensors"
-               )
-
-      assert %Bumblebee.Audio.Whisper{architecture: :base} = spec
-
-      input_features = Nx.sin(Nx.iota({1, 3000, 80}, type: :f32))
-      decoder_input_ids = Nx.tensor([[50258, 50259, 50359, 50363]])
-
-      inputs = %{
-        "input_features" => input_features,
-        "decoder_input_ids" => decoder_input_ids
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.hidden_state) == {1, 4, 384}
-
-      assert_all_close(
-        outputs.hidden_state[[.., .., 1..3]],
-        Nx.tensor([
-          [
-            [9.1349, 0.5695, 8.7758],
-            [0.0160, -7.0785, 1.1313],
-            [6.1074, -2.0481, -1.5687],
-            [5.6247, -10.3924, 7.2008]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "for conditional generation model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.3791, -1.6131, -0.6913], [0.1247, -1.3631, 0.0034], [-0.0097, 0.2039, 1.9897]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Audio.Whisper{architecture: :for_conditional_generation} = spec
+  test ":for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-WhisperForConditionalGeneration"}
+             )
 
-      input_features = Nx.sin(Nx.iota({1, 3000, 80}, type: :f32))
-      decoder_input_ids = Nx.tensor([[50258, 50259, 50359, 50363]])
+    assert %Bumblebee.Audio.Whisper{architecture: :for_conditional_generation} = spec
 
-      inputs = %{
-        "input_features" => input_features,
-        "decoder_input_ids" => decoder_input_ids
-      }
+    inputs = %{
+      "input_features" => Nx.sin(Nx.iota({1, 60, 80}, type: :f32)),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 4, 51865}
+    assert Nx.shape(outputs.logits) == {1, 8, 50257}
 
-      assert_all_close(
-        outputs.logits[[.., .., 1..3]],
-        Nx.tensor([
-          [
-            [2.0805, 6.0644, 7.0570],
-            [-7.8065, -3.0313, -5.1049],
-            [17.4098, 16.2510, 16.0446],
-            [-7.7142, -5.9466, -6.1812]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0942, 0.1288, 0.0243], [-0.1667, -0.1401, 0.1191], [0.0398, -0.0449, -0.0574]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/diffusion/stable_diffusion/safety_checker_test.exs b/test/bumblebee/diffusion/stable_diffusion/safety_checker_test.exs
index af54d218..1a27c070 100644
--- a/test/bumblebee/diffusion/stable_diffusion/safety_checker_test.exs
+++ b/test/bumblebee/diffusion/stable_diffusion/safety_checker_test.exs
@@ -5,185 +5,187 @@ defmodule Bumblebee.Diffusion.StableDiffusion.SafetyCheckerTest do
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "CompVis/stable-diffusion-v1-4", subdir: "safety_checker"}
-               )
+  # Note that here we use a full-sized model because the output is a
+  # binary answer and we want to validate that it actually works
+  @moduletag slow: true, timeout: 600_000
 
-      assert {:ok, featurizer} =
-               Bumblebee.load_featurizer(
-                 {:hf, "CompVis/stable-diffusion-v1-4", subdir: "feature_extractor"}
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "CompVis/stable-diffusion-v1-4", subdir: "safety_checker"}
+             )
 
-      assert %Bumblebee.Diffusion.StableDiffusion.SafetyChecker{architecture: :base} = spec
+    assert {:ok, featurizer} =
+             Bumblebee.load_featurizer(
+               {:hf, "CompVis/stable-diffusion-v1-4", subdir: "feature_extractor"}
+             )
 
-      safe1 = Nx.broadcast(100, {1, 224, 224, 3})
+    assert %Bumblebee.Diffusion.StableDiffusion.SafetyChecker{architecture: :base} = spec
 
-      safe2 =
-        Nx.tensor([
-          [
-            [204, 187, 172],
-            [165, 142, 130],
-            [216, 183, 169],
-            [226, 202, 183],
-            [220, 190, 178],
-            [180, 153, 148]
-          ],
-          [
-            [213, 195, 179],
-            [113, 96, 86],
-            [102, 91, 85],
-            [124, 111, 105],
-            [117, 97, 93],
-            [78, 72, 71]
-          ],
-          [
-            [203, 187, 172],
-            [90, 81, 75],
-            [83, 53, 50],
-            [83, 54, 51],
-            [70, 56, 53],
-            [127, 109, 107]
-          ],
-          [
-            [221, 210, 192],
-            [95, 83, 79],
-            [127, 65, 66],
-            [128, 63, 63],
-            [90, 70, 69],
-            [186, 163, 161]
-          ],
-          [
-            [227, 221, 198],
-            [131, 122, 116],
-            [192, 152, 147],
-            [180, 137, 126],
-            [143, 121, 114],
-            [204, 175, 172]
-          ],
-          [
-            [178, 172, 156],
-            [89, 83, 78],
-            [156, 145, 136],
-            [116, 104, 95],
-            [96, 77, 73],
-            [176, 144, 142]
-          ]
-        ])
+    safe1 = Nx.broadcast(100, {1, 224, 224, 3})
 
-      unsafe1 =
-        Nx.tensor([
-          [
-            [158, 106, 56],
-            [222, 147, 81],
-            [233, 180, 125],
-            [226, 164, 105],
-            [244, 205, 162],
-            [189, 170, 148]
-          ],
-          [
-            [180, 124, 68],
-            [233, 163, 88],
-            [232, 183, 130],
-            [213, 145, 88],
-            [244, 196, 145],
-            [224, 203, 178]
-          ],
-          [
-            [202, 158, 108],
-            [228, 168, 103],
-            [220, 162, 108],
-            [190, 133, 88],
-            [223, 168, 118],
-            [229, 206, 182]
-          ],
-          [
-            [172, 135, 106],
-            [237, 197, 160],
-            [241, 203, 171],
-            [236, 192, 153],
-            [224, 178, 138],
-            [141, 109, 89]
-          ],
-          [
-            [156, 112, 88],
-            [228, 179, 139],
-            [234, 188, 153],
-            [225, 171, 132],
-            [217, 163, 124],
-            [123, 86, 67]
-          ],
-          [
-            [133, 88, 66],
-            [181, 113, 75],
-            [151, 93, 63],
-            [150, 89, 61],
-            [162, 94, 63],
-            [100, 63, 47]
-          ]
-        ])
+    safe2 =
+      Nx.tensor([
+        [
+          [204, 187, 172],
+          [165, 142, 130],
+          [216, 183, 169],
+          [226, 202, 183],
+          [220, 190, 178],
+          [180, 153, 148]
+        ],
+        [
+          [213, 195, 179],
+          [113, 96, 86],
+          [102, 91, 85],
+          [124, 111, 105],
+          [117, 97, 93],
+          [78, 72, 71]
+        ],
+        [
+          [203, 187, 172],
+          [90, 81, 75],
+          [83, 53, 50],
+          [83, 54, 51],
+          [70, 56, 53],
+          [127, 109, 107]
+        ],
+        [
+          [221, 210, 192],
+          [95, 83, 79],
+          [127, 65, 66],
+          [128, 63, 63],
+          [90, 70, 69],
+          [186, 163, 161]
+        ],
+        [
+          [227, 221, 198],
+          [131, 122, 116],
+          [192, 152, 147],
+          [180, 137, 126],
+          [143, 121, 114],
+          [204, 175, 172]
+        ],
+        [
+          [178, 172, 156],
+          [89, 83, 78],
+          [156, 145, 136],
+          [116, 104, 95],
+          [96, 77, 73],
+          [176, 144, 142]
+        ]
+      ])
 
-      unsafe2 =
-        Nx.tensor([
-          [
-            [148, 120, 67],
-            [152, 123, 70],
-            [136, 109, 59],
-            [114, 88, 44],
-            [171, 114, 69],
-            [233, 135, 85]
-          ],
-          [
-            [158, 126, 72],
-            [209, 139, 87],
-            [211, 139, 87],
-            [211, 144, 91],
-            [247, 161, 107],
-            [243, 148, 96]
-          ],
-          [
-            [187, 136, 82],
-            [223, 138, 88],
-            [249, 119, 82],
-            [251, 99, 73],
-            [249, 138, 97],
-            [249, 162, 111]
-          ],
-          [
-            [245, 160, 106],
-            [244, 167, 112],
-            [241, 128, 90],
-            [233, 91, 67],
-            [226, 103, 73],
-            [215, 113, 72]
-          ],
-          [
-            [168, 79, 45],
-            [173, 76, 43],
-            [171, 56, 33],
-            [162, 41, 25],
-            [150, 35, 20],
-            [142, 44, 24]
-          ],
-          [[86, 52, 24], [111, 66, 33], [109, 61, 30], [104, 48, 21], [103, 46, 20], [72, 32, 17]]
-        ])
+    unsafe1 =
+      Nx.tensor([
+        [
+          [158, 106, 56],
+          [222, 147, 81],
+          [233, 180, 125],
+          [226, 164, 105],
+          [244, 205, 162],
+          [189, 170, 148]
+        ],
+        [
+          [180, 124, 68],
+          [233, 163, 88],
+          [232, 183, 130],
+          [213, 145, 88],
+          [244, 196, 145],
+          [224, 203, 178]
+        ],
+        [
+          [202, 158, 108],
+          [228, 168, 103],
+          [220, 162, 108],
+          [190, 133, 88],
+          [223, 168, 118],
+          [229, 206, 182]
+        ],
+        [
+          [172, 135, 106],
+          [237, 197, 160],
+          [241, 203, 171],
+          [236, 192, 153],
+          [224, 178, 138],
+          [141, 109, 89]
+        ],
+        [
+          [156, 112, 88],
+          [228, 179, 139],
+          [234, 188, 153],
+          [225, 171, 132],
+          [217, 163, 124],
+          [123, 86, 67]
+        ],
+        [
+          [133, 88, 66],
+          [181, 113, 75],
+          [151, 93, 63],
+          [150, 89, 61],
+          [162, 94, 63],
+          [100, 63, 47]
+        ]
+      ])
 
-      # Note: the example images are downscaled to 6x6 as it appears
-      # to be enough for this test case and this way we don't need to
-      # keep unsafe images around
-      inputs = Bumblebee.apply_featurizer(featurizer, [safe1, safe2, unsafe1, unsafe2])
+    unsafe2 =
+      Nx.tensor([
+        [
+          [148, 120, 67],
+          [152, 123, 70],
+          [136, 109, 59],
+          [114, 88, 44],
+          [171, 114, 69],
+          [233, 135, 85]
+        ],
+        [
+          [158, 126, 72],
+          [209, 139, 87],
+          [211, 139, 87],
+          [211, 144, 91],
+          [247, 161, 107],
+          [243, 148, 96]
+        ],
+        [
+          [187, 136, 82],
+          [223, 138, 88],
+          [249, 119, 82],
+          [251, 99, 73],
+          [249, 138, 97],
+          [249, 162, 111]
+        ],
+        [
+          [245, 160, 106],
+          [244, 167, 112],
+          [241, 128, 90],
+          [233, 91, 67],
+          [226, 103, 73],
+          [215, 113, 72]
+        ],
+        [
+          [168, 79, 45],
+          [173, 76, 43],
+          [171, 56, 33],
+          [162, 41, 25],
+          [150, 35, 20],
+          [142, 44, 24]
+        ],
+        [[86, 52, 24], [111, 66, 33], [109, 61, 30], [104, 48, 21], [103, 46, 20], [72, 32, 17]]
+      ])
 
-      outputs = Axon.predict(model, params, inputs)
+    # Note: the example images are downscaled to 6x6 as it appears
+    # to be enough for this test case and this way we don't need to
+    # keep unsafe images around
+    inputs = Bumblebee.apply_featurizer(featurizer, [safe1, safe2, unsafe1, unsafe2])
 
-      assert Nx.shape(outputs.is_unsafe) == {4}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.is_unsafe,
-        Nx.tensor([0, 0, 1, 1]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.is_unsafe) == {4}
+
+    assert_all_close(
+      outputs.is_unsafe,
+      Nx.tensor([0, 0, 1, 1]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/diffusion/stable_diffusion_test.exs b/test/bumblebee/diffusion/stable_diffusion_test.exs
index 2f332819..58355e13 100644
--- a/test/bumblebee/diffusion/stable_diffusion_test.exs
+++ b/test/bumblebee/diffusion/stable_diffusion_test.exs
@@ -3,11 +3,16 @@ defmodule Bumblebee.Diffusion.StableDiffusionTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
-  describe "integration" do
-    test "text_to_image/6" do
-      repository_id = "CompVis/stable-diffusion-v1-4"
+  describe "text_to_image/6" do
+    test "generates image for a text prompt" do
+      # Since we don't assert on the result in this case, we use
+      # a tiny random checkpoint. This test is basically to verify
+      # the whole generation computation end-to-end
+
+      # repository_id = "CompVis/stable-diffusion-v1-4"
+      repository_id = "bumblebee-testing/tiny-stable-diffusion"
 
       {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/clip-vit-large-patch14"})
 
@@ -33,7 +38,33 @@ defmodule Bumblebee.Diffusion.StableDiffusionTest do
 
       serving =
         Bumblebee.Diffusion.StableDiffusion.text_to_image(clip, unet, vae, tokenizer, scheduler,
-          num_steps: 2,
+          num_steps: 3,
+          safety_checker: safety_checker,
+          safety_checker_featurizer: featurizer
+        )
+
+      prompt = "numbat in forest, detailed, digital art"
+
+      assert %{
+               results: [%{image: %Nx.Tensor{}, is_safe: _boolean}]
+             } = Nx.Serving.run(serving, prompt)
+
+      # Without safety checker
+
+      serving =
+        Bumblebee.Diffusion.StableDiffusion.text_to_image(clip, unet, vae, tokenizer, scheduler,
+          num_steps: 3
+        )
+
+      prompt = "numbat in forest, detailed, digital art"
+
+      assert %{results: [%{image: %Nx.Tensor{}}]} = Nx.Serving.run(serving, prompt)
+
+      # With compilation
+
+      serving =
+        Bumblebee.Diffusion.StableDiffusion.text_to_image(clip, unet, vae, tokenizer, scheduler,
+          num_steps: 3,
           safety_checker: safety_checker,
           safety_checker_featurizer: featurizer,
           defn_options: [compiler: EXLA]
diff --git a/test/bumblebee/diffusion/unet_2d_conditional_test.exs b/test/bumblebee/diffusion/unet_2d_conditional_test.exs
index e85cc4c6..cc9b0867 100644
--- a/test/bumblebee/diffusion/unet_2d_conditional_test.exs
+++ b/test/bumblebee/diffusion/unet_2d_conditional_test.exs
@@ -1,45 +1,47 @@
 defmodule Bumblebee.Diffusion.UNet2DConditionalTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "CompVis/stable-diffusion-v1-4", subdir: "unet"},
-                 params_filename: "diffusion_pytorch_model.bin"
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-stable-diffusion-torch", subdir: "unet"},
+               params_filename: "diffusion_pytorch_model.bin"
+             )
 
-      assert %Bumblebee.Diffusion.UNet2DConditional{architecture: :base} = spec
+    assert %Bumblebee.Diffusion.UNet2DConditional{architecture: :base} = spec
 
-      inputs = %{
-        "sample" => Nx.broadcast(0.5, {1, 32, 32, 4}),
-        "timestep" => Nx.tensor(1),
-        "encoder_hidden_state" => Nx.broadcast(0.5, {1, 1, 768})
-      }
+    inputs = %{
+      "sample" => Nx.broadcast(0.5, {1, 32, 32, 4}),
+      "timestep" => Nx.tensor(1),
+      "encoder_hidden_state" => Nx.broadcast(0.5, {1, 1, 32})
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.sample) == {1, 32, 32, 4}
+    assert Nx.shape(outputs.sample) == {1, 32, 32, 4}
 
-      assert_all_close(
-        to_channels_first(outputs.sample)[[.., .., 1..3, 1..3]],
-        Nx.tensor([
+    assert_all_close(
+      to_channels_first(outputs.sample)[[.., 1..3, 1..3, 1..3]],
+      Nx.tensor([
+        [
           [
-            [[0.0283, -0.0525, 0.0433], [-0.1055, -0.1024, -0.0299], [-0.0498, -0.0391, 0.0032]],
-            [[-0.2615, 0.1989, 0.1763], [-0.1742, 0.2385, 0.2464], [-0.2188, 0.1589, 0.1809]],
-            [
-              [-0.5708, -0.3721, -0.2976],
-              [-0.2256, -0.0616, -0.0092],
-              [-0.2484, -0.1358, -0.0635]
-            ],
-            [[0.0672, 0.2093, 0.2373], [0.0086, 0.1947, 0.2024], [0.0041, 0.1981, 0.2100]]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+            [-1.0813, -0.5109, -0.1545],
+            [-0.8094, -1.2588, -0.8355],
+            [-0.9218, -1.2142, -0.6982]
+          ],
+          [
+            [-0.2179, -0.2799, -1.0922],
+            [-0.9485, -0.8376, 0.0843],
+            [-0.9650, -0.7105, -0.3920]
+          ],
+          [[1.3359, 0.8373, -0.2392], [0.9448, -0.0478, 0.6881], [-0.0154, -0.5304, 0.2081]]
+        ]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/diffusion/vae_kl_test.exs b/test/bumblebee/diffusion/vae_kl_test.exs
index bcd98fb3..0d9a9826 100644
--- a/test/bumblebee/diffusion/vae_kl_test.exs
+++ b/test/bumblebee/diffusion/vae_kl_test.exs
@@ -1,122 +1,114 @@
 defmodule Bumblebee.Diffusion.VaeKlTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "fusing/autoencoder-kl-dummy"},
-                 params_filename: "diffusion_pytorch_model.bin"
-               )
-
-      assert %Bumblebee.Diffusion.VaeKl{architecture: :base} = spec
-
-      inputs = %{
-        "sample" => Nx.broadcast(0.5, {1, 32, 32, 3})
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.sample) == {1, 32, 32, 3}
-
-      # Values from the PyTorch implementation with relaxed tolerance.
-      # This is expected, because the 2D convolution (conv_in) gives
-      # slightly different values
-      assert_all_close(
-        to_channels_first(outputs.sample)[[.., .., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [[-0.2663, -0.1856, -0.0329], [-0.3195, -0.2043, 0.0261], [-0.1437, 0.1092, -0.0886]],
-            [
-              [-0.1602, 0.0089, -0.0834],
-              [-0.2720, -0.2133, -0.2161],
-              [-0.2255, -0.4390, -0.0873]
-            ],
-            [[-0.1968, -0.1538, 0.0143], [-0.0999, -0.1270, -0.0190], [-0.0566, 0.1445, 0.0548]]
-          ]
-        ]),
-        atol: 5.0e-4
-      )
-    end
-
-    test "decoder model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "fusing/autoencoder-kl-dummy"},
-                 architecture: :decoder,
-                 params_filename: "diffusion_pytorch_model.bin"
-               )
-
-      assert %Bumblebee.Diffusion.VaeKl{architecture: :decoder} = spec
-
-      inputs = %{
-        "sample" => Nx.broadcast(0.5, {1, 16, 16, 4})
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.sample) == {1, 32, 32, 3}
-
-      assert_all_close(
-        to_channels_first(outputs.sample)[[.., .., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [[-0.3571, -0.2580, -0.0133], [-0.0827, 0.0831, 0.1217], [0.8464, 0.5589, 0.0858]],
-            [[-0.4579, -0.0463, 0.0853], [-0.8820, 0.0898, -0.4705], [-0.8381, -0.5012, 0.2303]],
-            [[0.2384, 1.0047, 0.4958], [-0.1108, 0.4506, 0.2563], [0.2548, 0.5310, -0.2233]]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "encoder model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "fusing/autoencoder-kl-dummy"},
-                 architecture: :encoder,
-                 params_filename: "diffusion_pytorch_model.bin"
-               )
-
-      assert %Bumblebee.Diffusion.VaeKl{architecture: :encoder} = spec
-
-      inputs = %{
-        "sample" => Nx.broadcast(0.5, {1, 32, 32, 3})
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.latent_dist.mean) == {1, 16, 16, 4}
-      assert Nx.shape(outputs.latent_dist.var) == {1, 16, 16, 4}
-      assert Nx.shape(outputs.latent_dist.logvar) == {1, 16, 16, 4}
-      assert Nx.shape(outputs.latent_dist.std) == {1, 16, 16, 4}
-
-      assert_all_close(
-        to_channels_first(outputs.latent_dist.mean)[[.., .., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [[0.1872, 0.4903, 0.1864], [0.0671, 0.5873, 0.1105], [-0.1166, 0.2500, 0.1097]],
-            [[0.2362, 0.5435, 0.2865], [-0.0456, 0.5072, 0.0343], [0.0375, 0.4808, 0.1607]],
-            [[-0.0504, -0.0917, 0.0713], [0.1328, -0.0544, 0.2171], [0.3996, 0.2134, 0.1796]],
-            [[0.2317, -0.1167, 0.1082], [0.4584, 0.0792, 0.0767], [0.2208, -0.0846, 0.0651]]
-          ]
-        ]),
-        atol: 5.0e-4
-      )
-
-      assert_all_close(
-        to_channels_first(outputs.latent_dist.var)[[.., .., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [[1.5876, 1.0834, 1.4341], [1.7221, 1.0370, 1.2434], [1.2043, 0.8315, 1.2684]],
-            [[1.6400, 0.9540, 1.4241], [2.1689, 1.1963, 1.4273], [1.5476, 0.9472, 1.3265]],
-            [[0.5249, 0.6610, 0.6645], [0.4862, 0.4959, 0.6945], [0.6391, 0.7181, 0.6905]],
-            [[0.8795, 1.1088, 1.2060], [1.0547, 0.9093, 0.9656], [1.0600, 0.9056, 1.1402]]
-          ]
-        ]),
-        atol: 5.0e-4
-      )
-    end
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-stable-diffusion-torch", subdir: "vae"},
+               params_filename: "diffusion_pytorch_model.bin"
+             )
+
+    assert %Bumblebee.Diffusion.VaeKl{architecture: :base} = spec
+
+    inputs = %{
+      "sample" => Nx.broadcast(0.5, {1, 32, 32, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.sample) == {1, 32, 32, 3}
+
+    assert_all_close(
+      to_channels_first(outputs.sample)[[.., .., 1..3, 1..3]],
+      Nx.tensor([
+        [
+          [[0.0164, -0.1439, 0.4768], [0.3165, 0.0599, 0.1729], [0.1148, 0.5428, 0.6126]],
+          [[0.3587, 0.4221, 0.1088], [0.4741, 0.4139, 0.6284], [0.4739, 0.1454, 0.3089]],
+          [[-0.2398, 0.2247, -0.2082], [-0.1440, 0.0256, -0.0120], [-0.0982, -0.3666, -0.1221]]
+        ]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":decoder" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-stable-diffusion-torch", subdir: "vae"},
+               architecture: :decoder,
+               params_filename: "diffusion_pytorch_model.bin"
+             )
+
+    assert %Bumblebee.Diffusion.VaeKl{architecture: :decoder} = spec
+
+    inputs = %{
+      "sample" => Nx.broadcast(0.5, {1, 16, 16, 4})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.sample) == {1, 32, 32, 3}
+
+    assert_all_close(
+      to_channels_first(outputs.sample)[[.., .., 1..3, 1..3]],
+      Nx.tensor([
+        [
+          [[-0.1682, -0.1015, -0.4158], [-0.4621, 0.5176, -0.0999], [-0.0685, 0.7141, 0.3287]],
+          [[-0.6000, -0.0538, -0.6703], [0.4113, 0.3203, -0.5005], [0.2073, -0.1205, 0.2487]],
+          [[0.2776, 0.6985, 0.2960], [0.4759, -0.3528, -0.7306], [0.5656, -0.5858, -0.2490]]
+        ]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":encoder" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-stable-diffusion-torch", subdir: "vae"},
+               architecture: :encoder,
+               params_filename: "diffusion_pytorch_model.bin"
+             )
+
+    assert %Bumblebee.Diffusion.VaeKl{architecture: :encoder} = spec
+
+    inputs = %{
+      "sample" => Nx.broadcast(0.5, {1, 32, 32, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.latent_dist.mean) == {1, 16, 16, 4}
+    assert Nx.shape(outputs.latent_dist.var) == {1, 16, 16, 4}
+    assert Nx.shape(outputs.latent_dist.logvar) == {1, 16, 16, 4}
+    assert Nx.shape(outputs.latent_dist.std) == {1, 16, 16, 4}
+
+    assert_all_close(
+      to_channels_first(outputs.latent_dist.mean)[[.., 1..3, 1..3, 1..3]],
+      Nx.tensor([
+        [
+          [[0.1788, 0.2526, 0.3464], [0.1255, 0.4318, 0.0935], [-0.3859, -0.1090, -0.1257]],
+          [[-0.6560, -0.1389, -0.1010], [-0.0494, -0.5862, -0.2144], [0.1139, -0.5287, -0.3207]],
+          [[0.2527, 0.6616, -0.1320], [0.2834, -0.1787, -0.1887], [0.2339, 0.6370, -0.1075]]
+        ]
+      ]),
+      atol: 1.0e-4
+    )
+
+    assert_all_close(
+      to_channels_first(outputs.latent_dist.var)[[.., 1..3, 1..3, 1..3]],
+      Nx.tensor([
+        [
+          [[0.7926, 0.6405, 0.8108], [0.4721, 0.9543, 0.8660], [0.5069, 0.7749, 0.9574]],
+          [[0.4830, 1.2762, 1.1277], [0.9835, 1.1715, 1.5034], [1.2341, 1.0105, 1.2950]],
+          [[1.7315, 1.8338, 1.7099], [1.6843, 1.5880, 1.2972], [1.2979, 1.3841, 1.1591]]
+        ]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/huggingface/hub_test.exs b/test/bumblebee/huggingface/hub_test.exs
index cad31697..0a6f2ffc 100644
--- a/test/bumblebee/huggingface/hub_test.exs
+++ b/test/bumblebee/huggingface/hub_test.exs
@@ -21,7 +21,7 @@ defmodule Bumblebee.HuggingFace.HubTest do
 
       url = url(bypass.port) <> "/file.json"
 
-      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir)
+      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
       assert File.read!(path) == "{}"
     end
 
@@ -37,8 +37,8 @@ defmodule Bumblebee.HuggingFace.HubTest do
 
       url = url(bypass.port) <> "/file.json"
 
-      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir)
-      assert {:ok, ^path} = Hub.cached_download(url, cache_dir: tmp_dir)
+      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
+      assert {:ok, ^path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
       assert File.read!(path) == "{}"
     end
 
@@ -64,8 +64,8 @@ defmodule Bumblebee.HuggingFace.HubTest do
 
       url = url(bypass.port) <> "/file.json"
 
-      assert {:ok, path1} = Hub.cached_download(url, cache_dir: tmp_dir)
-      assert {:ok, path2} = Hub.cached_download(url, cache_dir: tmp_dir)
+      assert {:ok, path1} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
+      assert {:ok, path2} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
       assert path1 != path2
       assert File.read!(path2) == "[]"
     end
@@ -89,7 +89,7 @@ defmodule Bumblebee.HuggingFace.HubTest do
 
       url = url(bypass.port) <> "/file.bin"
 
-      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir)
+      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
       assert File.read!(path) == <<0, 1>>
     end
 
@@ -110,8 +110,8 @@ defmodule Bumblebee.HuggingFace.HubTest do
 
       url = url(bypass.port) <> "/file.bin"
 
-      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir)
-      assert {:ok, ^path} = Hub.cached_download(url, cache_dir: tmp_dir)
+      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
+      assert {:ok, ^path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
       assert File.read!(path) == <<0, 1>>
     end
 
@@ -124,7 +124,7 @@ defmodule Bumblebee.HuggingFace.HubTest do
       url = url(bypass.port) <> "/file.json"
 
       assert {:error, "no ETag found on the resource"} =
-               Hub.cached_download(url, cache_dir: tmp_dir)
+               Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
     end
 
     @tag :tmp_dir
@@ -136,7 +136,7 @@ defmodule Bumblebee.HuggingFace.HubTest do
       url = url(bypass.port) <> "/file.json"
 
       assert {:error, "HTTP request failed with status 500, url: " <> _} =
-               Hub.cached_download(url, cache_dir: tmp_dir)
+               Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
     end
 
     @tag :tmp_dir
@@ -151,7 +151,7 @@ defmodule Bumblebee.HuggingFace.HubTest do
       url = url(bypass.port) <> "/file.json"
 
       assert {:error, "repository not found, url: " <> _} =
-               Hub.cached_download(url, cache_dir: tmp_dir)
+               Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
     end
 
     @tag :tmp_dir
@@ -167,7 +167,7 @@ defmodule Bumblebee.HuggingFace.HubTest do
 
       url = url(bypass.port) <> "/file.json"
 
-      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir)
+      assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: false)
       assert File.read!(path) == "{}"
 
       assert {:ok, path} = Hub.cached_download(url, cache_dir: tmp_dir, offline: true)
diff --git a/test/bumblebee/multimodal/blip_test.exs b/test/bumblebee/multimodal/blip_test.exs
index da75cfae..26a9d6eb 100644
--- a/test/bumblebee/multimodal/blip_test.exs
+++ b/test/bumblebee/multimodal/blip_test.exs
@@ -1,43 +1,32 @@
 defmodule Bumblebee.Multimodal.BlipTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "conditional generation model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "Salesforce/blip-image-captioning-base"})
-
-      assert %Bumblebee.Multimodal.Blip{architecture: :for_conditional_generation} = spec
-
-      inputs = %{
-        "decoder_input_ids" =>
-          Nx.tensor([
-            [101, 2019],
-            [101, 2019]
-          ]),
-        "decoder_attention_mask" => Nx.tensor([[1, 1], [1, 1]]),
-        "pixel_values" =>
-          Nx.concatenate([
-            Nx.broadcast(0.25, {1, 384, 384, 3}),
-            Nx.broadcast(0.75, {1, 384, 384, 3})
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {2, 2, 30524}
-
-      assert_all_close(
-        outputs.logits[[.., .., 1..3]],
-        Nx.tensor([
-          [[-3.6837, -3.6838, -3.6837], [-1.4808, -1.4809, -1.4808]],
-          [[-3.5190, -3.5191, -3.5190], [-1.4715, -1.4715, -1.4715]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+  test ":for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BlipForConditionalGeneration"}
+             )
+
+    assert %Bumblebee.Multimodal.Blip{architecture: :for_conditional_generation} = spec
+
+    inputs = %{
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]]),
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 8, 1124}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([[[0.1215, 0.0226, -0.1134], [0.1472, 0.1118, 0.1031], [-0.0687, 0.0104, 0.1781]]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/multimodal/clip_test.exs b/test/bumblebee/multimodal/clip_test.exs
index b58cd3e7..c7603c00 100644
--- a/test/bumblebee/multimodal/clip_test.exs
+++ b/test/bumblebee/multimodal/clip_test.exs
@@ -1,48 +1,49 @@
 defmodule Bumblebee.Multimodal.ClipTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"})
-
-      assert %Bumblebee.Multimodal.Clip{architecture: :base} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [49406, 320, 1125, 539, 320, 2368, 49407],
-            [49406, 320, 1125, 539, 320, 1929, 49407]
-          ]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]),
-        "pixel_values" =>
-          Nx.concatenate([
-            Nx.broadcast(0.25, {1, 224, 224, 3}),
-            Nx.broadcast(0.75, {1, 224, 224, 3})
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits_per_text) == {2, 2}
-
-      assert_all_close(
-        outputs.logits_per_text,
-        Nx.tensor([[22.7866, 22.8397], [23.2389, 22.8406]]),
-        atol: 1.0e-4
-      )
-
-      assert Nx.shape(outputs.logits_per_image) == {2, 2}
-
-      assert_all_close(
-        outputs.logits_per_image,
-        Nx.tensor([[22.7866, 23.2389], [22.8397, 22.8406]]),
-        atol: 1.0e-4
-      )
-    end
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-CLIPModel"})
+
+    assert %Bumblebee.Multimodal.Clip{architecture: :base} = spec
+
+    inputs = %{
+      "input_ids" =>
+        Nx.tensor([
+          [10, 20, 30, 40, 50, 60, 70, 80, 0, 0],
+          [15, 25, 35, 45, 55, 65, 75, 85, 0, 0]
+        ]),
+      "attention_mask" =>
+        Nx.tensor([
+          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
+          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
+        ]),
+      "pixel_values" =>
+        Nx.concatenate([
+          Nx.broadcast(0.25, {1, 30, 30, 3}),
+          Nx.broadcast(0.75, {1, 30, 30, 3})
+        ])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits_per_text) == {2, 2}
+    assert Nx.shape(outputs.logits_per_image) == {2, 2}
+
+    assert_all_close(
+      outputs.logits_per_text,
+      Nx.tensor([[0.5381, 0.1981], [0.5212, 0.3291]]),
+      atol: 1.0e-4
+    )
+
+    assert_all_close(
+      outputs.logits_per_image,
+      Nx.tensor([[0.5381, 0.5212], [0.1981, 0.3291]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/multimodal/layout_lm_test.exs b/test/bumblebee/multimodal/layout_lm_test.exs
index 9d820e0e..51073ca9 100644
--- a/test/bumblebee/multimodal/layout_lm_test.exs
+++ b/test/bumblebee/multimodal/layout_lm_test.exs
@@ -1,181 +1,208 @@
 defmodule Bumblebee.Multimodal.LayoutLmTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/layoutlm-base-uncased"},
-                 module: Bumblebee.Multimodal.LayoutLm,
-                 architecture: :base
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-LayoutLMModel"})
 
-      assert %Bumblebee.Multimodal.LayoutLm{architecture: :base} = spec
+    assert %Bumblebee.Multimodal.LayoutLm{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 7592, 2088, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1]]),
-        "token_type_ids" => Nx.tensor([[0, 0, 0, 0]]),
-        "bounding_box" =>
-          Nx.tensor([
-            [[0, 0, 0, 0], [637, 773, 693, 782], [698, 773, 733, 782], [1000, 1000, 1000, 1000]]
-          ])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "bounding_box" =>
+        Nx.tensor([
+          [
+            [10, 12, 16, 18],
+            [20, 22, 26, 28],
+            [30, 32, 36, 38],
+            [40, 42, 46, 48],
+            [50, 52, 56, 58],
+            [60, 62, 66, 68],
+            [70, 72, 76, 78],
+            [80, 82, 86, 88],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0]
+          ]
+        ])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([[[0.0240, -0.8855, 1.8877], [1.8435, 0.6223, 2.0573], [1.6961, -1.2411, 1.2824]]]),
+      atol: 1.0e-4
+    )
+  end
 
-      outputs = Axon.predict(model, params, inputs)
+  test ":for_masked_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-LayoutLMForMaskedLM"})
 
-      assert Nx.shape(outputs.hidden_state) == {1, 4, 768}
+    assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_masked_language_modeling} = spec
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "bounding_box" =>
         Nx.tensor([
-          [[-0.0126, 0.2175, 0.1398], [0.0240, 0.5338, -0.1337], [-0.0190, 0.5194, 0.0706]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/layoutlm-base-uncased"},
-                 module: Bumblebee.Multimodal.LayoutLm,
-                 architecture: :for_masked_language_modeling
-               )
-
-      assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_masked_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 7592, 2088, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1]]),
-        "token_type_ids" => Nx.tensor([[0, 0, 0, 0]]),
-        "bounding_box" =>
-          Nx.tensor([
-            [[0, 0, 0, 0], [637, 773, 693, 782], [698, 773, 733, 782], [1000, 1000, 1000, 1000]]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 4, 30522}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
+          [
+            [10, 12, 16, 18],
+            [20, 22, 26, 28],
+            [30, 32, 36, 38],
+            [40, 42, 46, 48],
+            [50, 52, 56, 58],
+            [60, 62, 66, 68],
+            [70, 72, 76, 78],
+            [80, 82, 86, 88],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0]
+          ]
+        ])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 1124}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.2101, -0.0342, 0.1613], [-0.0734, 0.1874, -0.0231], [-0.0776, 0.0145, 0.2504]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-LayoutLMForSequenceClassification"}
+             )
+
+    assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_sequence_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "bounding_box" =>
         Nx.tensor([
-          [[-0.9018, -0.7695, 1.1371], [0.1485, -0.1378, 1.6499], [-0.5236, -0.4974, -0.6739]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/layoutlm-base-uncased"},
-                 module: Bumblebee.Multimodal.LayoutLm,
-                 architecture: :for_sequence_classification
-               )
-
-      assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_sequence_classification} = spec
-
-      params =
-        update_in(params["sequence_classification_head.output"], fn %{"kernel" => k, "bias" => b} ->
-          %{"kernel" => Nx.broadcast(1.0, k), "bias" => Nx.broadcast(0.0, b)}
-        end)
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 7592, 2088, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1]]),
-        "token_type_ids" => Nx.tensor([[0, 0, 0, 0]]),
-        "bounding_box" =>
-          Nx.tensor([
-            [[0, 0, 0, 0], [637, 773, 693, 782], [698, 773, 733, 782], [1000, 1000, 1000, 1000]]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 2}
-
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-0.6356, -0.6356]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "token classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/layoutlm-base-uncased"},
-                 module: Bumblebee.Multimodal.LayoutLm,
-                 architecture: :for_token_classification
-               )
-
-      assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_token_classification} = spec
-
-      params =
-        update_in(params["token_classification_head.output"], fn %{"kernel" => k, "bias" => b} ->
-          %{"kernel" => Nx.broadcast(1.0, k), "bias" => Nx.broadcast(0.0, b)}
-        end)
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 7592, 2088, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1]]),
-        "token_type_ids" => Nx.tensor([[0, 0, 0, 0]]),
-        "bounding_box" =>
-          Nx.tensor([
-            [[0, 0, 0, 0], [637, 773, 693, 782], [698, 773, 733, 782], [1000, 1000, 1000, 1000]]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 4, 2}
-
-      assert_all_close(
-        outputs.logits,
+          [
+            [10, 12, 16, 18],
+            [20, 22, 26, 28],
+            [30, 32, 36, 38],
+            [40, 42, 46, 48],
+            [50, 52, 56, 58],
+            [60, 62, 66, 68],
+            [70, 72, 76, 78],
+            [80, 82, 86, 88],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0]
+          ]
+        ])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0241, 0.0096]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-LayoutLMForTokenClassification"}
+             )
+
+    assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_token_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "bounding_box" =>
+        Nx.tensor([
+          [
+            [10, 12, 16, 18],
+            [20, 22, 26, 28],
+            [30, 32, 36, 38],
+            [40, 42, 46, 48],
+            [50, 52, 56, 58],
+            [60, 62, 66, 68],
+            [70, 72, 76, 78],
+            [80, 82, 86, 88],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0]
+          ]
+        ])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[-0.1849, 0.1134], [-0.1329, 0.0025], [-0.0454, 0.0441]]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-LayoutLMForQuestionAnswering"}
+             )
+
+    assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_question_answering} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "token_type_ids" => Nx.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]),
+      "bounding_box" =>
         Nx.tensor([
-          [[-9.0337, -9.0337], [-7.6490, -7.6490], [-6.9672, -6.9672], [-9.0373, -9.0373]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "question answering model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "impira/layoutlm-document-qa"},
-                 module: Bumblebee.Multimodal.LayoutLm,
-                 architecture: :for_question_answering
-               )
-
-      assert %Bumblebee.Multimodal.LayoutLm{architecture: :for_question_answering} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 20920, 232, 2]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1]]),
-        "bounding_box" =>
-          Nx.tensor([
-            [[0, 0, 0, 0], [637, 773, 693, 782], [698, 773, 733, 782], [1000, 1000, 1000, 1000]]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.start_logits) == {1, 4}
-      assert Nx.shape(outputs.end_logits) == {1, 4}
-
-      assert_all_close(
-        outputs.start_logits,
-        Nx.tensor([[-5.7846, -9.4211, -14.8011, -18.0101]]),
-        atol: 1.0e-4
-      )
-
-      assert_all_close(
-        outputs.end_logits,
-        Nx.tensor([[-7.8913, -11.3020, -10.6801, -19.1530]]),
-        atol: 1.0e-4
-      )
-    end
+          [
+            [10, 12, 16, 18],
+            [20, 22, 26, 28],
+            [30, 32, 36, 38],
+            [40, 42, 46, 48],
+            [50, 52, 56, 58],
+            [60, 62, 66, 68],
+            [70, 72, 76, 78],
+            [80, 82, 86, 88],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0]
+          ]
+        ])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
+
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[-0.1853, 0.1580, 0.2387]]),
+      atol: 1.0e-3
+    )
+
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[-0.1854, -0.0074, 0.0670]]),
+      atol: 1.0e-3
+    )
   end
 end
diff --git a/test/bumblebee/text/albert_test.exs b/test/bumblebee/text/albert_test.exs
index 91f67ef5..2af48b4a 100644
--- a/test/bumblebee/text/albert_test.exs
+++ b/test/bumblebee/text/albert_test.exs
@@ -1,158 +1,160 @@
 defmodule Bumblebee.Text.AlbertTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "albert-base-v2"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-AlbertModel"})
 
-      assert %Bumblebee.Text.Albert{architecture: :base} = spec
+    assert %Bumblebee.Text.Albert{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]),
-        "attention_mask" => Nx.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 11, 768}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 36}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.6513, 1.5035, -0.2766], [-0.6515, 1.5046, -0.2780], [-0.6512, 1.5049, -0.2784]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "albert-base-v2"})
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0733, 0.0512, 1.2024], [-0.6761, -0.5774, 1.5411], [-1.4047, 2.3050, 0.6840]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Albert{architecture: :for_masked_language_modeling} = spec
+  test ":for_masked_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-AlbertForMaskedLM"})
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
+    assert %Bumblebee.Text.Albert{architecture: :for_masked_language_modeling} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 9, 30000}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[1.0450, -2.2835, -3.8152], [1.0635, -2.3124, -3.8890], [1.2576, -2.4207, -3.9500]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 10, 30000}
 
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "textattack/albert-base-v2-imdb"})
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.0895, -0.3613, 0.2426], [0.0475, -0.1905, 0.3426], [-0.5433, -0.0310, 0.0662]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Albert{architecture: :for_sequence_classification} = spec
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-AlbertForSequenceClassification"}
+             )
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
+    assert %Bumblebee.Text.Albert{architecture: :for_sequence_classification} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 2}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[0.4954, 0.1815]]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-    test "multiple choice model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "albert-base-v2"}, architecture: :for_multiple_choice)
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0050, 0.0035]]),
+      atol: 1.0e-4
+    )
+  end
 
-      # The base is missing classifier params so we set them to
-      # a static value here
-      params =
-        put_in(params["multiple_choice_head.output"], %{
-          "kernel" => Nx.broadcast(1.0e-3, {spec.hidden_size, 1}),
-          "bias" => Nx.tensor(0.0)
-        })
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-AlbertForTokenClassification"}
+             )
 
-      assert %Bumblebee.Text.Albert{architecture: :for_multiple_choice} = spec
+    assert %Bumblebee.Text.Albert{architecture: :for_token_classification} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]]]),
-        "attention_mask" => Nx.tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1]]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 1}
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-0.0200]]),
-        atol: 1.0e-3
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[0.1026, -0.2463], [0.1207, -0.1684], [-0.0811, -0.1414]]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "token classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "vumichien/tiny-albert"})
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-AlbertForQuestionAnswering"}
+             )
 
-      assert %Bumblebee.Text.Albert{architecture: :for_token_classification} = spec
+    assert %Bumblebee.Text.Albert{architecture: :for_question_answering} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "token_type_ids" => Nx.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 9, 2}
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
 
-      assert_all_close(
-        outputs.logits[[.., 1..3//1, ..]],
-        Nx.tensor([[[0.1364, -0.0437], [0.0360, -0.0786], [-0.1296, 0.0436]]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[0.0339, -0.0724, -0.0992]]),
+      atol: 1.0e-4
+    )
 
-    test "question answering model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "twmkn9/albert-base-v2-squad2"})
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[0.1820, 0.2451, 0.1535]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Albert{architecture: :for_question_answering} = spec
+  test ":for_multiple_choice" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-AlbertForMultipleChoice"}
+             )
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]])
-      }
+    assert %Bumblebee.Text.Albert{architecture: :for_multiple_choice} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]]),
+      "attention_mask" => Nx.tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]]),
+      "token_type_ids" => Nx.tensor([[[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]])
+    }
 
-      assert Nx.shape(outputs.start_logits) == {1, 9}
-      assert Nx.shape(outputs.end_logits) == {1, 9}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.start_logits[[.., 1..3]],
-        Nx.tensor([[-0.2464, -0.1028, -0.2076]]),
-        atol: 1.0e-4
-      )
+    assert Nx.shape(outputs.logits) == {1, 1}
 
-      assert_all_close(
-        outputs.end_logits[[.., 1..3]],
-        Nx.tensor([[-1.3742, -1.3609, -1.3454]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0048]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/albert_tokenizer_test.exs b/test/bumblebee/text/albert_tokenizer_test.exs
index 8be6711c..d4462c3b 100644
--- a/test/bumblebee/text/albert_tokenizer_test.exs
+++ b/test/bumblebee/text/albert_tokenizer_test.exs
@@ -3,64 +3,60 @@ defmodule Bumblebee.Text.AlbertTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "albert-base-v2"})
-
-      assert %Bumblebee.Text.AlbertTokenizer{} = tokenizer
-
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with [MASK].",
-          {"Question?", "Answer"}
-        ])
-
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [2, 1289, 5123, 29, 4, 13, 9, 3],
-          [2, 1301, 60, 3, 1623, 3, 0, 0]
-        ])
-      )
-
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1, 1],
-          [1, 1, 1, 1, 1, 1, 0, 0]
-        ])
-      )
-
-      assert_equal(
-        inputs["token_type_ids"],
-        Nx.tensor([
-          [0, 0, 0, 0, 0, 0, 0, 0],
-          [0, 0, 0, 0, 1, 1, 0, 0]
-        ])
-      )
-    end
-
-    test "encoding model input pads and truncates to :length" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "albert-base-v2"})
-
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, ["foo", "foo bar", "foo bar baz bang buzz"],
-          length: 6
-        )
-
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [2, 4310, 111, 3, 0, 0],
-          [2, 4310, 111, 748, 3, 0],
-          [2, 4310, 111, 748, 19674, 6582]
-        ])
-      )
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "albert-base-v2"})
+
+    assert %Bumblebee.Text.AlbertTokenizer{} = tokenizer
+
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with [MASK].",
+        {"Question?", "Answer"}
+      ])
+
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [2, 1289, 5123, 29, 4, 13, 9, 3],
+        [2, 1301, 60, 3, 1623, 3, 0, 0]
+      ])
+    )
+
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1, 1],
+        [1, 1, 1, 1, 1, 1, 0, 0]
+      ])
+    )
+
+    assert_equal(
+      inputs["token_type_ids"],
+      Nx.tensor([
+        [0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 1, 1, 0, 0]
+      ])
+    )
+  end
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([[1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1]])
-      )
-    end
+  test "pads and truncates to :length" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "albert-base-v2"})
+
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, ["foo", "foo bar", "foo bar baz bang buzz"], length: 6)
+
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [2, 4310, 111, 3, 0, 0],
+        [2, 4310, 111, 748, 3, 0],
+        [2, 4310, 111, 748, 19674, 6582]
+      ])
+    )
+
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([[1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1]])
+    )
   end
 end
diff --git a/test/bumblebee/text/bart_test.exs b/test/bumblebee/text/bart_test.exs
index 0854ec11..b492ae6d 100644
--- a/test/bumblebee/text/bart_test.exs
+++ b/test/bumblebee/text/bart_test.exs
@@ -1,169 +1,162 @@
 defmodule Bumblebee.Text.BartTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/bart-base"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BartModel"})
 
-      assert %Bumblebee.Text.Bart{architecture: :base} = spec
+    assert %Bumblebee.Text.Bart{architecture: :base} = spec
 
-      input_ids = Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
-
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 11, 768}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 16}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.3985, -1.2727, 1.8201], [1.2444, -1.5131, -0.9588], [-1.0806, -0.0743, 0.5012]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "conditional generation model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/bart-base"},
-                 architecture: :for_conditional_generation
-               )
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.9984, -0.0751, 0.4176], [0.0095, -0.3245, -0.4237], [-0.8061, -0.3498, 0.9201]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Bart{architecture: :for_conditional_generation} = spec
+  test ":for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BartForConditionalGeneration"}
+             )
 
-      input_ids = Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+    assert %Bumblebee.Text.Bart{architecture: :for_conditional_generation} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 11, 50265}
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      assert_all_close(
-        outputs.logits[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [-4.3683, 2.3527, -4.6605],
-          [-5.9831, 1.2762, -5.9307],
-          [-5.8700, 5.1656, -6.0870]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0000, -0.0601, -0.0501], [0.0000, 0.0443, 0.0813], [0.0000, -0.1303, 0.0968]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "valhalla/bart-large-sst2"})
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BartForSequenceClassification"}
+             )
 
-      assert %Bumblebee.Text.Bart{architecture: :for_sequence_classification} = spec
-      input_ids = Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+    assert %Bumblebee.Text.Bart{architecture: :for_sequence_classification} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 2, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 2}
+    assert Nx.shape(outputs.logits) == {1, 3}
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-0.1599, -0.0090]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0075, -0.0078, -0.0073]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "question answering model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "valhalla/bart-large-finetuned-squadv1"})
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BartForQuestionAnswering"}
+             )
 
-      assert %Bumblebee.Text.Bart{architecture: :for_question_answering} = spec
+    assert %Bumblebee.Text.Bart{architecture: :for_question_answering} = spec
 
-      input_ids = Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    outputs = Axon.predict(model, params, inputs)
 
-      outputs = Axon.predict(model, params, inputs)
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
 
-      assert Nx.shape(outputs.start_logits) == {1, 11}
-      assert Nx.shape(outputs.end_logits) == {1, 11}
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[0.0474, -0.0767, 0.0278]]),
+      atol: 1.0e-4
+    )
 
-      assert_all_close(
-        outputs.start_logits[[0, 1..3]],
-        Nx.tensor([-8.3735, -10.8867, -12.2982]),
-        atol: 1.0e-4
-      )
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[0.1557, -0.1034, -0.1271]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert_all_close(
-        outputs.end_logits[[0, 1..3]],
-        Nx.tensor([-8.7642, -7.8842, -11.4208]),
-        atol: 1.0e-4
-      )
-    end
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BartForCausalLM"})
 
-    test "causal language model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/bart-base"},
-                 architecture: :for_causal_language_modeling
-               )
+    assert %Bumblebee.Text.Bart{architecture: :for_causal_language_modeling} = spec
 
-      assert %Bumblebee.Text.Bart{architecture: :for_causal_language_modeling} = spec
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      input_ids = Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0000, -0.2084, -0.0013], [0.0000, -0.0502, 0.0656], [0.0000, -0.1301, -0.1234]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.logits) == {1, 11, 50265}
+  test "generation with :for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BartForConditionalGeneration"}
+             )
 
-      assert_all_close(
-        outputs.logits[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [-1.7658, -1.1057, -0.6313],
-          [-1.0344, 4.4774, 0.5581],
-          [-1.3625, 2.6272, -0.6478]
-        ]),
-        atol: 1.0e-4
+    {:ok, generation_config} =
+      Bumblebee.load_generation_config(
+        {:hf, "hf-internal-testing/tiny-random-BartForConditionalGeneration"}
       )
-    end
-  end
 
-  test "conditional generation" do
-    {:ok, model_info} = Bumblebee.load_model({:hf, "facebook/bart-large-cnn"})
-    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "facebook/bart-large-cnn"})
-
-    assert %Bumblebee.Text.Bart{architecture: :for_conditional_generation} = model_info.spec
+    assert %Bumblebee.Text.Bart{architecture: :for_conditional_generation} = spec
 
     inputs = %{
-      "input_ids" => Nx.tensor([[0, 133, 812, 9, 1470, 16, 2201, 2]]),
-      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1]])
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
     }
 
-    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 5)
-
-    generate =
-      Bumblebee.Text.Generation.build_generate(
-        model_info.model,
-        model_info.spec,
-        generation_config
-      )
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 3)
 
-    token_ids = generate.(model_info.params, inputs)
+    generate = Bumblebee.Text.Generation.build_generate(model, spec, generation_config)
+    token_ids = generate.(params, inputs)
 
-    assert_equal(token_ids, Nx.tensor([[2, 0, 133, 812, 9, 2]]))
+    assert_equal(token_ids, Nx.tensor([[2, 988, 988, 988]]))
   end
 end
diff --git a/test/bumblebee/text/bart_tokenizer_test.exs b/test/bumblebee/text/bart_tokenizer_test.exs
index 0b6113b3..b0b46b34 100644
--- a/test/bumblebee/text/bart_tokenizer_test.exs
+++ b/test/bumblebee/text/bart_tokenizer_test.exs
@@ -3,41 +3,39 @@ defmodule Bumblebee.Text.BartTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-base"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-base"})
 
-      assert %Bumblebee.Text.BartTokenizer{} = tokenizer
+    assert %Bumblebee.Text.BartTokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with [MASK].",
-          {"Question?", "Answer"}
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with [MASK].",
+        {"Question?", "Answer"}
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [0, 34603, 3645, 19, 646, 32804, 530, 8174, 2],
-          [0, 45641, 116, 2, 2, 33683, 2, 1, 1]
-        ])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [0, 34603, 3645, 19, 646, 32804, 530, 8174, 2],
+        [0, 45641, 116, 2, 2, 33683, 2, 1, 1]
+      ])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1, 1, 1],
-          [1, 1, 1, 1, 1, 1, 1, 0, 0]
-        ])
-      )
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1, 1, 1],
+        [1, 1, 1, 1, 1, 1, 1, 0, 0]
+      ])
+    )
 
-      assert_equal(
-        inputs["token_type_ids"],
-        Nx.tensor([
-          [0, 0, 0, 0, 0, 0, 0, 0, 0],
-          [0, 0, 0, 0, 0, 0, 0, 0, 0]
-        ])
-      )
-    end
+    assert_equal(
+      inputs["token_type_ids"],
+      Nx.tensor([
+        [0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0]
+      ])
+    )
   end
 end
diff --git a/test/bumblebee/text/bert_test.exs b/test/bumblebee/text/bert_test.exs
index 3c359f2d..8e504781 100644
--- a/test/bumblebee/text/bert_test.exs
+++ b/test/bumblebee/text/bert_test.exs
@@ -1,236 +1,205 @@
 defmodule Bumblebee.Text.BertTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "bert-base-uncased"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BertModel"})
 
-      assert %Bumblebee.Text.Bert{architecture: :base} = spec
+    assert %Bumblebee.Text.Bert{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]),
-        "attention_mask" => Nx.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.hidden_state) == {1, 11, 768}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([[[0.4249, 0.1008, 0.7531], [0.3771, 0.1188, 0.7467], [0.4152, 0.1098, 0.7108]]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "bert-base-uncased"})
-
-      assert %Bumblebee.Text.Bert{architecture: :for_masked_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 9, 30522}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [-14.7240, -14.2120, -14.6434],
-            [-10.3125, -9.7459, -9.9923],
-            [-15.1105, -14.8048, -14.9276]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "sequence classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "textattack/bert-base-uncased-yelp-polarity"},
-                 architecture: :for_sequence_classification
-               )
-
-      assert %Bumblebee.Text.Bert{architecture: :for_sequence_classification} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 2002, 7317, 4747, 102]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 2}
-
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-1.3199, 1.5447]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "token classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "dbmdz/bert-large-cased-finetuned-conll03-english"})
-
-      assert %Bumblebee.Text.Bert{architecture: :for_token_classification} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 2123, 1105, 1203, 1365, 102]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 13, 9}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-3.1215, -0.4028, -3.3213], [-2.4627, 0.0613, -3.2501], [-3.1475, -0.7705, -2.8248]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "question answering" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "deepset/bert-base-cased-squad2"})
-
-      assert %Bumblebee.Text.Bert{architecture: :for_question_answering} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [101, 2627, 1108, 3104, 1124, 15703, 136] ++
-              [102, 3104, 1124, 15703, 1108, 170, 3505, 16797, 102]
-          ]),
-        "token_type_ids" => Nx.tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.start_logits) == {1, 16}
-      assert Nx.shape(outputs.end_logits) == {1, 16}
-
-      assert_all_close(
-        outputs.start_logits[[.., 1..3]],
-        Nx.tensor([[-6.9344, -6.9556, -2.8814]]),
-        atol: 1.0e-4
-      )
-
-      assert_all_close(
-        outputs.end_logits[[.., 1..3]],
-        Nx.tensor([[-7.3395, -7.9609, -7.4926]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "multiple choice" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "nightingal3/bert-finetuned-wsc"})
-
-      assert %Bumblebee.Text.Bert{architecture: :for_multiple_choice} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [
-              [101, 1999, 3304, 10733, 2003, 8828, 102, 2478, 9292, 1998, 5442, 1012, 102, 0],
-              [101, 1999, 3304, 10733, 2003, 8828, 102, 2096, 2218, 1999, 1996, 2192, 1012, 102]
-            ]
-          ]),
-        "attention_mask" =>
-          Nx.tensor([
-            [
-              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
-              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-            ]
-          ]),
-        "token_type_ids" =>
-          Nx.tensor([
-            [
-              [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0],
-              [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-            ]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 2}
-
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[0.3749, -3.9458]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "next sentence prediction" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "bert-base-uncased"},
-                 architecture: :for_next_sentence_prediction
-               )
-
-      assert %Bumblebee.Text.Bert{architecture: :for_next_sentence_prediction} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [101, 1999, 3304, 10733, 2003, 2366, 4895, 14540, 6610, 2094, 1012] ++
-              [102, 2059, 1996, 8013, 20323, 2009, 2478, 9292, 1998, 5442, 1012, 102]
-          ]),
-        "token_type_ids" =>
-          Nx.tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 2}
-
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[6.1459, -5.7820]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "causal language modeling" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "bert-base-uncased"},
-                 architecture: :for_causal_language_modeling
-               )
-
-      assert %Bumblebee.Text.Bert{architecture: :for_causal_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 8, 30522}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-6.0980, -6.1492, -6.0886], [-6.1857, -6.2198, -6.2982], [-6.3880, -6.3918, -6.3503]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.2331, 1.7817, 1.1736], [-1.1001, 1.3922, -0.3391], [0.0408, 0.8677, -0.0779]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_masked_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BertForMaskedLM"})
+
+    assert %Bumblebee.Text.Bert{architecture: :for_masked_language_modeling} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 1124}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([[[-0.0127, 0.0508, 0.0904], [0.1151, 0.1189, 0.0922], [0.0089, 0.1132, -0.2470]]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BertForSequenceClassification"}
+             )
+
+    assert %Bumblebee.Text.Bert{architecture: :for_sequence_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0037, -0.0239]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BertForTokenClassification"}
+             )
+
+    assert %Bumblebee.Text.Bert{architecture: :for_token_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[0.2078, 0.0055], [0.0681, 0.1132], [0.1049, 0.0479]]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BertForQuestionAnswering"}
+             )
+
+    assert %Bumblebee.Text.Bert{architecture: :for_question_answering} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "token_type_ids" => Nx.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
+
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[0.0465, 0.1204, 0.2137]]),
+      atol: 1.0e-4
+    )
+
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[0.1654, 0.0930, 0.1304]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_multiple_choice" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BertForMultipleChoice"})
+
+    assert %Bumblebee.Text.Bert{architecture: :for_multiple_choice} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]]),
+      "attention_mask" => Nx.tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]]),
+      "token_type_ids" => Nx.tensor([[[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 1}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0033]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_next_sentence_prediction" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BertForNextSentencePrediction"}
+             )
+
+    assert %Bumblebee.Text.Bert{architecture: :for_next_sentence_prediction} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "token_type_ids" => Nx.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0072, 0.0098]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BertLMHeadModel"})
+
+    assert %Bumblebee.Text.Bert{architecture: :for_causal_language_modeling} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 1124}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.0498, 0.0272, -0.0722], [-0.2410, 0.1069, -0.2430], [-0.0683, 0.0077, -0.1277]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/bert_tokenizer_test.exs b/test/bumblebee/text/bert_tokenizer_test.exs
index 248b1f46..4c2ab4c0 100644
--- a/test/bumblebee/text/bert_tokenizer_test.exs
+++ b/test/bumblebee/text/bert_tokenizer_test.exs
@@ -3,87 +3,85 @@ defmodule Bumblebee.Text.BertTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
-
-      assert %Bumblebee.Text.BertTokenizer{} = tokenizer
-
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with [MASK].",
-          {"Question?", "Answer"}
-        ])
-
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [101, 3231, 6251, 2007, 103, 1012, 102],
-          [101, 3160, 1029, 102, 3437, 102, 0]
-        ])
-      )
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
+
+    assert %Bumblebee.Text.BertTokenizer{} = tokenizer
+
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with [MASK].",
+        {"Question?", "Answer"}
+      ])
+
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [101, 3231, 6251, 2007, 103, 1012, 102],
+        [101, 3160, 1029, 102, 3437, 102, 0]
+      ])
+    )
+
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1],
+        [1, 1, 1, 1, 1, 1, 0]
+      ])
+    )
+
+    assert_equal(
+      inputs["token_type_ids"],
+      Nx.tensor([
+        [0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 1, 1, 0]
+      ])
+    )
+  end
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1],
-          [1, 1, 1, 1, 1, 1, 0]
-        ])
-      )
+  test "encoding with special tokens mask" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
 
-      assert_equal(
-        inputs["token_type_ids"],
-        Nx.tensor([
-          [0, 0, 0, 0, 0, 0, 0],
-          [0, 0, 0, 0, 1, 1, 0]
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(
+        tokenizer,
+        [
+          "Test sentence with [MASK]."
+        ],
+        return_special_tokens_mask: true
       )
-    end
-
-    test "encoding with special tokens mask" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
-
-      inputs =
-        Bumblebee.apply_tokenizer(
-          tokenizer,
-          [
-            "Test sentence with [MASK]."
-          ],
-          return_special_tokens_mask: true
-        )
 
-      assert_equal(inputs["special_tokens_mask"], Nx.tensor([[1, 0, 0, 0, 0, 0, 1]]))
-    end
+    assert_equal(inputs["special_tokens_mask"], Nx.tensor([[1, 0, 0, 0, 0, 0, 1]]))
+  end
 
-    test "encoding with offsets" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
+  test "encoding with offsets" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
 
-      inputs =
-        Bumblebee.apply_tokenizer(
-          tokenizer,
-          [
-            "Test sentence with [MASK]."
-          ],
-          return_offsets: true
-        )
+    inputs =
+      Bumblebee.apply_tokenizer(
+        tokenizer,
+        [
+          "Test sentence with [MASK]."
+        ],
+        return_offsets: true
+      )
 
-      assert_equal(inputs["start_offsets"], Nx.tensor([[0, 0, 5, 14, 19, 25, 0]]))
-      assert_equal(inputs["end_offsets"], Nx.tensor([[0, 4, 13, 18, 25, 26, 0]]))
-    end
+    assert_equal(inputs["start_offsets"], Nx.tensor([[0, 0, 5, 14, 19, 25, 0]]))
+    assert_equal(inputs["end_offsets"], Nx.tensor([[0, 4, 13, 18, 25, 26, 0]]))
+  end
 
-    test "encoding with multiple lengths" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
+  test "encoding with multiple lengths" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
 
-      inputs = Bumblebee.apply_tokenizer(tokenizer, "This is short.", length: [8, 16])
+    inputs = Bumblebee.apply_tokenizer(tokenizer, "This is short.", length: [8, 16])
 
-      assert {1, 8} = Nx.shape(inputs["input_ids"])
+    assert {1, 8} = Nx.shape(inputs["input_ids"])
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, "This is definitely much longer than the above.",
-          length: [8, 16]
-        )
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, "This is definitely much longer than the above.",
+        length: [8, 16]
+      )
 
-      assert {1, 16} = Nx.shape(inputs["input_ids"])
-    end
+    assert {1, 16} = Nx.shape(inputs["input_ids"])
   end
 end
diff --git a/test/bumblebee/text/blenderbot_test.exs b/test/bumblebee/text/blenderbot_test.exs
index 6a20bf60..860da566 100644
--- a/test/bumblebee/text/blenderbot_test.exs
+++ b/test/bumblebee/text/blenderbot_test.exs
@@ -1,82 +1,87 @@
 defmodule Bumblebee.Text.BlenderbotTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/blenderbot-400M-distill"},
-                 architecture: :base
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BlenderbotModel"})
 
-      assert %Bumblebee.Text.Blenderbot{architecture: :base} = spec
+    assert %Bumblebee.Text.Blenderbot{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[1710, 86, 1085, 2]]),
-        "decoder_input_ids" => Nx.tensor([[1, 86]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 2, 1280}
+    assert Nx.shape(outputs.hidden_state) == {1, 8, 16}
 
-      assert_all_close(
-        outputs.hidden_state[[0, .., 1..3]],
-        Nx.tensor([[0.1749, 0.4835, 0.3060], [0.0664, 0.0215, 0.5945]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.6578, 1.9730, 0.6908], [-1.8067, 0.0553, -0.7491], [0.1820, -0.4390, -0.8273]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "conditional generation model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/blenderbot-400M-distill"})
+  test ":for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BlenderbotForConditionalGeneration"}
+             )
 
-      assert %Bumblebee.Text.Blenderbot{architecture: :for_conditional_generation} = spec
+    assert %Bumblebee.Text.Blenderbot{architecture: :for_conditional_generation} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[1710, 86, 1085, 2]]),
-        "decoder_input_ids" => Nx.tensor([[1, 86]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 2, 8008}
+    assert Nx.shape(outputs.logits) == {1, 8, 1024}
 
-      assert_all_close(
-        outputs.logits[[0, .., 1..3]],
-        Nx.tensor([[12.0658, 3.7026, -4.7830], [-2.9581, 7.9437, -5.8420]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0440, -0.0115, -0.0004], [0.0772, 0.0327, -0.0667], [-0.0419, 0.1483, 0.0140]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 
-  test "conditional generation" do
-    {:ok, model_info} = Bumblebee.load_model({:hf, "facebook/blenderbot-400M-distill"})
+  test "generation with :for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BlenderbotForConditionalGeneration"}
+             )
 
     {:ok, generation_config} =
-      Bumblebee.load_generation_config({:hf, "facebook/blenderbot-400M-distill"})
+      Bumblebee.load_generation_config(
+        {:hf, "hf-internal-testing/tiny-random-BlenderbotForConditionalGeneration"}
+      )
 
-    assert %Bumblebee.Text.Blenderbot{architecture: :for_conditional_generation} = model_info.spec
+    assert %Bumblebee.Text.Blenderbot{architecture: :for_conditional_generation} = spec
 
     inputs = %{
-      "input_ids" => Nx.tensor([[2675, 19, 544, 366, 304, 38, 2]]),
-      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1]])
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
     }
 
-    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 5)
-
-    generate =
-      Bumblebee.Text.Generation.build_generate(
-        model_info.model,
-        model_info.spec,
-        generation_config
-      )
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 3)
 
-    token_ids = generate.(model_info.params, inputs)
+    generate = Bumblebee.Text.Generation.build_generate(model, spec, generation_config)
+    token_ids = generate.(params, inputs)
 
-    assert_equal(token_ids, Nx.tensor([[1, 281, 476, 929, 731, 2]]))
+    assert_equal(token_ids, Nx.tensor([[1, 382, 382, 382]]))
   end
 end
diff --git a/test/bumblebee/text/blip_text_test.exs b/test/bumblebee/text/blip_text_test.exs
index 334b8b21..39f2a674 100644
--- a/test/bumblebee/text/blip_text_test.exs
+++ b/test/bumblebee/text/blip_text_test.exs
@@ -1,61 +1,63 @@
 defmodule Bumblebee.Text.BlipTextTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "Salesforce/blip-vqa-base"},
-                 module: Bumblebee.Text.BlipText,
-                 architecture: :base
-               )
-
-      assert %Bumblebee.Text.BlipText{architecture: :base} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]),
-        "attention_mask" => Nx.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.hidden_state) == {1, 11, 768}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.0219, 0.0386, -0.0164], [-0.0205, 0.0398, -0.0155], [-0.0242, 0.0405, -0.0186]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "causal language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "Salesforce/blip-image-captioning-base"},
-                 module: Bumblebee.Text.BlipText,
-                 architecture: :for_causal_language_modeling
-               )
-
-      assert %Bumblebee.Text.BlipText{architecture: :for_causal_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]),
-        "attention_mask" => Nx.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 11, 30524}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([[[0.0525, 0.0526, 0.0525], [0.0433, 0.0434, 0.0433], [0.0833, 0.0834, 0.0833]]]),
-        atol: 1.0e-4
-      )
-    end
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BlipModel"},
+               module: Bumblebee.Text.BlipText,
+               architecture: :base
+             )
+
+    assert %Bumblebee.Text.BlipText{architecture: :base} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.9281, 1.2373, 0.4223], [-1.1549, 2.1187, -0.9194], [0.0237, -0.7517, 0.5720]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-BlipForConditionalGeneration"},
+               module: Bumblebee.Text.BlipText,
+               architecture: :for_causal_language_modeling
+             )
+
+    assert %Bumblebee.Text.BlipText{architecture: :for_causal_language_modeling} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 1124}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0736, -0.0142, 0.2178], [0.0744, 0.0990, 0.1510], [-0.1186, -0.1449, -0.0643]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/camembert_test.exs b/test/bumblebee/text/camembert_test.exs
index a433af2d..8697b320 100644
--- a/test/bumblebee/text/camembert_test.exs
+++ b/test/bumblebee/text/camembert_test.exs
@@ -1,55 +1,31 @@
 defmodule Bumblebee.Text.CamembertTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "camembert-base"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-camembert"})
 
-      assert %Bumblebee.Text.Roberta{architecture: :base} = spec
+    assert %Bumblebee.Text.Roberta{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 402, 232, 328, 740, 1140, 12695, 69, 1588, 2]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 10, 768}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 0..2, 0..2]],
-        Nx.tensor([
-          [[0.0592, 0.0688, 0.0185], [0.0024, 0.1443, -0.1943], [0.0102, 0.2724, -0.2474]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "camembert-base"})
-
-      assert %Bumblebee.Text.Roberta{architecture: :for_masked_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 402, 232, 328, 740, 1140, 12695, 69, 1588, 2]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 10, 32005}
-
-      assert_all_close(
-        outputs.logits[[.., 0..2, 0..2]],
-        Nx.tensor([
-          [[18.4213, -4.5504, 6.5444], [-1.2791, -2.4822, 2.8339], [-2.5561, -4.3118, -1.3791]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.1734, -0.5058, 0.6278], [-0.2506, -0.3877, -0.0394], [-0.4477, 1.9433, -0.7990]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/camembert_tokenizer_test.exs b/test/bumblebee/text/camembert_tokenizer_test.exs
index fb22d9ce..e4612739 100644
--- a/test/bumblebee/text/camembert_tokenizer_test.exs
+++ b/test/bumblebee/text/camembert_tokenizer_test.exs
@@ -3,30 +3,28 @@ defmodule Bumblebee.Text.CamembertTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "camembert-base"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "camembert-base"})
 
-      assert %Bumblebee.Text.CamembertTokenizer{} = tokenizer
+    assert %Bumblebee.Text.CamembertTokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with <mask>."
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with <mask>."
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [5, 9115, 22625, 1466, 32004, 21, 9, 6]
-        ])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [5, 9115, 22625, 1466, 32004, 21, 9, 6]
+      ])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1, 1]
-        ])
-      )
-    end
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1, 1]
+      ])
+    )
   end
 end
diff --git a/test/bumblebee/text/clip_text_test.exs b/test/bumblebee/text/clip_text_test.exs
index fcf985e2..5bbef50f 100644
--- a/test/bumblebee/text/clip_text_test.exs
+++ b/test/bumblebee/text/clip_text_test.exs
@@ -1,75 +1,66 @@
 defmodule Bumblebee.Text.ClipTextTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
-                 module: Bumblebee.Text.ClipText,
-                 architecture: :base
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-CLIPModel"},
+               module: Bumblebee.Text.ClipText,
+               architecture: :base
+             )
 
-      assert %Bumblebee.Text.ClipText{architecture: :base} = spec
+    assert %Bumblebee.Text.ClipText{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [49406, 320, 1125, 539, 320, 2368, 49407],
-            [49406, 320, 1125, 539, 320, 1929, 49407]
-          ]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1], [1, 1, 0, 0, 0, 0, 0]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {2, 7, 512}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+    assert Nx.shape(outputs.pooled_state) == {1, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.5844, 0.3685, -2.0744], [-0.9600, 1.0018, -0.2415], [-0.5957, -0.1719, 0.4689]],
-          [[-0.5844, 0.3685, -2.0744], [-0.0025, 0.1219, -0.0435], [0.0661, 0.1142, 0.0056]]
-        ]),
-        atol: 1.0e-4
-      )
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.1696, -0.2324, -0.1659], [-0.0525, -0.3103, 0.1557], [-0.2566, -0.4519, 0.6398]]
+      ]),
+      atol: 1.0e-4
+    )
 
-      assert_all_close(
-        outputs.pooled_state[[.., 1..3]],
-        Nx.tensor([[0.1658, 0.8876, 10.6313], [0.0130, 0.1167, 0.0371]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.pooled_state[[.., 1..3]],
+      Nx.tensor([[-0.6903, -1.2524, 1.5328]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "embedding model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
-                 module: Bumblebee.Text.ClipText,
-                 architecture: :for_embedding
-               )
+  test ":for_embedding" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-CLIPModel"},
+               module: Bumblebee.Text.ClipText,
+               architecture: :for_embedding
+             )
 
-      assert %Bumblebee.Text.ClipText{architecture: :for_embedding} = spec
+    assert %Bumblebee.Text.ClipText{architecture: :for_embedding} = spec
 
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [49406, 320, 1125, 539, 320, 2368, 49407]
-          ]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.embedding) == {1, 512}
+    assert Nx.shape(outputs.embedding) == {1, 64}
 
-      assert_all_close(
-        outputs.embedding[[.., 1..3]],
-        Nx.tensor([[0.0733, -0.2448, -0.2212]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.embedding[[.., 1..3]],
+      Nx.tensor([[1.1069, -0.0839, -1.6185]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/clip_tokenizer_test.exs b/test/bumblebee/text/clip_tokenizer_test.exs
index f751b90f..429a2f82 100644
--- a/test/bumblebee/text/clip_tokenizer_test.exs
+++ b/test/bumblebee/text/clip_tokenizer_test.exs
@@ -3,33 +3,31 @@ defmodule Bumblebee.Text.ClipTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/clip-vit-base-patch32"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/clip-vit-base-patch32"})
 
-      assert %Bumblebee.Text.ClipTokenizer{} = tokenizer
+    assert %Bumblebee.Text.ClipTokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "a photo of a cat",
-          "a photo of a dog"
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "a photo of a cat",
+        "a photo of a dog"
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [49406, 320, 1125, 539, 320, 2368, 49407],
-          [49406, 320, 1125, 539, 320, 1929, 49407]
-        ])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [49406, 320, 1125, 539, 320, 2368, 49407],
+        [49406, 320, 1125, 539, 320, 1929, 49407]
+      ])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1],
-          [1, 1, 1, 1, 1, 1, 1]
-        ])
-      )
-    end
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1],
+        [1, 1, 1, 1, 1, 1, 1]
+      ])
+    )
   end
 end
diff --git a/test/bumblebee/text/conversation_test.exs b/test/bumblebee/text/conversation_test.exs
index b32adea0..6ec9ed0e 100644
--- a/test/bumblebee/text/conversation_test.exs
+++ b/test/bumblebee/text/conversation_test.exs
@@ -3,27 +3,25 @@ defmodule Bumblebee.Text.ConversationTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
-  describe "integration" do
-    test "generates text" do
-      {:ok, model} = Bumblebee.load_model({:hf, "microsoft/DialoGPT-medium"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
+  test "generates text" do
+    {:ok, model} = Bumblebee.load_model({:hf, "microsoft/DialoGPT-medium"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
 
-      serving = Bumblebee.Text.conversation(model, tokenizer, generation_config)
+    serving = Bumblebee.Text.conversation(model, tokenizer, generation_config)
 
-      history = nil
+    history = nil
 
-      message = "Hey!"
+    message = "Hey!"
 
-      assert %{text: "Hey !", history: history} =
-               Nx.Serving.run(serving, %{text: message, history: history})
+    assert %{text: "Hey !", history: history} =
+             Nx.Serving.run(serving, %{text: message, history: history})
 
-      message = "What's up?"
+    message = "What's up?"
 
-      assert %{text: "Not much .", history: _history} =
-               Nx.Serving.run(serving, %{text: message, history: history})
-    end
+    assert %{text: "Not much .", history: _history} =
+             Nx.Serving.run(serving, %{text: message, history: history})
   end
 end
diff --git a/test/bumblebee/text/distilbert_test.exs b/test/bumblebee/text/distilbert_test.exs
index bd147b8d..0f1988b2 100644
--- a/test/bumblebee/text/distilbert_test.exs
+++ b/test/bumblebee/text/distilbert_test.exs
@@ -1,139 +1,158 @@
 defmodule Bumblebee.Text.DistilbertTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "distilbert-base-uncased"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-DistilBertModel"})
 
-      assert %Bumblebee.Text.Distilbert{architecture: :base} = spec
+    assert %Bumblebee.Text.Distilbert{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 7592, 1010, 2026, 3899, 2003, 10140, 102]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.hidden_state) == {1, 8, 768}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.1483, 0.3433, -0.5248], [0.5309, 0.3716, 0.0803], [0.3805, 0.5581, -0.4261]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "distilbert-base-uncased"})
-
-      assert %Bumblebee.Text.Distilbert{architecture: :for_masked_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 9, 30522}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [-14.1975, -13.9020, -13.9615],
-            [-8.8192, -8.5549, -8.3866],
-            [-13.4315, -13.2120, -13.3121]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "sequence classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "joeddav/distilbert-base-uncased-go-emotions-student"},
-                 architecture: :for_sequence_classification
-               )
-
-      assert %Bumblebee.Text.Distilbert{architecture: :for_sequence_classification} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[101, 1045, 2514, 5341, 2000, 2022, 2182, 1012, 102]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 28}
-
-      assert_all_close(
-        outputs.logits[[.., 1..4]],
-        Nx.tensor([[-0.2951, -1.8836, -1.9071, 1.2820]]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "token classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "Davlan/distilbert-base-multilingual-cased-ner-hrl"})
-
-      assert %Bumblebee.Text.Distilbert{architecture: :for_token_classification} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([[101, 11590, 11324, 10124, 14290, 10111, 146, 12962, 10106, 11193, 102]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 11, 9}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-3.9901, -4.0522, -2.4171], [-4.0584, -4.2153, -2.4035], [-3.9693, -4.0597, -2.2356]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "question answering" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "distilbert-base-cased-distilled-squad"})
-
-      assert %Bumblebee.Text.Distilbert{architecture: :for_question_answering} = spec
-
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124] ++
-              [15703, 1108, 170, 3505, 16797, 102]
-          ])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.start_logits) == {1, 16}
-      assert Nx.shape(outputs.end_logits) == {1, 16}
-
-      assert_all_close(
-        outputs.start_logits[[.., 1..3]],
-        Nx.tensor([[-5.1663, -6.8352, -3.5082]]),
-        atol: 1.0e-4
-      )
-
-      assert_all_close(
-        outputs.end_logits[[.., 1..3]],
-        Nx.tensor([[-4.5860, -6.7391, -6.8987]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.9427, 0.7933, 0.1031], [1.0913, 1.0214, -1.5890], [-2.1149, -0.3367, -0.6268]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_masked_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-DistilBertForMaskedLM"})
+
+    assert %Bumblebee.Text.Distilbert{architecture: :for_masked_language_modeling} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 1124}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.1839, -0.0195, 0.1220], [-0.2048, 0.0667, 0.0878], [-0.2045, -0.0483, -0.1567]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DistilBertForSequenceClassification"}
+             )
+
+    assert %Bumblebee.Text.Distilbert{architecture: :for_sequence_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0047, -0.0103]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DistilBertForTokenClassification"}
+             )
+
+    assert %Bumblebee.Text.Distilbert{architecture: :for_token_classification} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[-0.0504, -0.0751], [0.1354, 0.2180], [-0.0386, 0.1059]]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DistilBertForQuestionAnswering"}
+             )
+
+    assert %Bumblebee.Text.Distilbert{architecture: :for_question_answering} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
+
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[0.1790, -0.0074, 0.0412]]),
+      atol: 1.0e-4
+    )
+
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[-0.1520, -0.0973, 0.0166]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_multiple_choice" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DistilBertForMultipleChoice"}
+             )
+
+    assert %Bumblebee.Text.Distilbert{architecture: :for_multiple_choice} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]]),
+      "attention_mask" => Nx.tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 1}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0027]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/distilbert_tokenizer_test.exs b/test/bumblebee/text/distilbert_tokenizer_test.exs
index 5cf02e8d..f657ff9a 100644
--- a/test/bumblebee/text/distilbert_tokenizer_test.exs
+++ b/test/bumblebee/text/distilbert_tokenizer_test.exs
@@ -3,64 +3,52 @@ defmodule Bumblebee.Text.DistilbertTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert-base-uncased"})
-
-      assert %Bumblebee.Text.DistilbertTokenizer{} = tokenizer
-
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with [MASK].",
-          {"Question?", "Answer"}
-        ])
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert-base-uncased"})
+
+    assert %Bumblebee.Text.DistilbertTokenizer{} = tokenizer
+
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with [MASK].",
+        {"Question?", "Answer"}
+      ])
+
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [101, 3231, 6251, 2007, 103, 1012, 102],
+        [101, 3160, 1029, 102, 3437, 102, 0]
+      ])
+    )
+
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1],
+        [1, 1, 1, 1, 1, 1, 0]
+      ])
+    )
+  end
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [101, 3231, 6251, 2007, 103, 1012, 102],
-          [101, 3160, 1029, 102, 3437, 102, 0]
-        ])
-      )
+  test "with special tokens mask" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert-base-cased"})
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1],
-          [1, 1, 1, 1, 1, 1, 0]
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, ["Test sentence with [MASK]."],
+        return_special_tokens_mask: true
       )
-    end
 
-    test "encoding with special tokens mask" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert-base-cased"})
-
-      inputs =
-        Bumblebee.apply_tokenizer(
-          tokenizer,
-          [
-            "Test sentence with [MASK]."
-          ],
-          return_special_tokens_mask: true
-        )
-
-      assert_equal(inputs["special_tokens_mask"], Nx.tensor([[1, 0, 0, 0, 0, 0, 1]]))
-    end
+    assert_equal(inputs["special_tokens_mask"], Nx.tensor([[1, 0, 0, 0, 0, 0, 1]]))
+  end
 
-    test "encoding with offsets" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert-base-cased"})
+  test "with offsets" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert-base-cased"})
 
-      inputs =
-        Bumblebee.apply_tokenizer(
-          tokenizer,
-          [
-            "Test sentence with [MASK]."
-          ],
-          return_offsets: true
-        )
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, ["Test sentence with [MASK]."], return_offsets: true)
 
-      assert_equal(inputs["start_offsets"], Nx.tensor([[0, 0, 5, 14, 19, 25, 0]]))
-      assert_equal(inputs["end_offsets"], Nx.tensor([[0, 4, 13, 18, 25, 26, 0]]))
-    end
+    assert_equal(inputs["start_offsets"], Nx.tensor([[0, 0, 5, 14, 19, 25, 0]]))
+    assert_equal(inputs["end_offsets"], Nx.tensor([[0, 4, 13, 18, 25, 26, 0]]))
   end
 end
diff --git a/test/bumblebee/text/fill_mask_test.exs b/test/bumblebee/text/fill_mask_test.exs
index c76cfcd7..1f82d1d1 100644
--- a/test/bumblebee/text/fill_mask_test.exs
+++ b/test/bumblebee/text/fill_mask_test.exs
@@ -3,45 +3,43 @@ defmodule Bumblebee.Text.FillMaskTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
-
-  describe "integration" do
-    test "returns top scored tokens" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
-
-      serving = Bumblebee.Text.FillMask.fill_mask(model_info, tokenizer)
-
-      text = "The capital of [MASK] is Paris."
-
-      assert %{
-               predictions: [
-                 %{score: _, token: "france"},
-                 %{score: _, token: "brittany"},
-                 %{score: _, token: "algeria"},
-                 %{score: _, token: "department"},
-                 %{score: _, token: "reunion"}
-               ]
-             } = Nx.Serving.run(serving, text)
-    end
-
-    test "raises when there isn't exactly one mask token" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
-
-      serving = Bumblebee.Text.FillMask.fill_mask(model_info, tokenizer)
-
-      assert_raise ArgumentError,
-                   ~s/expected exactly one occurrence of [MASK], got: 0 in "The capital of France is Paris."/,
-                   fn ->
-                     Nx.Serving.run(serving, "The capital of France is Paris.")
-                   end
-
-      assert_raise ArgumentError,
-                   ~s/expected exactly one occurrence of [MASK], got: 2 in "The [MASK] of [MASK] is Paris."/,
-                   fn ->
-                     Nx.Serving.run(serving, "The [MASK] of [MASK] is Paris.")
-                   end
-    end
+  @moduletag serving_test_tags()
+
+  test "returns top scored tokens" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
+
+    serving = Bumblebee.Text.FillMask.fill_mask(model_info, tokenizer)
+
+    text = "The capital of [MASK] is Paris."
+
+    assert %{
+             predictions: [
+               %{score: _, token: "france"},
+               %{score: _, token: "brittany"},
+               %{score: _, token: "algeria"},
+               %{score: _, token: "department"},
+               %{score: _, token: "reunion"}
+             ]
+           } = Nx.Serving.run(serving, text)
+  end
+
+  test "raises when there isn't exactly one mask token" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"})
+
+    serving = Bumblebee.Text.FillMask.fill_mask(model_info, tokenizer)
+
+    assert_raise ArgumentError,
+                 ~s/expected exactly one occurrence of [MASK], got: 0 in "The capital of France is Paris."/,
+                 fn ->
+                   Nx.Serving.run(serving, "The capital of France is Paris.")
+                 end
+
+    assert_raise ArgumentError,
+                 ~s/expected exactly one occurrence of [MASK], got: 2 in "The [MASK] of [MASK] is Paris."/,
+                 fn ->
+                   Nx.Serving.run(serving, "The [MASK] of [MASK] is Paris.")
+                 end
   end
 end
diff --git a/test/bumblebee/text/generation_test.exs b/test/bumblebee/text/generation_test.exs
index 5131031a..23692e95 100644
--- a/test/bumblebee/text/generation_test.exs
+++ b/test/bumblebee/text/generation_test.exs
@@ -3,114 +3,113 @@ defmodule Bumblebee.Text.GenerationTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
-  describe "integration" do
-    test "generates text with greedy generation" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "facebook/bart-large-cnn"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-cnn"})
+  test "generates text with greedy generation" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "JulesBelveze/t5-small-headline-generator"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "JulesBelveze/t5-small-headline-generator"})
 
-      {:ok, generation_config} =
-        Bumblebee.load_generation_config({:hf, "facebook/bart-large-cnn"})
+    {:ok, generation_config} =
+      Bumblebee.load_generation_config({:hf, "JulesBelveze/t5-small-headline-generator"})
 
-      article = """
-      PG&E stated it scheduled the blackouts in response to forecasts for high \
-      winds amid dry conditions. The aim is to reduce the risk of wildfires. \
-      Nearly 800 thousand customers were scheduled to be affected by the shutoffs \
-      which were expected to last through at least midday tomorrow.
-      """
+    article = """
+    PG&E stated it scheduled the blackouts in response to forecasts for high \
+    winds amid dry conditions. The aim is to reduce the risk of wildfires. \
+    Nearly 800 thousand customers were scheduled to be affected by the shutoffs \
+    which were expected to last through at least midday tomorrow.
+    """
 
-      generation_config = Bumblebee.configure(generation_config, max_new_tokens: 8)
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 10)
 
-      serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
+    serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
 
-      assert %{results: [%{text: "PG&E scheduled the black"}]} = Nx.Serving.run(serving, article)
-    end
+    assert %{results: [%{text: "PG&E plans blackouts to reduce"}]} =
+             Nx.Serving.run(serving, article)
+  end
 
-    test "with :no_repeat_ngram_length" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "gpt2"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
+  test "with :no_repeat_ngram_length" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "gpt2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
 
-      generation_config =
-        Bumblebee.configure(generation_config, max_new_tokens: 12, no_repeat_ngram_length: 2)
+    generation_config =
+      Bumblebee.configure(generation_config, max_new_tokens: 12, no_repeat_ngram_length: 2)
 
-      serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
+    serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
 
-      # Without :no_repeat_ngram_length we get
-      # %{results: [%{text: "I was going to say, 'Well, I'm going to say,"}]}
+    # Without :no_repeat_ngram_length we get
+    # %{results: [%{text: "I was going to say, 'Well, I'm going to say,"}]}
 
-      assert %{results: [%{text: "I was going to say, 'Well, I'm going back to the"}]} =
-               Nx.Serving.run(serving, "I was going")
-    end
+    assert %{results: [%{text: "I was going to say, 'Well, I'm going back to the"}]} =
+             Nx.Serving.run(serving, "I was going")
+  end
 
-    test "sampling" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "gpt2"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
+  test "sampling" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "gpt2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
 
-      generation_config =
-        Bumblebee.configure(generation_config,
-          max_new_tokens: 12,
-          strategy: %{type: :multinomial_sampling}
-        )
+    generation_config =
+      Bumblebee.configure(generation_config,
+        max_new_tokens: 12,
+        strategy: %{type: :multinomial_sampling}
+      )
 
-      serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config, seed: 0)
+    serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config, seed: 0)
 
-      # Note that this is just a snapshot test, we do not use any
-      # reference value, because of PRNG difference
+    # Note that this is just a snapshot test; we do not use any
+    # reference value, because of PRNG differences
 
-      assert %{
-               results: [
-                 %{text: "I was going to fall asleep.\"\n\nThis is not Wallace's fifth"}
-               ]
-             } = Nx.Serving.run(serving, "I was going")
-    end
+    assert %{
+             results: [
+               %{text: "I was going to fall asleep.\"\n\nThis is not Wallace's fifth"}
+             ]
+           } = Nx.Serving.run(serving, "I was going")
+  end
 
-    test "contrastive search" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "gpt2"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
-      {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
+  test "contrastive search" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "gpt2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
+    {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "gpt2"})
 
-      generation_config =
-        Bumblebee.configure(generation_config,
-          max_new_tokens: 12,
-          strategy: %{type: :contrastive_search, top_k: 4, alpha: 0.6}
-        )
+    generation_config =
+      Bumblebee.configure(generation_config,
+        max_new_tokens: 12,
+        strategy: %{type: :contrastive_search, top_k: 4, alpha: 0.6}
+      )
 
-      serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
+    serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
 
-      assert %{results: [%{text: "I was going to say, 'Well, I don't know what you"}]} =
-               Nx.Serving.run(serving, "I was going")
-    end
+    assert %{results: [%{text: "I was going to say, 'Well, I don't know what you"}]} =
+             Nx.Serving.run(serving, "I was going")
+  end
 
-    test "streaming text chunks" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "facebook/bart-large-cnn"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-cnn"})
+  test "streaming text chunks" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "JulesBelveze/t5-small-headline-generator"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "JulesBelveze/t5-small-headline-generator"})
 
-      {:ok, generation_config} =
-        Bumblebee.load_generation_config({:hf, "facebook/bart-large-cnn"})
+    {:ok, generation_config} =
+      Bumblebee.load_generation_config({:hf, "JulesBelveze/t5-small-headline-generator"})
 
-      article = """
-      PG&E stated it scheduled the blackouts in response to forecasts for high \
-      winds amid dry conditions. The aim is to reduce the risk of wildfires. \
-      Nearly 800 thousand customers were scheduled to be affected by the shutoffs \
-      which were expected to last through at least midday tomorrow.
-      """
+    article = """
+    PG&E stated it scheduled the blackouts in response to forecasts for high \
+    winds amid dry conditions. The aim is to reduce the risk of wildfires. \
+    Nearly 800 thousand customers were scheduled to be affected by the shutoffs \
+    which were expected to last through at least midday tomorrow.
+    """
 
-      generation_config = Bumblebee.configure(generation_config, max_new_tokens: 8)
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 10)
 
-      serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config, stream: true)
+    serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config, stream: true)
 
-      stream = Nx.Serving.run(serving, article)
-      assert Enum.to_list(stream) == ["PG&E", " scheduled", " the", " black"]
+    stream = Nx.Serving.run(serving, article)
+    assert Enum.to_list(stream) == ["PG&E", " plans", " blackouts", " to", " reduce"]
 
-      # Raises when a batch is given
-      assert_raise ArgumentError,
-                   "serving only accepts singular input when stream is enabled, call the serving with each input in the batch separately",
-                   fn ->
-                     Nx.Serving.run(serving, [article])
-                   end
-    end
+    # Raises when a batch is given
+    assert_raise ArgumentError,
+                 "serving only accepts singular input when stream is enabled, call the serving with each input in the batch separately",
+                 fn ->
+                   Nx.Serving.run(serving, [article])
+                 end
   end
 end
diff --git a/test/bumblebee/text/gpt2_test.exs b/test/bumblebee/text/gpt2_test.exs
index 4baf8e8f..6a18038d 100644
--- a/test/bumblebee/text/gpt2_test.exs
+++ b/test/bumblebee/text/gpt2_test.exs
@@ -1,109 +1,101 @@
 defmodule Bumblebee.Text.Gpt2Test do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "gpt2"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPT2Model"})
 
-      assert %Bumblebee.Text.Gpt2{architecture: :base} = spec
+    assert %Bumblebee.Text.Gpt2{architecture: :base} = spec
 
-      input_ids = Nx.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    outputs = Axon.predict(model, params, inputs)
 
-      outputs = Axon.predict(model, params, inputs)
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert Nx.shape(outputs.hidden_state) == {1, 11, 768}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [-0.0436, 0.0046, -0.1025],
-            [-0.4822, 0.2564, 0.1926],
-            [-0.2747, 0.0428, -0.1841]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.8136, -0.2392, 0.2378], [0.9714, -0.4651, 0.8788], [-0.0980, 0.2294, -1.1416]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "causal language modeling" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "gpt2"})
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPT2LMHeadModel"})
 
-      assert %Bumblebee.Text.Gpt2{architecture: :for_causal_language_modeling} = spec
+    assert %Bumblebee.Text.Gpt2{architecture: :for_causal_language_modeling} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[15496, 11, 616, 3290, 318, 13779]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 6, 50257}
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [-114.5832, -116.5725, -116.0830],
-            [-89.8644, -93.1977, -94.4351],
-            [-88.3380, -92.8703, -94.4454]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([[[0.1184, -0.0259, 0.1688], [0.1064, 0.1412, 0.1120], [0.1421, -0.2010, 0.3757]]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "token classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "brad1141/gpt2-finetuned-comp2"})
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-GPT2ForTokenClassification"}
+             )
 
-      assert %Bumblebee.Text.Gpt2{architecture: :for_token_classification} = spec
+    assert %Bumblebee.Text.Gpt2{architecture: :for_token_classification} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[15496, 11, 616, 3290, 318, 13779]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 6, 7}
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.4187, 3.4156, -2.8762], [2.9556, 0.9153, -1.0290], [1.3047, 1.0234, -1.2765]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[0.0207, 0.1338], [-0.1582, -0.0384], [-0.2225, -0.0400]]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "sequence classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/DialogRPT-updown"})
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-GPT2ForSequenceClassification"}
+             )
 
-      assert %Bumblebee.Text.Gpt2{architecture: :for_sequence_classification} = spec
+    assert %Bumblebee.Text.Gpt2{architecture: :for_sequence_classification} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[15496, 11, 616, 3290, 318, 13779]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 1023, 1023]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 1}
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-1.2981]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0098, -0.0456]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/gpt2_tokenizer_test.exs b/test/bumblebee/text/gpt2_tokenizer_test.exs
index 24b2a7e3..90bd6031 100644
--- a/test/bumblebee/text/gpt2_tokenizer_test.exs
+++ b/test/bumblebee/text/gpt2_tokenizer_test.exs
@@ -3,30 +3,28 @@ defmodule Bumblebee.Text.Gpt2TokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "gpt2"})
 
-      assert %Bumblebee.Text.Gpt2Tokenizer{} = tokenizer
+    assert %Bumblebee.Text.Gpt2Tokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Hello World"
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Hello World"
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [15496, 2159]
-        ])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [15496, 2159]
+      ])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1]
-        ])
-      )
-    end
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1]
+      ])
+    )
   end
 end
diff --git a/test/bumblebee/text/gpt_big_code_test.exs b/test/bumblebee/text/gpt_big_code_test.exs
index 71feacfc..e2541cca 100644
--- a/test/bumblebee/text/gpt_big_code_test.exs
+++ b/test/bumblebee/text/gpt_big_code_test.exs
@@ -1,131 +1,132 @@
 defmodule Bumblebee.Text.GptBigCodeTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPTBigCodeModel"})
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPTBigCodeModel"})
 
-      assert %Bumblebee.Text.GptBigCode{architecture: :base} = spec
+    assert %Bumblebee.Text.GptBigCode{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[68, 69, 70, 266, 412, 8, 76, 396, 9, 26]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.8586, 0.3071, -0.3434], [-0.1530, 0.7143, -0.4393], [0.7845, 0.3625, -0.1734]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.8193, 0.5945, -0.2915], [0.0150, 0.4736, 0.5148], [-0.4247, -1.8000, -1.6479]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":base without multi-query attention" do
+    # We have a separate test for parameter loading without
+    # multi-query attention, because the parameter layout differs
+
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "bumblebee-testing/tiny-random-GPTBigCodeModel-multi_query-False"}
+             )
 
-    test "base model without multi-query attention" do
-      # We have a separate test to test parameter loading without
-      # multi-query attention, because the parameters layout differs
+    assert %Bumblebee.Text.GptBigCode{architecture: :base} = spec
 
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "jonatanklosko/tiny-random-GPTBigCodeModel-multi_query-False"}
-               )
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.GptBigCode{architecture: :base} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[68, 69, 70, 266, 412, 8, 76, 396, 9, 26]])
-      }
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-1.3966, 0.6641, -1.3937], [-0.5489, 0.3397, 0.4567], [-0.6488, -1.6745, -1.1570]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM"})
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-1.3692, -0.4104, -1.2525], [-1.1314, 0.3077, -1.2131], [-0.5550, -0.0240, -1.1081]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.GptBigCode{architecture: :for_causal_language_modeling} = spec
 
-    test "causal language modeling" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM"}
-               )
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.GptBigCode{architecture: :for_causal_language_modeling} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[68, 69, 70, 266, 412, 8, 76, 396, 9, 26]])
-      }
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.1509, -0.1751, 0.1848], [-0.0860, -0.2476, 0.3373], [-0.2671, -0.2028, -0.0896]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.logits) == {1, 10, 1024}
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-GPTBigCodeForTokenClassification"}
+             )
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.0105, -0.0399, 0.1105], [-0.0350, 0.0781, 0.2945], [-0.1949, -0.1349, 0.0651]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.GptBigCode{architecture: :for_token_classification} = spec
 
-    test "token classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "hf-internal-testing/tiny-random-GPTBigCodeForTokenClassification"}
-               )
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.GptBigCode{architecture: :for_token_classification} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[68, 69, 70, 266, 412, 8, 76, 396, 9, 26]])
-      }
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[-0.0775, -0.0276], [0.0634, 0.0396], [-0.0695, 0.1575]]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.logits) == {1, 10, 2}
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-GPTBigCodeForSequenceClassification"}
+             )
 
-      assert_all_close(
-        outputs.logits[[.., 1..3]],
-        Nx.tensor([[[0.0179, -0.1119], [-0.1250, -0.0535], [-0.1324, 0.0488]]]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.GptBigCode{architecture: :for_sequence_classification} = spec
 
-    test "sequence classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "hf-internal-testing/tiny-random-GPTBigCodeForSequenceClassification"}
-               )
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 1021, 1021]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.GptBigCode{architecture: :for_sequence_classification} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[68, 69, 70, 266, 412, 8, 76, 396, 9, 26]])
-      }
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 2}
-
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[0.1027, 0.2042]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.1722, 0.1999]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/gpt_neo_x_test.exs b/test/bumblebee/text/gpt_neo_x_test.exs
index a6e299b3..156c2ff0 100644
--- a/test/bumblebee/text/gpt_neo_x_test.exs
+++ b/test/bumblebee/text/gpt_neo_x_test.exs
@@ -1,109 +1,103 @@
 defmodule Bumblebee.Text.GptNeoXTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "seanmor5/tiny-gpt-neox-test"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPTNeoXModel"})
 
-      assert %Bumblebee.Text.GptNeoX{architecture: :base} = spec
+    assert %Bumblebee.Text.GptNeoX{architecture: :base} = spec
 
-      input_ids = Nx.tensor([[4, 928, 219, 10, 591, 1023]])
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    outputs = Axon.predict(model, params, inputs)
 
-      outputs = Axon.predict(model, params, inputs)
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert Nx.shape(outputs.hidden_state) == {1, 6, 32}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[1.4331, 0.7042, -2.8534], [1.4009, 1.5367, -0.8567], [0.7013, 1.5902, -1.4052]]
-        ]),
-        atol: 1.0e-2
-      )
-    end
-
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "hf-internal-testing/tiny-random-GPTNeoXForSequenceClassification"}
-               )
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.4428, 0.3349, -1.1917], [-0.1550, -0.4439, -0.5855], [0.3737, 3.4893, -0.6499]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.GptNeoX{architecture: :for_sequence_classification} = spec
-      input_ids = Nx.tensor([[4, 928, 219, 10, 591, 1023]])
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-GPTNeoXForSequenceClassification"}
+             )
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert %Bumblebee.Text.GptNeoX{architecture: :for_sequence_classification} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 2}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[0.0622, -0.0701]]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-    test "token classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "hf-internal-testing/tiny-random-GPTNeoXForTokenClassification"}
-               )
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.1089, -0.3733]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.GptNeoX{architecture: :for_token_classification} = spec
-      input_ids = Nx.tensor([[4, 928, 219, 10, 591, 1023]])
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-GPTNeoXForTokenClassification"}
+             )
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert %Bumblebee.Text.GptNeoX{architecture: :for_token_classification} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 6, 2}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits[[.., 1..3]],
-        Nx.tensor([[[0.0089, -0.0230], [-0.0282, 0.0478], [0.1127, -0.0674]]]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
 
-    test "causal language model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "seanmor5/tiny-gpt-neox-test"},
-                 architecture: :for_causal_language_modeling
-               )
+    assert_all_close(
+      outputs.logits[[.., 1..3//1, ..]],
+      Nx.tensor([[[-0.0900, -0.1853], [0.0567, -0.0443], [-0.0104, -0.1112]]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.GptNeoX{architecture: :for_causal_language_modeling} = spec
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPTNeoXForCausalLM"})
 
-      input_ids = Nx.tensor([[4, 928, 219, 10, 591, 1023]])
+    assert %Bumblebee.Text.GptNeoX{architecture: :for_causal_language_modeling} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 6, 1024}
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.0559, 0.1583, 0.0423], [0.0446, 0.0843, -0.0328], [0.1069, 0.0430, 0.0127]]
-        ]),
-        atol: 1.0e-2
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.1134, 0.0507, -0.0534], [-0.1113, 0.0035, -0.0319], [0.0019, -0.0273, -0.0151]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/layout_lm_tokenizer_test.exs b/test/bumblebee/text/layout_lm_tokenizer_test.exs
index 33d60133..18c9ab16 100644
--- a/test/bumblebee/text/layout_lm_tokenizer_test.exs
+++ b/test/bumblebee/text/layout_lm_tokenizer_test.exs
@@ -3,32 +3,30 @@ defmodule Bumblebee.Text.LayoutLmTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "microsoft/layoutlm-base-uncased"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "microsoft/layoutlm-base-uncased"})
 
-      assert %Bumblebee.Text.LayoutLmTokenizer{} = tokenizer
+    assert %Bumblebee.Text.LayoutLmTokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with [MASK].",
-          {"Question?", "Answer"}
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with [MASK].",
+        {"Question?", "Answer"}
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([[101, 3231, 6251, 2007, 103, 1012, 102], [101, 3160, 1029, 102, 3437, 102, 0]])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([[101, 3231, 6251, 2007, 103, 1012, 102], [101, 3160, 1029, 102, 3437, 102, 0]])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]])
-      )
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0]])
+    )
 
-      assert_equal(
-        inputs["token_type_ids"],
-        Nx.tensor([[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0]])
-      )
-    end
+    assert_equal(
+      inputs["token_type_ids"],
+      Nx.tensor([[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0]])
+    )
   end
 end
diff --git a/test/bumblebee/text/llama_test.exs b/test/bumblebee/text/llama_test.exs
index bab21ac3..07148f49 100644
--- a/test/bumblebee/text/llama_test.exs
+++ b/test/bumblebee/text/llama_test.exs
@@ -1,85 +1,79 @@
 defmodule Bumblebee.Text.LlamaTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "seanmor5/tiny-llama-test"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-LlamaModel"})
 
-      assert %Bumblebee.Text.Llama{architecture: :base} = spec
+    assert %Bumblebee.Text.Llama{architecture: :base} = spec
 
-      input_ids = Nx.tensor([[1, 15043, 3186, 825, 29915, 29879, 701]])
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    outputs = Axon.predict(model, params, inputs)
 
-      outputs = Axon.predict(model, params, inputs)
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert Nx.shape(outputs.hidden_state) == {1, 7, 32}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.4411, -1.9037, 0.9454], [0.8148, -1.4606, 0.0076], [0.9480, 0.6038, 0.1649]]
-        ]),
-        atol: 1.0e-2
-      )
-    end
-
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model(
-                 {:hf, "HuggingFaceH4/tiny-random-LlamaForSequenceClassification"}
-               )
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[1.4799, -2.0333, 0.4759], [2.3749, -0.8369, -0.0206], [0.5767, -0.0515, -1.1795]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Llama{architecture: :for_sequence_classification} = spec
-      input_ids = Nx.tensor([[1, 15043, 3186, 825, 29915, 29879, 701]])
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "bumblebee-testing/tiny-random-LlamaForSequenceClassification"}
+             )
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert %Bumblebee.Text.Llama{architecture: :for_sequence_classification} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 1}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-0.0977]]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-    test "causal language model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "seanmor5/tiny-llama-test"},
-                 architecture: :for_causal_language_modeling
-               )
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.1964, -0.1069]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Llama{architecture: :for_causal_language_modeling} = spec
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-LlamaForCausalLM"})
 
-      input_ids = Nx.tensor([[1, 15043, 3186, 825, 29915, 29879, 701]])
+    assert %Bumblebee.Text.Llama{architecture: :for_causal_language_modeling} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 7, 32000}
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.0592, 0.1188, -0.1214], [-0.0331, 0.0335, -0.1808], [-0.1825, -0.0711, 0.0497]]
-        ]),
-        atol: 1.0e-2
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0469, -0.0751, 0.0349], [0.0617, -0.1357, -0.0204], [-0.1495, 0.0557, -0.0737]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/llama_tokenizer_test.exs b/test/bumblebee/text/llama_tokenizer_test.exs
index 6713ea34..83330cfd 100644
--- a/test/bumblebee/text/llama_tokenizer_test.exs
+++ b/test/bumblebee/text/llama_tokenizer_test.exs
@@ -4,7 +4,7 @@ defmodule Bumblebee.Text.LlamaTokenizerTest do
   import Bumblebee.TestHelpers
 
   describe "integration" do
-    test "encoding model input" do
+    test "encodes text" do
       assert {:ok, tokenizer} =
                Bumblebee.load_tokenizer({:hf, "hf-internal-testing/llama-tokenizer"},
                  module: Bumblebee.Text.LlamaTokenizer
diff --git a/test/bumblebee/text/mbart_test.exs b/test/bumblebee/text/mbart_test.exs
index d797bf3f..a0e5123b 100644
--- a/test/bumblebee/text/mbart_test.exs
+++ b/test/bumblebee/text/mbart_test.exs
@@ -1,186 +1,160 @@
 defmodule Bumblebee.Text.MbartTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/mbart-large-cc25"},
-                 architecture: :base
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-MBartModel"})
 
-      assert %Bumblebee.Text.Mbart{architecture: :base} = spec
+    assert %Bumblebee.Text.Mbart{architecture: :base} = spec
 
-      input_ids = Nx.tensor([[35378, 4, 759, 10269, 83, 99942, 2, 250_004]])
-
-      inputs = %{
-        "input_ids" => input_ids
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.hidden_state) == {1, 8, 1024}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-2.8804, -4.7890, -1.7658], [-3.0863, -4.9929, -1.2588], [-2.6020, -5.3808, -0.6461]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "conditional generation model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/mbart-large-en-ro"},
-                 architecture: :for_conditional_generation,
-                 module: Bumblebee.Text.Mbart
-               )
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.Mbart{architecture: :for_conditional_generation} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      input_ids = Nx.tensor([[4828, 83, 70, 35166, 2, 250_004]])
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 16}
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.8300, -0.4815, 0.4641], [-1.6583, 0.9162, -0.3562], [-0.6983, -0.7699, 1.0282]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      outputs = Axon.predict(model, params, inputs)
+  test ":for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-MBartForConditionalGeneration"}
+             )
 
-      assert Nx.shape(outputs.logits) == {1, 6, 250_027}
+    assert %Bumblebee.Text.Mbart{architecture: :for_conditional_generation} = spec
 
-      assert_all_close(
-        outputs.logits[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [[3.6470, 11.0182, 3.5707], [3.5739, 7.6637, 1.8500], [3.2506, 8.7177, 2.7895]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-mbart"},
-                 architecture: :for_sequence_classification,
-                 module: Bumblebee.Text.Mbart
-               )
+    outputs = Axon.predict(model, params, inputs)
 
-      assert %Bumblebee.Text.Mbart{architecture: :for_sequence_classification} = spec
+    assert Nx.shape(outputs.logits) == {1, 10, 250_027}
 
-      input_ids = Nx.tensor([[157, 87, 21, 4, 44, 93, 43, 47, 70, 152, 16, 2, 1004]])
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([[[0.0000, 0.0923, 0.0841], [0.0000, 0.1023, -0.0938], [0.0000, 0.0703, 0.1231]]]),
+      atol: 1.0e-4
+    )
+  end
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-MBartForSequenceClassification"}
+             )
 
-      outputs = Axon.predict(model, params, inputs)
+    assert %Bumblebee.Text.Mbart{architecture: :for_sequence_classification} = spec
 
-      assert Nx.shape(outputs.logits) == {1, 2}
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 2, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])
+    }
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-0.0062, 0.0032]]),
-        atol: 1.0e-4
-      )
-    end
+    outputs = Axon.predict(model, params, inputs)
 
-    test "question answering model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-mbart"},
-                 architecture: :for_question_answering,
-                 module: Bumblebee.Text.Mbart
-               )
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-      assert %Bumblebee.Text.Mbart{architecture: :for_question_answering} = spec
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0085, 0.0054]]),
+      atol: 1.0e-4
+    )
+  end
 
-      input_ids = Nx.tensor([[8, 324, 53, 21, 22, 8, 338, 434, 157, 25, 7, 110, 153]])
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-MBartForQuestionAnswering"}
+             )
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert %Bumblebee.Text.Mbart{architecture: :for_question_answering} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.start_logits) == {1, 13}
-      assert Nx.shape(outputs.end_logits) == {1, 13}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.start_logits[[0, 1..3]],
-        Nx.tensor([-0.1411, 0.1579, 0.1181]),
-        atol: 1.0e-4
-      )
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
 
-      assert_all_close(
-        outputs.end_logits[[0, 1..3]],
-        Nx.tensor([-0.0198, -0.2103, -0.1095]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[0.1063, -0.1271, -0.1534]]),
+      atol: 1.0e-4
+    )
 
-    test "causal language model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/mbart-large-cc25"},
-                 architecture: :for_causal_language_modeling,
-                 module: Bumblebee.Text.Mbart
-               )
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[0.0268, 0.0238, 0.0857]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Mbart{architecture: :for_causal_language_modeling} = spec
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-MBartForCausalLM"})
 
-      input_ids = Nx.tensor([[35378, 4, 759, 10269, 83, 99942, 2, 250_004]])
+    assert %Bumblebee.Text.Mbart{architecture: :for_causal_language_modeling} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 8, 250_027}
+    assert Nx.shape(outputs.logits) == {1, 10, 250_027}
 
-      assert_all_close(
-        outputs.logits[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [-0.1630, 20.1722, 20.1680],
-          [-1.2354, 59.5818, 59.0031],
-          [-2.2185, 94.7050, 92.3012]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0000, -0.0236, -0.0043], [0.0000, -0.0101, 0.0510], [0.0000, 0.0404, 0.0327]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 
-  test "conditional generation" do
-    {:ok, model_info} =
-      Bumblebee.load_model({:hf, "facebook/mbart-large-en-ro"},
-        architecture: :for_conditional_generation,
-        module: Bumblebee.Text.Mbart
-      )
+  test "generation with :for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-MBartForConditionalGeneration"}
+             )
 
     {:ok, generation_config} =
-      Bumblebee.load_generation_config({:hf, "facebook/mbart-large-en-ro"},
-        spec_module: Bumblebee.Text.Mbart
+      Bumblebee.load_generation_config(
+        {:hf, "hf-internal-testing/tiny-random-MBartForConditionalGeneration"}
       )
 
-    assert %Bumblebee.Text.Mbart{architecture: :for_conditional_generation} = model_info.spec
+    assert %Bumblebee.Text.Mbart{architecture: :for_conditional_generation} = spec
 
     inputs = %{
-      "input_ids" => Nx.tensor([[4828, 83, 70, 35166, 2, 250_004]]),
-      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1]])
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
     }
 
-    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 5)
-
-    generate =
-      Bumblebee.Text.Generation.build_generate(
-        model_info.model,
-        model_info.spec,
-        generation_config
-      )
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 3)
 
-    token_ids = generate.(model_info.params, inputs)
+    generate = Bumblebee.Text.Generation.build_generate(model, spec, generation_config)
+    token_ids = generate.(params, inputs)
 
-    assert_equal(token_ids, Nx.tensor([[250_020, 4828, 473, 54051, 202, 2]]))
+    assert_equal(token_ids, Nx.tensor([[0, 230_521, 20386, 20386]]))
   end
 end
diff --git a/test/bumblebee/text/mbart_tokenizer_test.exs b/test/bumblebee/text/mbart_tokenizer_test.exs
index 6ee06858..59d85c5c 100644
--- a/test/bumblebee/text/mbart_tokenizer_test.exs
+++ b/test/bumblebee/text/mbart_tokenizer_test.exs
@@ -3,27 +3,25 @@ defmodule Bumblebee.Text.MbartTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/mbart-large-cc25"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/mbart-large-cc25"})
 
-      assert %Bumblebee.Text.MbartTokenizer{} = tokenizer
+    assert %Bumblebee.Text.MbartTokenizer{} = tokenizer
 
-      inputs = Bumblebee.apply_tokenizer(tokenizer, ["Hello, my dog is cute <mask>"])
+    inputs = Bumblebee.apply_tokenizer(tokenizer, ["Hello, my dog is cute <mask>"])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [35378, 4, 759, 10269, 83, 99942, 250_026, 2, 250_004]
-        ])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [35378, 4, 759, 10269, 83, 99942, 250_026, 2, 250_004]
+      ])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1, 1, 1]
-        ])
-      )
-    end
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1, 1, 1]
+      ])
+    )
   end
 end
diff --git a/test/bumblebee/text/mistral_test.exs b/test/bumblebee/text/mistral_test.exs
index 416c493a..1ad8657f 100644
--- a/test/bumblebee/text/mistral_test.exs
+++ b/test/bumblebee/text/mistral_test.exs
@@ -1,87 +1,79 @@
 defmodule Bumblebee.Text.MistralTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "echarlaix/tiny-random-mistral"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-MistralModel"})
 
-      assert %Bumblebee.Text.Mistral{architecture: :base} = spec
+    assert %Bumblebee.Text.Mistral{architecture: :base} = spec
 
-      input_ids = Nx.tensor([[1, 6312, 28709, 1526, 28808]])
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    outputs = Axon.predict(model, params, inputs)
 
-      outputs = Axon.predict(model, params, inputs)
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert Nx.shape(outputs.hidden_state) == {1, 5, 32}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [-1.1513, -0.3565, -1.3482],
-            [0.5468, 0.5652, -0.4141],
-            [-1.2177, -0.7919, -0.7064]
-          ]
-        ]),
-        atol: 1.0e-2
-      )
-    end
-
-    test "sequence classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "seanmor5/tiny-random-mistral-classification"})
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.9450, -1.3945, 0.7331], [-2.1118, -1.3091, -0.7834], [-1.7609, -1.3034, 1.0634]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Mistral{architecture: :for_sequence_classification} = spec
-      input_ids = Nx.tensor([[1, 6312, 28709, 1526]])
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-MistralForSequenceClassification"}
+             )
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    assert %Bumblebee.Text.Mistral{architecture: :for_sequence_classification} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 2}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[0.0255, 0.0318]]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-    test "causal language model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "echarlaix/tiny-random-mistral"},
-                 architecture: :for_causal_language_modeling
-               )
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0035, -0.0357]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.Mistral{architecture: :for_causal_language_modeling} = spec
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-MistralForCausalLM"})
 
-      input_ids = Nx.tensor([[1, 6312, 28709, 1526]])
+    assert %Bumblebee.Text.Mistral{architecture: :for_causal_language_modeling} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 4, 32000}
+    assert Nx.shape(outputs.logits) == {1, 10, 32000}
 
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.1156, 0.0420, -0.0609], [0.0333, 0.0376, -0.0531], [-0.0507, -0.0097, -0.0039]]
-        ]),
-        atol: 1.0e-2
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.1054, 0.0026, 0.0450], [0.1400, 0.1388, 0.0265], [0.0060, -0.1150, -0.1463]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/question_answering_test.exs b/test/bumblebee/text/question_answering_test.exs
index 7b128539..2ececd0d 100644
--- a/test/bumblebee/text/question_answering_test.exs
+++ b/test/bumblebee/text/question_answering_test.exs
@@ -3,46 +3,44 @@ defmodule Bumblebee.Text.QuestionAnsweringTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
-
-  describe "integration" do
-    test "returns the most probable answer" do
-      {:ok, roberta} = Bumblebee.load_model({:hf, "deepset/roberta-base-squad2"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
-
-      serving = Bumblebee.Text.question_answering(roberta, tokenizer)
-
-      input = %{question: "What's my name?", context: "My name is Sarah and I live in London."}
-
-      assert %{
-               results: [
-                 %{
-                   text: "Sarah",
-                   start: 11,
-                   end: 16,
-                   score: score
-                 }
-               ]
-             } = Nx.Serving.run(serving, input)
-
-      assert_all_close(score, 0.8105)
-    end
-
-    test "supports multiple inputs" do
-      {:ok, roberta} = Bumblebee.load_model({:hf, "deepset/roberta-base-squad2"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
-
-      serving = Bumblebee.Text.question_answering(roberta, tokenizer)
-
-      inputs = [
-        %{question: "What's my name?", context: "My name is Sarah and I live in London."},
-        %{question: "Where do I live?", context: "My name is Clara and I live in Berkeley."}
-      ]
-
-      assert [
-               %{results: [%{text: "Sarah", start: 11, end: 16, score: _}]},
-               %{results: [%{text: "Berkeley", start: 31, end: 39, score: _}]}
-             ] = Nx.Serving.run(serving, inputs)
-    end
+  @moduletag serving_test_tags()
+
+  test "returns the most probable answer" do
+    {:ok, roberta} = Bumblebee.load_model({:hf, "deepset/roberta-base-squad2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
+
+    serving = Bumblebee.Text.question_answering(roberta, tokenizer)
+
+    input = %{question: "What's my name?", context: "My name is Sarah and I live in London."}
+
+    assert %{
+             results: [
+               %{
+                 text: "Sarah",
+                 start: 11,
+                 end: 16,
+                 score: score
+               }
+             ]
+           } = Nx.Serving.run(serving, input)
+
+    assert_all_close(score, 0.8105)
+  end
+
+  test "supports multiple inputs" do
+    {:ok, roberta} = Bumblebee.load_model({:hf, "deepset/roberta-base-squad2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
+
+    serving = Bumblebee.Text.question_answering(roberta, tokenizer)
+
+    inputs = [
+      %{question: "What's my name?", context: "My name is Sarah and I live in London."},
+      %{question: "Where do I live?", context: "My name is Clara and I live in Berkeley."}
+    ]
+
+    assert [
+             %{results: [%{text: "Sarah", start: 11, end: 16, score: _}]},
+             %{results: [%{text: "Berkeley", start: 31, end: 39, score: _}]}
+           ] = Nx.Serving.run(serving, inputs)
   end
 end
diff --git a/test/bumblebee/text/roberta_test.exs b/test/bumblebee/text/roberta_test.exs
index 45617b9d..172f0c59 100644
--- a/test/bumblebee/text/roberta_test.exs
+++ b/test/bumblebee/text/roberta_test.exs
@@ -1,179 +1,184 @@
 defmodule Bumblebee.Text.RobertaTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "roberta-base"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-RobertaModel"})
 
-      assert %Bumblebee.Text.Roberta{architecture: :base} = spec
+    assert %Bumblebee.Text.Roberta{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 11, 768}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 0..2, 0..2]],
-        Nx.tensor([
-          [[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([[[-0.3633, 0.8947, 1.8965], [0.5881, 1.9730, 1.4211], [0.8067, 1.6098, 0.0291]]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_masked_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-RobertaForMaskedLM"})
+
+    assert %Bumblebee.Text.Roberta{architecture: :for_masked_language_modeling} = spec
+
+    # TODO: remove once we load tied embeddings
+    params = put_in(params["language_modeling_head.output"], params["embedder.token_embedding"])
 
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "roberta-base"})
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.Roberta{architecture: :for_masked_language_modeling} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-      }
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([[[0.0000, -0.0796, 0.1734], [0.0000, -0.0754, 0.0755], [0.0000, 0.0299, 0.1902]]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.logits) == {1, 11, 50265}
+  test ":for_sequence_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-RobertaForSequenceClassification"}
+             )
 
-      assert_all_close(
-        outputs.logits[[.., 0..2, 0..2]],
-        Nx.tensor([
-          [[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.Roberta{architecture: :for_sequence_classification} = spec
 
-    test "sequence classification" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "cardiffnlp/twitter-roberta-base-emotion"})
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.Roberta{architecture: :for_sequence_classification} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 31414, 6, 127, 2335, 16, 11962, 37, 11639, 1168, 2]])
-      }
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0032, 0.0017]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.logits) == {1, 4}
+  test ":for_token_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-RobertaForTokenClassification"}
+             )
 
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-1.3661, 3.0174, -0.9609, -0.4145]]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.Roberta{architecture: :for_token_classification} = spec
 
-    test "token classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "Jean-Baptiste/roberta-large-ner-english"})
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.Roberta{architecture: :for_token_classification} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[30581, 3923, 34892, 16, 10, 138, 716, 11, 2201, 8, 188, 469]])
-      }
+    assert Nx.shape(outputs.logits) == {1, 10, 2}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.logits[[.., 1..3, ..]],
+      Nx.tensor([[[-0.0276, 0.0128], [-0.1321, 0.0960], [0.1680, 0.0699]]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert Nx.shape(outputs.logits) == {1, 12, 5}
+  test ":for_question_answering" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-RobertaForQuestionAnswering"}
+             )
 
-      assert_all_close(
-        outputs.logits[[.., 0..2, 0..1]],
-        Nx.tensor([[[4.1969, -2.5614], [-1.4174, -0.6959], [-1.3807, 0.1313]]]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.Roberta{architecture: :for_question_answering} = spec
 
-    test "question answering model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "deepset/roberta-base-squad2"})
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "token_type_ids" => Nx.tensor([[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert %Bumblebee.Text.Roberta{architecture: :for_question_answering} = spec
+    outputs = Axon.predict(model, params, inputs)
 
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [0, 12375, 21, 2488, 289, 13919, 116, 2, 2, 24021, 289, 13919, 21, 10, 2579, 29771, 2]
-          ])
-      }
+    assert Nx.shape(outputs.start_logits) == {1, 10}
+    assert Nx.shape(outputs.end_logits) == {1, 10}
 
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      outputs.start_logits[[.., 1..3]],
+      Nx.tensor([[-0.1215, -0.1325, -0.1389]]),
+      atol: 1.0e-4
+    )
 
-      assert Nx.shape(outputs.start_logits) == {1, 17}
-      assert Nx.shape(outputs.end_logits) == {1, 17}
+    assert_all_close(
+      outputs.end_logits[[.., 1..3]],
+      Nx.tensor([[-0.2795, -0.0051, -0.1547]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_multiple_choice" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-RobertaForMultipleChoice"}
+             )
+
+    assert %Bumblebee.Text.Roberta{architecture: :for_multiple_choice} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]]),
+      "attention_mask" => Nx.tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]]),
+      "token_type_ids" => Nx.tensor([[[0, 0, 0, 0, 1, 1, 1, 1, 0, 0]]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 1}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0257]]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert_all_close(
-        outputs.start_logits[[.., 0..2]],
-        Nx.tensor([[0.5901, -8.3490, -8.8031]]),
-        atol: 1.0e-4
-      )
+  test ":for_causal_language_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-RobertaForCausalLM"})
 
-      assert_all_close(
-        outputs.end_logits[[.., 0..2]],
-        Nx.tensor([[1.1207, -7.5968, -7.6151]]),
-        atol: 1.0e-4
-      )
-    end
+    assert %Bumblebee.Text.Roberta{architecture: :for_causal_language_modeling} = spec
 
-    test "multiple choice model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "LIAMF-USP/aristo-roberta"})
+    # TODO: remove once we load tied embeddings
+    params = put_in(params["language_modeling_head.output"], params["embedder.token_embedding"])
 
-      assert %Bumblebee.Text.Roberta{architecture: :for_multiple_choice} = spec
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([
-            [[0, 38576, 103, 4437, 2, 2, 725, 895, 2], [0, 38576, 103, 4437, 2, 2, 487, 895, 2]]
-          ]),
-        "attention_mask" =>
-          Nx.tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 2}
-
-      assert_all_close(
-        outputs.logits,
-        Nx.tensor([[-13.9123, -13.4582]]),
-        atol: 1.0e-3
-      )
-    end
+    outputs = Axon.predict(model, params, inputs)
 
-    test "casual language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "roberta-base"},
-                 architecture: :for_causal_language_modeling
-               )
+    assert Nx.shape(outputs.logits) == {1, 10, 1024}
 
-      assert %Bumblebee.Text.Roberta{architecture: :for_causal_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 31414, 6, 127, 2335, 16, 11962, 2]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 8, 50265}
-
-      assert_all_close(
-        outputs.logits[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-3.3435, 32.1472, -3.5083], [-3.5373, 21.8191, -3.5197], [-4.2189, 22.5419, -3.9859]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]],
+      Nx.tensor([[[0.0000, 0.0661, -0.0063], [0.0000, 0.1107, -0.1137], [0.0000, 0.1044, 0.0803]]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/roberta_tokenizer_test.exs b/test/bumblebee/text/roberta_tokenizer_test.exs
index c43c2456..297d1658 100644
--- a/test/bumblebee/text/roberta_tokenizer_test.exs
+++ b/test/bumblebee/text/roberta_tokenizer_test.exs
@@ -3,41 +3,39 @@ defmodule Bumblebee.Text.RobertaTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
 
-      assert %Bumblebee.Text.RobertaTokenizer{} = tokenizer
+    assert %Bumblebee.Text.RobertaTokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with <mask>.",
-          {"Question?", "Answer"}
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with <mask>.",
+        {"Question?", "Answer"}
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([
-          [0, 34603, 3645, 19, 50264, 4, 2],
-          [0, 45641, 116, 2, 2, 33683, 2]
-        ])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([
+        [0, 34603, 3645, 19, 50264, 4, 2],
+        [0, 45641, 116, 2, 2, 33683, 2]
+      ])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([
-          [1, 1, 1, 1, 1, 1, 1],
-          [1, 1, 1, 1, 1, 1, 1]
-        ])
-      )
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([
+        [1, 1, 1, 1, 1, 1, 1],
+        [1, 1, 1, 1, 1, 1, 1]
+      ])
+    )
 
-      assert_equal(
-        inputs["token_type_ids"],
-        Nx.tensor([
-          [0, 0, 0, 0, 0, 0, 0],
-          [0, 0, 0, 0, 0, 0, 0]
-        ])
-      )
-    end
+    assert_equal(
+      inputs["token_type_ids"],
+      Nx.tensor([
+        [0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0]
+      ])
+    )
   end
 end
diff --git a/test/bumblebee/text/t5_test.exs b/test/bumblebee/text/t5_test.exs
index 441e9a6a..98775ce0 100644
--- a/test/bumblebee/text/t5_test.exs
+++ b/test/bumblebee/text/t5_test.exs
@@ -1,196 +1,200 @@
 defmodule Bumblebee.Text.T5Test do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "t5-small"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-T5Model"})
 
-      assert %Bumblebee.Text.T5{architecture: :base} = spec
+    assert %Bumblebee.Text.T5{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([[6536, 43, 118, 2008, 24, 293, 53, 3, 9, 1782, 19, 207, 21, 25, 1]]),
-        "decoder_input_ids" => Nx.tensor([[0, 6536, 504, 24]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 4, 512}
+    assert Nx.shape(outputs.hidden_state) == {1, 8, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.1380, -0.0321, 0.0281], [0.0637, 0.0025, 0.0985], [-0.0019, 0.1075, 0.1575]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]] |> Nx.multiply(100),
+      Nx.tensor([
+        [[-0.0353, -0.2614, -0.0219], [0.0829, 0.0845, -0.1971], [-0.0208, -0.0795, -0.0401]]
+      ]),
+      atol: 1.0e-4
+    )
 
-    test "base model (gated activation)" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "google/flan-t5-small"}, architecture: :base)
+    assert_all_close(Nx.sum(outputs.hidden_state), -0.0235, atol: 1.0e-4)
+  end
 
-      assert %Bumblebee.Text.T5{architecture: :base} = spec
+  test ":base with gated feed-forward activation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "bumblebee-testing/tiny-random-T5Model-feed_forward_proj-gated"}
+             )
 
-      inputs = %{
-        "input_ids" =>
-          Nx.tensor([[6536, 43, 118, 2008, 24, 293, 53, 3, 9, 1782, 19, 207, 21, 25, 1]]),
-        "decoder_input_ids" => Nx.tensor([[0, 6536, 504, 24]])
-      }
+    assert %Bumblebee.Text.T5{architecture: :base} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.hidden_state) == {1, 4, 512}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.1101, 0.0512, 0.1005], [0.0091, -0.0398, 0.0895], [-0.1061, -0.0152, 0.0702]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "conditional generation model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "t5-small"},
-                 architecture: :for_conditional_generation
-               )
-
-      assert %Bumblebee.Text.T5{architecture: :for_conditional_generation} = spec
-
-      input_ids = Nx.tensor([[37, 32099, 10681, 16, 32098, 2447, 1]])
-      decoder_input_ids = Nx.tensor([[32099, 5295, 1782, 32098, 8, 32097, 1]])
-
-      inputs = %{
-        "input_ids" => input_ids,
-        "decoder_input_ids" => decoder_input_ids
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 7, 32128}
-
-      assert_all_close(
-        outputs.logits[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [
-            [-11.7720, -12.8368, -9.6471],
-            [-10.6815, -11.4800, -8.5046],
-            [-15.8921, -15.2948, -8.4964]
-          ]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.hidden_state) == {1, 8, 32}
 
-    test "conditional generation model (tied embeddings)" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "google/flan-t5-small"})
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]] |> Nx.multiply(100),
+      Nx.tensor([
+        [[-0.0353, -0.2614, -0.0219], [0.0829, 0.0845, -0.1971], [-0.0208, -0.0795, -0.0401]]
+      ]),
+      atol: 1.0e-4
+    )
 
-      assert %Bumblebee.Text.T5{architecture: :for_conditional_generation} = spec
+    assert_all_close(Nx.sum(outputs.hidden_state), -0.0235, atol: 1.0e-4)
+  end
 
-      input_ids = Nx.tensor([[37, 32099, 10681, 16, 32098, 2447, 1]])
-      decoder_input_ids = Nx.tensor([[32099, 5295, 1782, 32098, 8, 32097, 1]])
+  test ":for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-T5ForConditionalGeneration"}
+             )
 
-      inputs = %{
-        "input_ids" => input_ids,
-        "decoder_input_ids" => decoder_input_ids
-      }
+    assert %Bumblebee.Text.T5{architecture: :for_conditional_generation} = spec
 
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      assert Nx.shape(outputs.logits) == {1, 7, 32128}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        outputs.logits[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [2.7100, -3.0434, 1.2578],
-          [3.1423, -3.6663, 1.2443],
-          [1.0911, -3.8732, 0.5008]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.logits) == {1, 8, 32100}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]] |> Nx.multiply(10_000),
+      Nx.tensor([[[-0.0158, 0.0067, 0.0636], [0.0128, 0.0742, -0.0398], [0.0050, 0.0554, 0.0083]]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "encoder model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "t5-small"},
-                 architecture: :encoder
-               )
+  test ":for_conditional_generation without tied embeddings" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf,
+                "bumblebee-testing/tiny-random-T5ForConditionalGeneration-tie_word_embeddings-False"}
+             )
+
+    assert %Bumblebee.Text.T5{architecture: :for_conditional_generation} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+      "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]),
+      "decoder_attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 8, 32100}
+
+    assert_all_close(
+      outputs.logits[[.., 1..3, 1..3]] |> Nx.multiply(10_000),
+      Nx.tensor([
+        [[0.0537, -0.0358, -0.2016], [0.0580, 0.2900, -0.0393], [0.0194, 0.0153, -0.0144]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-      assert %Bumblebee.Text.T5{architecture: :encoder} = spec
+  test ":encoder" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-T5Model"},
+               architecture: :encoder
+             )
 
-      input_ids = Nx.tensor([[37, 32099, 10681, 16, 32098, 2447, 1]])
+    assert %Bumblebee.Text.T5{architecture: :encoder} = spec
 
-      inputs = %{
-        "input_ids" => input_ids
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 7, 512}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[0, 1..3, 1..3]],
-        Nx.tensor([
-          [[0.0713, -0.1633, -0.0978], [-0.0314, -0.3135, -0.1801], [-0.2863, 0.0751, -0.0536]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.0034, -0.0005, -0.0036], [-0.0002, 0.0029, 0.0021], [-0.0011, -0.0004, -0.0034]]
+      ]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "text generation" do
-      assert {:ok, model_info} = Bumblebee.load_model({:hf, "t5-small"})
-      assert {:ok, generation_config} = Bumblebee.load_generation_config({:hf, "t5-small"})
+  test "generation with :for_conditional_generation" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-T5ForConditionalGeneration"}
+             )
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[13959, 1566, 12, 2968, 10, 571, 625, 33, 25, 58, 1]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
+    assert {:ok, generation_config} =
+             Bumblebee.load_generation_config(
+               {:hf, "hf-internal-testing/tiny-random-T5ForConditionalGeneration"}
+             )
 
-      generation_config = Bumblebee.configure(generation_config, max_new_tokens: 5)
+    assert %Bumblebee.Text.T5{architecture: :for_conditional_generation} = spec
 
-      generate =
-        Bumblebee.Text.Generation.build_generate(
-          model_info.model,
-          model_info.spec,
-          generation_config
-        )
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      token_ids = generate.(model_info.params, inputs)
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 3)
 
-      assert_equal(token_ids, Nx.tensor([[0, 2739, 4445, 436, 292, 58]]))
-    end
+    generate = Bumblebee.Text.Generation.build_generate(model, spec, generation_config)
+    token_ids = generate.(params, inputs)
 
-    test "text generation (tied embeddings)" do
-      assert {:ok, model_info} = Bumblebee.load_model({:hf, "google/flan-t5-small"})
+    assert_equal(token_ids, Nx.tensor([[0, 0, 0, 0]]))
+  end
 
-      assert {:ok, generation_config} =
-               Bumblebee.load_generation_config({:hf, "google/flan-t5-small"})
+  test "generation with :for_conditional_generation without tied embeddings" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf,
+                "bumblebee-testing/tiny-random-T5ForConditionalGeneration-tie_word_embeddings-False"}
+             )
+
+    assert {:ok, generation_config} =
+             Bumblebee.load_generation_config(
+               {:hf,
+                "bumblebee-testing/tiny-random-T5ForConditionalGeneration-tie_word_embeddings-False"}
+             )
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[13959, 1566, 12, 2968, 10, 571, 625, 33, 25, 58, 1]]),
-        "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      }
+    assert %Bumblebee.Text.T5{architecture: :for_conditional_generation} = spec
 
-      generation_config = Bumblebee.configure(generation_config, max_new_tokens: 5)
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      generate =
-        Bumblebee.Text.Generation.build_generate(
-          model_info.model,
-          model_info.spec,
-          generation_config
-        )
+    generation_config = Bumblebee.configure(generation_config, max_new_tokens: 3)
 
-      token_ids = generate.(model_info.params, inputs)
+    generate = Bumblebee.Text.Generation.build_generate(model, spec, generation_config)
+    token_ids = generate.(params, inputs)
 
-      assert_equal(token_ids, Nx.tensor([[0, 2739, 3, 362, 3, 49]]))
-    end
+    assert_equal(token_ids, Nx.tensor([[0, 6161, 29516, 9788]]))
   end
 end
diff --git a/test/bumblebee/text/t5_tokenizer_test.exs b/test/bumblebee/text/t5_tokenizer_test.exs
index 5a2c56fd..c6a2980a 100644
--- a/test/bumblebee/text/t5_tokenizer_test.exs
+++ b/test/bumblebee/text/t5_tokenizer_test.exs
@@ -3,24 +3,22 @@ defmodule Bumblebee.Text.T5TokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "t5-small"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "t5-small"})
 
-      assert %Bumblebee.Text.T5Tokenizer{} = tokenizer
+    assert %Bumblebee.Text.T5Tokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, ["translate English to German: How old are you?"])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, ["translate English to German: How old are you?"])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([[13959, 1566, 12, 2968, 10, 571, 625, 33, 25, 58, 1]])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([[13959, 1566, 12, 2968, 10, 571, 625, 33, 25, 58, 1]])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
-      )
-    end
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
+    )
   end
 end
diff --git a/test/bumblebee/text/text_classification_test.exs b/test/bumblebee/text/text_classification_test.exs
index 829d2bb6..015783c2 100644
--- a/test/bumblebee/text/text_classification_test.exs
+++ b/test/bumblebee/text/text_classification_test.exs
@@ -3,25 +3,23 @@ defmodule Bumblebee.Text.TextClassificationTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
-  describe "integration" do
-    test "returns top scored labels" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "cardiffnlp/twitter-roberta-base-emotion"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
+  test "returns top scored labels" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "cardiffnlp/twitter-roberta-base-emotion"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "roberta-base"})
 
-      serving = Bumblebee.Text.TextClassification.text_classification(model_info, tokenizer)
+    serving = Bumblebee.Text.TextClassification.text_classification(model_info, tokenizer)
 
-      text = "Cats are cute."
+    text = "Cats are cute."
 
-      assert %{
-               predictions: [
-                 %{label: "optimism", score: _},
-                 %{label: "sadness", score: _},
-                 %{label: "anger", score: _},
-                 %{label: "joy", score: _}
-               ]
-             } = Nx.Serving.run(serving, text)
-    end
+    assert %{
+             predictions: [
+               %{label: "optimism", score: _},
+               %{label: "sadness", score: _},
+               %{label: "anger", score: _},
+               %{label: "joy", score: _}
+             ]
+           } = Nx.Serving.run(serving, text)
   end
 end
diff --git a/test/bumblebee/text/text_embedding_test.exs b/test/bumblebee/text/text_embedding_test.exs
index 150eb723..21fb8747 100644
--- a/test/bumblebee/text/text_embedding_test.exs
+++ b/test/bumblebee/text/text_embedding_test.exs
@@ -3,105 +3,103 @@ defmodule Bumblebee.Text.TextEmbeddingTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
-  describe "integration" do
-    test "returns E5 embedding for a piece of text" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-large"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-large"})
+  test "returns embedding for a piece of text" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-small-v2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-small-v2"})
 
-      serving = Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer)
+    serving = Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer)
 
-      text = "query: Cats are cute."
+    text = "query: Cats are cute."
 
-      assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, text)
+    assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, text)
 
-      assert Nx.shape(embedding) == {1024}
+    assert Nx.shape(embedding) == {384}
 
-      assert_all_close(
-        embedding[1..3],
-        Nx.tensor([-0.9815, -0.5015, 0.9868]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      embedding[1..3],
+      Nx.tensor([0.0420, -0.0188, 0.1115]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "returns normalized E5 embedding for a piece of text" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-large"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-large"})
+  test "returns normalized embedding for a piece of text" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-small-v2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-small-v2"})
 
-      options = [embedding_processor: :l2_norm]
+    options = [embedding_processor: :l2_norm]
 
-      serving = Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer, options)
+    serving = Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer, options)
 
-      text = "query: Cats are cute."
+    text = "query: Cats are cute."
 
-      assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, text)
+    assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, text)
 
-      assert Nx.shape(embedding) == {1024}
+    assert Nx.shape(embedding) == {384}
 
-      assert_all_close(
-        embedding[1..3],
-        Nx.tensor([-0.0459, -0.0234, 0.0461]),
-        atol: 1.0e-4
-      )
+    assert_all_close(
+      embedding[1..3],
+      Nx.tensor([0.0433, -0.0194, 0.1151]),
+      atol: 1.0e-4
+    )
 
-      assert_all_close(Nx.sum(Nx.pow(embedding, 2)), Nx.tensor(1.0), atol: 1.0e-6)
-    end
+    assert_all_close(Nx.sum(Nx.pow(embedding, 2)), Nx.tensor(1.0), atol: 1.0e-6)
+  end
 
-    test "supports compilation for single or multiple sequence lengths" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-large"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-large"})
+  test "supports compilation for single or multiple sequence lengths" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-small-v2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-small-v2"})
 
-      serving_short =
-        Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
-          compile: [batch_size: 1, sequence_length: 8]
-        )
+    serving_short =
+      Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
+        compile: [batch_size: 1, sequence_length: 8]
+      )
 
-      serving_long =
-        Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
-          compile: [batch_size: 1, sequence_length: 16]
-        )
+    serving_long =
+      Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
+        compile: [batch_size: 1, sequence_length: 16]
+      )
 
-      serving_both =
-        Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
-          compile: [batch_size: 1, sequence_length: [8, 16]]
-        )
+    serving_both =
+      Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
+        compile: [batch_size: 1, sequence_length: [8, 16]]
+      )
 
-      short_text = "short text"
-      long_text = "definitely much longer text that should exceed 16 tokens"
+    short_text = "short text"
+    long_text = "definitely much longer text that should exceed 16 tokens"
 
-      assert %{embedding: embedding_short} = Nx.Serving.run(serving_short, short_text)
-      assert %{embedding: embedding_long} = Nx.Serving.run(serving_long, long_text)
+    assert %{embedding: embedding_short} = Nx.Serving.run(serving_short, short_text)
+    assert %{embedding: embedding_long} = Nx.Serving.run(serving_long, long_text)
 
-      assert %{embedding: embedding_short2} = Nx.Serving.run(serving_both, short_text)
-      assert %{embedding: embedding_long2} = Nx.Serving.run(serving_both, long_text)
+    assert %{embedding: embedding_short2} = Nx.Serving.run(serving_both, short_text)
+    assert %{embedding: embedding_long2} = Nx.Serving.run(serving_both, long_text)
 
-      assert_equal(embedding_short, embedding_short2)
-      assert_equal(embedding_long, embedding_long2)
-    end
+    assert_equal(embedding_short, embedding_short2)
+    assert_equal(embedding_long, embedding_long2)
+  end
 
-    @tag :multi_device
-    test "works with partitioned serving", %{test: test} do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-large"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-large"})
+  @tag :multi_device
+  test "works with partitioned serving", %{test: test} do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "intfloat/e5-small-v2"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "intfloat/e5-small-v2"})
 
-      serving =
-        Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
-          compile: [batch_size: 1, sequence_length: 16],
-          defn_options: [compiler: EXLA, client: :other_host],
-          preallocate_params: true
-        )
+    serving =
+      Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
+        compile: [batch_size: 1, sequence_length: 16],
+        defn_options: [compiler: EXLA, client: :other_host],
+        preallocate_params: true
+      )
 
-      start_supervised!({Nx.Serving, serving: serving, name: test, partitions: true})
+    start_supervised!({Nx.Serving, serving: serving, name: test, partitions: true})
 
-      text = "query: Cats are cute."
+    text = "query: Cats are cute."
 
-      assert [
-               %{embedding: %Nx.Tensor{} = embedding1},
-               %{embedding: %Nx.Tensor{} = embedding2}
-             ] = Nx.Serving.batched_run(test, [text, text])
+    assert [
+             %{embedding: %Nx.Tensor{} = embedding1},
+             %{embedding: %Nx.Tensor{} = embedding2}
+           ] = Nx.Serving.batched_run(test, [text, text])
 
-      assert_equal(embedding1, embedding2)
-    end
+    assert_equal(embedding1, embedding2)
   end
 end
diff --git a/test/bumblebee/text/token_classification_test.exs b/test/bumblebee/text/token_classification_test.exs
index df3e830f..83b95cf3 100644
--- a/test/bumblebee/text/token_classification_test.exs
+++ b/test/bumblebee/text/token_classification_test.exs
@@ -3,129 +3,127 @@ defmodule Bumblebee.Text.TokenClassificationTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
+
+  test "correctly extracts entities with :same aggregation" do
+    assert {:ok, model_info} = Bumblebee.load_model({:hf, "dslim/bert-base-NER"})
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
+
+    serving =
+      Bumblebee.Text.TokenClassification.token_classification(model_info, tokenizer,
+        aggregation: :same
+      )
+
+    text = "I went with Jane Doe to Atlanta and we talked to John Smith about Microsoft"
+
+    assert %{entities: [jane, atlanta, john, microsoft]} = Nx.Serving.run(serving, text)
+
+    assert %{
+             label: "PER",
+             score: _jane_score,
+             phrase: "Jane Doe",
+             start: 12,
+             end: 20
+           } = jane
+
+    assert %{
+             label: "LOC",
+             score: _atlanta_score,
+             phrase: "Atlanta",
+             start: 24,
+             end: 31
+           } = atlanta
+
+    assert %{
+             label: "PER",
+             score: _john_score,
+             phrase: "John Smith",
+             start: 49,
+             end: 59
+           } = john
+
+    assert %{
+             label: "ORG",
+             score: _microsoft_score,
+             phrase: "Microsoft",
+             start: 66,
+             end: 75
+           } = microsoft
+
+    # Offsets should be expressed in terms of bytes (note that é is 2 bytes)
+
+    text = "Jane é John"
+
+    assert %{
+             entities: [%{start: 0, end: 4}, %{start: 8, end: 12}]
+           } = Nx.Serving.run(serving, text)
+  end
 
-  describe "integration" do
-    test "correctly extracts entities with :same aggregation" do
+  for aggregation <- [:word_first, :word_max, :word_average] do
+    test "correctly extracts entities with :#{aggregation} aggregation" do
       assert {:ok, model_info} = Bumblebee.load_model({:hf, "dslim/bert-base-NER"})
       assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
 
       serving =
         Bumblebee.Text.TokenClassification.token_classification(model_info, tokenizer,
-          aggregation: :same
+          aggregation: unquote(aggregation)
         )
 
-      text = "I went with Jane Doe to Atlanta and we talked to John Smith about Microsoft"
+      text = "I went with Janine Doe to Atlanta and we talked to John Smith about Microsoft"
 
       assert %{entities: [jane, atlanta, john, microsoft]} = Nx.Serving.run(serving, text)
 
       assert %{
                label: "PER",
-               score: _jane_score,
-               phrase: "Jane Doe",
+               score: _janine_score,
+               phrase: "Janine Doe",
                start: 12,
-               end: 20
+               end: 22
              } = jane
 
       assert %{
                label: "LOC",
                score: _atlanta_score,
                phrase: "Atlanta",
-               start: 24,
-               end: 31
+               start: 26,
+               end: 33
              } = atlanta
 
       assert %{
                label: "PER",
                score: _john_score,
                phrase: "John Smith",
-               start: 49,
-               end: 59
+               start: 51,
+               end: 61
              } = john
 
       assert %{
                label: "ORG",
                score: _microsoft_score,
                phrase: "Microsoft",
-               start: 66,
-               end: 75
+               start: 68,
+               end: 77
              } = microsoft
-
-      # Offsets should be expressed in terms of bytes (note that é is 2 bytes)
-
-      text = "Jane é John"
-
-      assert %{
-               entities: [%{start: 0, end: 4}, %{start: 8, end: 12}]
-             } = Nx.Serving.run(serving, text)
-    end
-
-    for aggregation <- [:word_first, :word_max, :word_average] do
-      test "correctly extracts entities with :#{aggregation} aggregation" do
-        assert {:ok, model_info} = Bumblebee.load_model({:hf, "dslim/bert-base-NER"})
-        assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
-
-        serving =
-          Bumblebee.Text.TokenClassification.token_classification(model_info, tokenizer,
-            aggregation: unquote(aggregation)
-          )
-
-        text = "I went with Janine Doe to Atlanta and we talked to John Smith about Microsoft"
-
-        assert %{entities: [jane, atlanta, john, microsoft]} = Nx.Serving.run(serving, text)
-
-        assert %{
-                 label: "PER",
-                 score: _janine_score,
-                 phrase: "Janine Doe",
-                 start: 12,
-                 end: 22
-               } = jane
-
-        assert %{
-                 label: "LOC",
-                 score: _atlanta_score,
-                 phrase: "Atlanta",
-                 start: 26,
-                 end: 33
-               } = atlanta
-
-        assert %{
-                 label: "PER",
-                 score: _john_score,
-                 phrase: "John Smith",
-                 start: 51,
-                 end: 61
-               } = john
-
-        assert %{
-                 label: "ORG",
-                 score: _microsoft_score,
-                 phrase: "Microsoft",
-                 start: 68,
-                 end: 77
-               } = microsoft
-      end
     end
+  end
 
-    test "correctly extracts entities with simple aggregation on batched input" do
-      assert {:ok, model_info} = Bumblebee.load_model({:hf, "dslim/bert-base-NER"})
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
+  test "correctly extracts entities with simple aggregation on batched input" do
+    assert {:ok, model_info} = Bumblebee.load_model({:hf, "dslim/bert-base-NER"})
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-cased"})
 
-      serving =
-        Bumblebee.Text.TokenClassification.token_classification(model_info, tokenizer,
-          aggregation: :same
-        )
+    serving =
+      Bumblebee.Text.TokenClassification.token_classification(model_info, tokenizer,
+        aggregation: :same
+      )
 
-      texts = [
-        "I went with Janine Doe to Atlanta and we talked to John Smith about Microsoft",
-        "John went to Philadelphia"
-      ]
+    texts = [
+      "I went with Janine Doe to Atlanta and we talked to John Smith about Microsoft",
+      "John went to Philadelphia"
+    ]
 
-      assert [_first, %{entities: [john, philadelphia]}] = Nx.Serving.run(serving, texts)
+    assert [_first, %{entities: [john, philadelphia]}] = Nx.Serving.run(serving, texts)
 
-      assert %{label: "PER", phrase: "John"} = john
-      assert %{label: "LOC", phrase: "Philadelphia"} = philadelphia
-    end
+    assert %{label: "PER", phrase: "John"} = john
+    assert %{label: "LOC", phrase: "Philadelphia"} = philadelphia
   end
 end
diff --git a/test/bumblebee/text/whisper_tokenizer_test.exs b/test/bumblebee/text/whisper_tokenizer_test.exs
index 75390c5a..d133808a 100644
--- a/test/bumblebee/text/whisper_tokenizer_test.exs
+++ b/test/bumblebee/text/whisper_tokenizer_test.exs
@@ -3,16 +3,14 @@ defmodule Bumblebee.Text.WhisperTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
 
-      assert %Bumblebee.Text.WhisperTokenizer{} = tokenizer
+    assert %Bumblebee.Text.WhisperTokenizer{} = tokenizer
 
-      inputs = Bumblebee.apply_tokenizer(tokenizer, ["Hello world"])
+    inputs = Bumblebee.apply_tokenizer(tokenizer, ["Hello world"])
 
-      assert_equal(inputs["input_ids"], Nx.tensor([[50258, 50363, 15947, 1002, 50257]]))
-      assert_equal(inputs["attention_mask"], Nx.tensor([[1, 1, 1, 1, 1]]))
-    end
+    assert_equal(inputs["input_ids"], Nx.tensor([[50258, 50363, 15947, 1002, 50257]]))
+    assert_equal(inputs["attention_mask"], Nx.tensor([[1, 1, 1, 1, 1]]))
   end
 end
diff --git a/test/bumblebee/text/xlm_roberta_test.exs b/test/bumblebee/text/xlm_roberta_test.exs
index 4df656a9..b58913ae 100644
--- a/test/bumblebee/text/xlm_roberta_test.exs
+++ b/test/bumblebee/text/xlm_roberta_test.exs
@@ -1,55 +1,31 @@
 defmodule Bumblebee.Text.XlmRobertaTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "xlm-roberta-base"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-XLMRobertaModel"})
 
-      assert %Bumblebee.Text.Roberta{architecture: :base} = spec
+    assert %Bumblebee.Text.Roberta{architecture: :base} = spec
 
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 581, 10323, 111, 9942, 83, 250_001, 6, 5, 2]])
-      }
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 10, 768}
+    assert Nx.shape(outputs.hidden_state) == {1, 10, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 0..2, 0..2]],
-        Nx.tensor([
-          [[0.4921, 0.3050, 0.1307], [-0.0038, -0.0187, -0.0312], [0.0248, -0.0300, 0.0382]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked language modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "xlm-roberta-base"})
-
-      assert %Bumblebee.Text.Roberta{architecture: :for_masked_language_modeling} = spec
-
-      inputs = %{
-        "input_ids" => Nx.tensor([[0, 581, 10323, 111, 9942, 83, 250_001, 6, 5, 2]])
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 10, 250_002}
-
-      assert_all_close(
-        outputs.logits[[.., 0..2, 0..2]],
-        Nx.tensor([
-          [[64.3345, 0.1994, 38.5827], [28.9445, -1.5083, 73.2020], [21.0732, -1.0673, 52.7042]]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-0.6455, -0.4189, 0.3424], [-0.4303, -0.6731, 0.2534], [-0.5240, 0.0864, -0.5632]]
+      ]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/text/xlm_roberta_tokenizer_test.exs b/test/bumblebee/text/xlm_roberta_tokenizer_test.exs
index 7e85a6fd..a207c3f8 100644
--- a/test/bumblebee/text/xlm_roberta_tokenizer_test.exs
+++ b/test/bumblebee/text/xlm_roberta_tokenizer_test.exs
@@ -3,32 +3,30 @@ defmodule Bumblebee.Text.XlmRobertaTokenizerTest do
 
   import Bumblebee.TestHelpers
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "xlm-roberta-base"})
+  test "encodes text" do
+    assert {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "xlm-roberta-base"})
 
-      assert %Bumblebee.Text.XlmRobertaTokenizer{} = tokenizer
+    assert %Bumblebee.Text.XlmRobertaTokenizer{} = tokenizer
 
-      inputs =
-        Bumblebee.apply_tokenizer(tokenizer, [
-          "Test sentence with <mask>.",
-          {"Question?", "Answer"}
-        ])
+    inputs =
+      Bumblebee.apply_tokenizer(tokenizer, [
+        "Test sentence with <mask>.",
+        {"Question?", "Answer"}
+      ])
 
-      assert_equal(
-        inputs["input_ids"],
-        Nx.tensor([[0, 8647, 149_357, 678, 250_001, 6, 5, 2], [0, 68185, 32, 2, 2, 130_373, 2, 1]])
-      )
+    assert_equal(
+      inputs["input_ids"],
+      Nx.tensor([[0, 8647, 149_357, 678, 250_001, 6, 5, 2], [0, 68185, 32, 2, 2, 130_373, 2, 1]])
+    )
 
-      assert_equal(
-        inputs["attention_mask"],
-        Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0]])
-      )
+    assert_equal(
+      inputs["attention_mask"],
+      Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0]])
+    )
 
-      assert_equal(
-        inputs["token_type_ids"],
-        Nx.tensor([[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]])
-      )
-    end
+    assert_equal(
+      inputs["token_type_ids"],
+      Nx.tensor([[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]])
+    )
   end
 end
diff --git a/test/bumblebee/text/zero_shot_classification_test.exs b/test/bumblebee/text/zero_shot_classification_test.exs
index 69d008a3..ef66b0d2 100644
--- a/test/bumblebee/text/zero_shot_classification_test.exs
+++ b/test/bumblebee/text/zero_shot_classification_test.exs
@@ -3,72 +3,70 @@ defmodule Bumblebee.Text.ZeroShotClassificationTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
-
-  describe "integration" do
-    test "correctly classifies labels with 1 sequence" do
-      {:ok, model} = Bumblebee.load_model({:hf, "facebook/bart-large-mnli"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-mnli"})
-      labels = ["cooking", "traveling", "dancing"]
-
-      zero_shot_serving = Bumblebee.Text.zero_shot_classification(model, tokenizer, labels)
-
-      output = Nx.Serving.run(zero_shot_serving, "one day I will see the world")
-
-      assert %{
-               predictions: [
-                 %{label: "traveling", score: _},
-                 %{label: "dancing", score: _},
-                 %{label: "cooking", score: _}
-               ]
-             } = output
-
-      assert %{label: "traveling", score: score} = Enum.max_by(output.predictions, & &1.score)
-      assert_all_close(score, 0.9874)
-    end
-
-    test "correctly classifies labels with 2 sequences" do
-      {:ok, model} = Bumblebee.load_model({:hf, "facebook/bart-large-mnli"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-mnli"})
-      labels = ["cooking", "traveling", "dancing"]
-
-      zero_shot_serving = Bumblebee.Text.zero_shot_classification(model, tokenizer, labels)
-
-      assert [output1, output2] =
-               Nx.Serving.run(zero_shot_serving, [
-                 "one day I will see the world",
-                 "one day I will learn to salsa"
-               ])
-
-      assert %{label: "traveling", score: score1} = Enum.max_by(output1.predictions, & &1.score)
-      assert_all_close(score1, 0.9874)
-
-      assert %{label: "dancing", score: score2} = Enum.max_by(output2.predictions, & &1.score)
-      assert_all_close(score2, 0.9585)
-    end
-
-    test "correctly classifies batch with compilation set to true" do
-      {:ok, model} = Bumblebee.load_model({:hf, "facebook/bart-large-mnli"})
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-mnli"})
-      labels = ["cooking", "traveling", "dancing"]
-
-      zero_shot_serving =
-        Bumblebee.Text.zero_shot_classification(model, tokenizer, labels,
-          compile: [batch_size: 2, sequence_length: 32],
-          defn_options: [compiler: EXLA]
-        )
-
-      assert [output1, output2] =
-               Nx.Serving.run(zero_shot_serving, [
-                 "one day I will see the world",
-                 "one day I will learn to salsa"
-               ])
-
-      assert %{label: "traveling", score: score1} = Enum.max_by(output1.predictions, & &1.score)
-      assert_all_close(score1, 0.9874)
-
-      assert %{label: "dancing", score: score2} = Enum.max_by(output2.predictions, & &1.score)
-      assert_all_close(score2, 0.9585)
-    end
+  @moduletag serving_test_tags()
+
+  test "correctly classifies labels with one sequence" do
+    {:ok, model} = Bumblebee.load_model({:hf, "facebook/bart-large-mnli"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-mnli"})
+    labels = ["cooking", "traveling", "dancing"]
+
+    zero_shot_serving = Bumblebee.Text.zero_shot_classification(model, tokenizer, labels)
+
+    output = Nx.Serving.run(zero_shot_serving, "one day I will see the world")
+
+    assert %{
+             predictions: [
+               %{label: "traveling", score: _},
+               %{label: "dancing", score: _},
+               %{label: "cooking", score: _}
+             ]
+           } = output
+
+    assert %{label: "traveling", score: score} = Enum.max_by(output.predictions, & &1.score)
+    assert_all_close(score, 0.9874)
+  end
+
+  test "correctly classifies labels with multiple sequences" do
+    {:ok, model} = Bumblebee.load_model({:hf, "facebook/bart-large-mnli"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-mnli"})
+    labels = ["cooking", "traveling", "dancing"]
+
+    zero_shot_serving = Bumblebee.Text.zero_shot_classification(model, tokenizer, labels)
+
+    assert [output1, output2] =
+             Nx.Serving.run(zero_shot_serving, [
+               "one day I will see the world",
+               "one day I will learn to salsa"
+             ])
+
+    assert %{label: "traveling", score: score1} = Enum.max_by(output1.predictions, & &1.score)
+    assert_all_close(score1, 0.9874)
+
+    assert %{label: "dancing", score: score2} = Enum.max_by(output2.predictions, & &1.score)
+    assert_all_close(score2, 0.9585)
+  end
+
+  test "correctly classifies batch with compilation set to true" do
+    {:ok, model} = Bumblebee.load_model({:hf, "facebook/bart-large-mnli"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/bart-large-mnli"})
+    labels = ["cooking", "traveling", "dancing"]
+
+    zero_shot_serving =
+      Bumblebee.Text.zero_shot_classification(model, tokenizer, labels,
+        compile: [batch_size: 2, sequence_length: 32],
+        defn_options: [compiler: EXLA]
+      )
+
+    assert [output1, output2] =
+             Nx.Serving.run(zero_shot_serving, [
+               "one day I will see the world",
+               "one day I will learn to salsa"
+             ])
+
+    assert %{label: "traveling", score: score1} = Enum.max_by(output1.predictions, & &1.score)
+    assert_all_close(score1, 0.9874)
+
+    assert %{label: "dancing", score: score2} = Enum.max_by(output2.predictions, & &1.score)
+    assert_all_close(score2, 0.9585)
   end
 end
diff --git a/test/bumblebee/vision/blip_featurizer_test.exs b/test/bumblebee/vision/blip_featurizer_test.exs
index 2df0b460..0437ee20 100644
--- a/test/bumblebee/vision/blip_featurizer_test.exs
+++ b/test/bumblebee/vision/blip_featurizer_test.exs
@@ -1,18 +1,16 @@
 defmodule Bumblebee.Vision.BlipFeaturizerTest do
   use ExUnit.Case, async: true
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, featurizer} =
-               Bumblebee.load_featurizer({:hf, "Salesforce/blip-image-captioning-base"})
+  test "encodes image" do
+    assert {:ok, featurizer} =
+             Bumblebee.load_featurizer({:hf, "Salesforce/blip-image-captioning-base"})
 
-      assert %Bumblebee.Vision.BlipFeaturizer{} = featurizer
+    assert %Bumblebee.Vision.BlipFeaturizer{} = featurizer
 
-      image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
+    image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
 
-      inputs = Bumblebee.apply_featurizer(featurizer, image)
+    inputs = Bumblebee.apply_featurizer(featurizer, image)
 
-      assert Nx.shape(inputs["pixel_values"]) == {1, 384, 384, 3}
-    end
+    assert Nx.shape(inputs["pixel_values"]) == {1, 384, 384, 3}
   end
 end
diff --git a/test/bumblebee/vision/blip_vision_test.exs b/test/bumblebee/vision/blip_vision_test.exs
index dd485ad6..bb128035 100644
--- a/test/bumblebee/vision/blip_vision_test.exs
+++ b/test/bumblebee/vision/blip_vision_test.exs
@@ -1,41 +1,40 @@
 defmodule Bumblebee.Vision.BlipVisionTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "Salesforce/blip-image-captioning-base"},
-                 module: Bumblebee.Vision.BlipVision,
-                 architecture: :base
-               )
-
-      assert %Bumblebee.Vision.BlipVision{architecture: :base} = spec
-
-      inputs = %{
-        "pixel_values" => Nx.broadcast(0.5, {1, 384, 384, 3})
-      }
-
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.hidden_state) == {1, 577, 768}
-
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[-0.5337, 1.1098, 0.4768], [-0.7984, 0.9996, -0.2640], [-0.1782, 0.8242, 0.4417]]
-        ]),
-        atol: 1.0e-4
-      )
-
-      assert_all_close(
-        outputs.pooled_state[[.., 1..3]],
-        Nx.tensor([[-0.0882, -0.3926, -0.5420]]),
-        atol: 1.0e-4
-      )
-    end
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BlipModel"},
+               module: Bumblebee.Vision.BlipVision,
+               architecture: :base
+             )
+
+    assert %Bumblebee.Vision.BlipVision{architecture: :base} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.hidden_state) == {1, 226, 32}
+    assert Nx.shape(outputs.pooled_state) == {1, 32}
+
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]] |> Nx.multiply(1_000_000),
+      Nx.tensor([
+        [[-0.0272, -0.0129, 0.0174], [0.0069, -0.0429, -0.0334], [0.0428, -0.0797, -0.0353]]
+      ]),
+      atol: 1.0e-4
+    )
+
+    assert_all_close(
+      outputs.pooled_state[[.., 1..3]] |> Nx.multiply(10_000),
+      Nx.tensor([[-0.0128, -0.0792, -0.1011]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/vision/clip_featurizer_test.exs b/test/bumblebee/vision/clip_featurizer_test.exs
index a9aa7812..50b885f1 100644
--- a/test/bumblebee/vision/clip_featurizer_test.exs
+++ b/test/bumblebee/vision/clip_featurizer_test.exs
@@ -1,17 +1,15 @@
 defmodule Bumblebee.Vision.ClipFeaturizerTest do
   use ExUnit.Case, async: true
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})
+  test "encodes image" do
+    assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})
 
-      assert %Bumblebee.Vision.ClipFeaturizer{} = featurizer
+    assert %Bumblebee.Vision.ClipFeaturizer{} = featurizer
 
-      image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
+    image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
 
-      inputs = Bumblebee.apply_featurizer(featurizer, image)
+    inputs = Bumblebee.apply_featurizer(featurizer, image)
 
-      assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
-    end
+    assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
   end
 end
diff --git a/test/bumblebee/vision/clip_vision_test.exs b/test/bumblebee/vision/clip_vision_test.exs
index 03dd6f6c..c5932afb 100644
--- a/test/bumblebee/vision/clip_vision_test.exs
+++ b/test/bumblebee/vision/clip_vision_test.exs
@@ -1,65 +1,64 @@
 defmodule Bumblebee.Vision.ClipVisionTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
-                 module: Bumblebee.Vision.ClipVision,
-                 architecture: :base
-               )
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-CLIPModel"},
+               module: Bumblebee.Vision.ClipVision,
+               architecture: :base
+             )
 
-      assert %Bumblebee.Vision.ClipVision{architecture: :base} = spec
+    assert %Bumblebee.Vision.ClipVision{architecture: :base} = spec
 
-      inputs = %{
-        "pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})
-      }
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.hidden_state) == {1, 50, 768}
+    assert Nx.shape(outputs.hidden_state) == {1, 226, 32}
+    assert Nx.shape(outputs.pooled_state) == {1, 32}
 
-      assert_all_close(
-        outputs.hidden_state[[.., 1..3, 1..3]],
-        Nx.tensor([
-          [[0.3465, -0.3939, -0.5297], [0.3588, -0.2529, -0.5606], [0.3958, -0.2688, -0.5367]]
-        ]),
-        atol: 1.0e-4
-      )
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[0.4483, 0.3736, -0.5581], [0.9376, -0.3424, -0.1002], [0.5782, 0.1069, -0.2953]]
+      ]),
+      atol: 1.0e-4
+    )
 
-      assert_all_close(
-        outputs.pooled_state[[.., 1..3]],
-        Nx.tensor([[0.3602, 0.3658, -0.2337]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.pooled_state[[.., 1..3]],
+      Nx.tensor([[-0.5059, 0.7391, 0.9252]]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "embedding model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
-                 module: Bumblebee.Vision.ClipVision,
-                 architecture: :for_embedding
-               )
+  test ":for_embedding" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-CLIPModel"},
+               module: Bumblebee.Vision.ClipVision,
+               architecture: :for_embedding
+             )
 
-      assert %Bumblebee.Vision.ClipVision{architecture: :for_embedding} = spec
+    assert %Bumblebee.Vision.ClipVision{architecture: :for_embedding} = spec
 
-      inputs = %{
-        "pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})
-      }
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
 
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.embedding) == {1, 512}
+    assert Nx.shape(outputs.embedding) == {1, 64}
 
-      assert_all_close(
-        outputs.embedding[[.., 1..3]],
-        Nx.tensor([[-0.3381, -0.0196, -0.4053]]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.embedding[[.., 1..3]],
+      Nx.tensor([[0.8865, -0.9042, -1.1233]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/vision/convnext_featurizer_test.exs b/test/bumblebee/vision/convnext_featurizer_test.exs
index 22e2ff18..2eb1e86f 100644
--- a/test/bumblebee/vision/convnext_featurizer_test.exs
+++ b/test/bumblebee/vision/convnext_featurizer_test.exs
@@ -1,29 +1,27 @@
 defmodule Bumblebee.Vision.ConvNextFeaturizerTest do
   use ExUnit.Case, async: true
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "facebook/convnext-tiny-224"})
+  test "encodes image" do
+    assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "facebook/convnext-tiny-224"})
 
-      assert %Bumblebee.Vision.ConvNextFeaturizer{} = featurizer
+    assert %Bumblebee.Vision.ConvNextFeaturizer{} = featurizer
 
-      image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
+    image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
 
-      inputs = Bumblebee.apply_featurizer(featurizer, image)
+    inputs = Bumblebee.apply_featurizer(featurizer, image)
 
-      assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
-    end
+    assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
+  end
 
-    test "allows an alpha channel" do
-      assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "facebook/convnext-tiny-224"})
+  test "allows an alpha channel" do
+    assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "facebook/convnext-tiny-224"})
 
-      assert %Bumblebee.Vision.ConvNextFeaturizer{} = featurizer
+    assert %Bumblebee.Vision.ConvNextFeaturizer{} = featurizer
 
-      image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 4})
+    image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 4})
 
-      inputs = Bumblebee.apply_featurizer(featurizer, image)
+    inputs = Bumblebee.apply_featurizer(featurizer, image)
 
-      assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
-    end
+    assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
   end
 end
diff --git a/test/bumblebee/vision/convnext_test.exs b/test/bumblebee/vision/convnext_test.exs
index 807d1d81..e6a89efd 100644
--- a/test/bumblebee/vision/convnext_test.exs
+++ b/test/bumblebee/vision/convnext_test.exs
@@ -1,45 +1,58 @@
 defmodule Bumblebee.Vision.ConvNextTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/convnext-tiny-224"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-ConvNextModel"})
 
-      assert %Bumblebee.Vision.ConvNext{architecture: :base} = spec
+    assert %Bumblebee.Vision.ConvNext{architecture: :base} = spec
 
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})
+    }
 
-      assert Nx.shape(outputs.pooled_state) == {1, 768}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        Nx.sum(outputs.pooled_state),
-        Nx.tensor(-2.1095),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.hidden_state) == {1, 7, 7, 40}
+    assert Nx.shape(outputs.pooled_state) == {1, 40}
 
-    test "image classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/convnext-tiny-224"})
+    assert_all_close(
+      to_channels_first(outputs.hidden_state)[[.., 1..2, 1..2, 1..2]],
+      Nx.tensor([[[[0.3924, 0.3924], [0.3924, 0.3924]], [[-0.2330, -0.2330], [-0.2330, -0.2330]]]]),
+      atol: 1.0e-4
+    )
 
-      assert %Bumblebee.Vision.ConvNext{architecture: :for_image_classification} = spec
+    assert_all_close(
+      outputs.pooled_state[[.., 1..3]],
+      Nx.tensor([[2.2793, -1.3236, -1.0714]]),
+      atol: 1.0e-3
+    )
+  end
+
+  test ":for_image_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-ConvNextForImageClassification"}
+             )
+
+    assert %Bumblebee.Vision.ConvNext{architecture: :for_image_classification} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})
+    }
 
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
+    outputs = Axon.predict(model, params, inputs)
 
-      assert Nx.shape(outputs.logits) == {1, 1000}
+    assert Nx.shape(outputs.logits) == {1, 2}
 
-      assert_all_close(
-        outputs.logits[[0, 0..2]],
-        Nx.tensor([-0.4239, -0.2082, 0.0709]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0047, -0.1457]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/vision/deit_featurizer_test.exs b/test/bumblebee/vision/deit_featurizer_test.exs
index 75009bf5..9bfcc0a8 100644
--- a/test/bumblebee/vision/deit_featurizer_test.exs
+++ b/test/bumblebee/vision/deit_featurizer_test.exs
@@ -1,18 +1,16 @@
 defmodule Bumblebee.Vision.DeitFeaturizerTest do
   use ExUnit.Case, async: true
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, featurizer} =
-               Bumblebee.load_featurizer({:hf, "facebook/deit-base-distilled-patch16-224"})
+  test "encodes image" do
+    assert {:ok, featurizer} =
+             Bumblebee.load_featurizer({:hf, "facebook/deit-base-distilled-patch16-224"})
 
-      assert %Bumblebee.Vision.DeitFeaturizer{} = featurizer
+    assert %Bumblebee.Vision.DeitFeaturizer{} = featurizer
 
-      image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
+    image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
 
-      inputs = Bumblebee.apply_featurizer(featurizer, image)
+    inputs = Bumblebee.apply_featurizer(featurizer, image)
 
-      assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
-    end
+    assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
   end
 end
diff --git a/test/bumblebee/vision/deit_test.exs b/test/bumblebee/vision/deit_test.exs
index 32b173eb..18a2db5e 100644
--- a/test/bumblebee/vision/deit_test.exs
+++ b/test/bumblebee/vision/deit_test.exs
@@ -1,85 +1,106 @@
 defmodule Bumblebee.Vision.DeitTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/deit-base-distilled-patch16-224"},
-                 architecture: :base
-               )
-
-      assert %Bumblebee.Vision.Deit{architecture: :base} = spec
-
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
-
-      # Pre-trained checkpoints by default do not use
-      # the pooler layers
-      assert Nx.shape(outputs.hidden_state) == {1, 198, 768}
-
-      assert_all_close(
-        outputs.hidden_state[[0, 0, 0..2]],
-        Nx.tensor([-0.0738, -0.2792, -0.0235]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "image classification model with teacher" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/deit-base-distilled-patch16-224"})
-
-      assert %Bumblebee.Vision.Deit{architecture: :for_image_classification_with_teacher} = spec
-
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 1000}
-
-      assert_all_close(
-        outputs.logits[[0, 0..2]],
-        Nx.tensor([-0.7490, 0.7397, 0.6383]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked image modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "facebook/deit-base-distilled-patch16-224"},
-                 architecture: :for_masked_image_modeling
-               )
-
-      assert %Bumblebee.Vision.Deit{architecture: :for_masked_image_modeling} = spec
-
-      # There is no pre-trained version on Hugging Face, so we use a fixed parameter
-      params =
-        update_in(params["masked_image_modeling_head.output"]["kernel"], fn x ->
-          # We use iota in the order of the pytorch kernel
-          x
-          |> Nx.transpose(axes: [3, 2, 1, 0])
-          |> Nx.shape()
-          |> Nx.iota(type: :f32)
-          |> Nx.divide(Nx.size(x))
-          |> Nx.transpose(axes: [2, 3, 1, 0])
-        end)
-
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 224, 224, 3}
-
-      assert_all_close(
-        to_channels_first(outputs.logits)[[0, 0, 0..2, 0..2]],
-        Nx.tensor([
-          [-0.0159, 0.0084, 0.0326],
-          [0.3719, 0.3961, 0.4204],
-          [0.7597, 0.7839, 0.8082]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-DeiTModel"})
+
+    assert %Bumblebee.Vision.Deit{architecture: :base} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.hidden_state) == {1, 227, 32}
+    assert Nx.shape(outputs.pooled_state) == {1, 32}
+
+    assert_all_close(
+      outputs.hidden_state[[.., 1..3, 1..3]],
+      Nx.tensor([
+        [[-3.0866, 0.2350, 0.2003], [-1.2774, -0.1192, -1.0468], [-1.2774, -0.1192, -1.0468]]
+      ]),
+      atol: 1.0e-4
+    )
+
+    assert_all_close(
+      outputs.pooled_state[[.., 1..3]],
+      Nx.tensor([[0.1526, -0.1437, -0.0646]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_image_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DeiTForImageClassification"}
+             )
+
+    assert %Bumblebee.Vision.Deit{architecture: :for_image_classification} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[0.0481, 0.1008]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_image_classification_with_teacher" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DeiTForImageClassificationWithTeacher"}
+             )
+
+    assert %Bumblebee.Vision.Deit{architecture: :for_image_classification_with_teacher} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.0108, -0.0048]]),
+      atol: 1.0e-4
+    )
+  end
+
+  test ":for_masked_image_modeling" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-DeiTForMaskedImageModeling"}
+             )
+
+    assert %Bumblebee.Vision.Deit{architecture: :for_masked_image_modeling} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 30, 30, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 30, 30, 3}
+
+    assert_all_close(
+      to_channels_first(outputs.logits)[[.., 1..2, 1..2, 1..2]],
+      Nx.tensor([[[[0.1455, 0.0229], [-0.0097, 0.0525]], [[0.1889, 0.0910], [-0.1083, -0.0244]]]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/vision/image_classification_test.exs b/test/bumblebee/vision/image_classification_test.exs
index f21e01a6..c05f8c64 100644
--- a/test/bumblebee/vision/image_classification_test.exs
+++ b/test/bumblebee/vision/image_classification_test.exs
@@ -3,51 +3,49 @@ defmodule Bumblebee.Vision.ImageClassificationTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
   @images_dir Path.expand("../../fixtures/images", __DIR__)
 
-  describe "integration" do
-    test "returns top scored labels" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "microsoft/resnet-50"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "microsoft/resnet-50"})
-
-      serving = Bumblebee.Vision.ImageClassification.image_classification(model_info, featurizer)
-
-      image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
-
-      assert %{
-               predictions: [
-                 %{label: "tiger cat", score: _},
-                 %{label: "tabby, tabby cat", score: _},
-                 %{label: "remote control, remote", score: _},
-                 %{label: "jinrikisha, ricksha, rickshaw", score: _},
-                 %{label: "Egyptian cat", score: _}
-               ]
-             } = Nx.Serving.run(serving, image)
-    end
-
-    test "supports compilation" do
-      {:ok, model_info} = Bumblebee.load_model({:hf, "microsoft/resnet-50"})
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "microsoft/resnet-50"})
-
-      serving =
-        Bumblebee.Vision.ImageClassification.image_classification(model_info, featurizer,
-          compile: [batch_size: 1],
-          defn_options: [compiler: EXLA]
-        )
-
-      image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
-
-      assert %{
-               predictions: [
-                 %{label: "tiger cat", score: _},
-                 %{label: "tabby, tabby cat", score: _},
-                 %{label: "remote control, remote", score: _},
-                 %{label: "jinrikisha, ricksha, rickshaw", score: _},
-                 %{label: "Egyptian cat", score: _}
-               ]
-             } = Nx.Serving.run(serving, image)
-    end
+  test "returns top scored labels" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "microsoft/resnet-50"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "microsoft/resnet-50"})
+
+    serving = Bumblebee.Vision.ImageClassification.image_classification(model_info, featurizer)
+
+    image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
+
+    assert %{
+             predictions: [
+               %{label: "tiger cat", score: _},
+               %{label: "tabby, tabby cat", score: _},
+               %{label: "remote control, remote", score: _},
+               %{label: "jinrikisha, ricksha, rickshaw", score: _},
+               %{label: "Egyptian cat", score: _}
+             ]
+           } = Nx.Serving.run(serving, image)
+  end
+
+  test "supports compilation" do
+    {:ok, model_info} = Bumblebee.load_model({:hf, "microsoft/resnet-50"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "microsoft/resnet-50"})
+
+    serving =
+      Bumblebee.Vision.ImageClassification.image_classification(model_info, featurizer,
+        compile: [batch_size: 1],
+        defn_options: [compiler: EXLA]
+      )
+
+    image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
+
+    assert %{
+             predictions: [
+               %{label: "tiger cat", score: _},
+               %{label: "tabby, tabby cat", score: _},
+               %{label: "remote control, remote", score: _},
+               %{label: "jinrikisha, ricksha, rickshaw", score: _},
+               %{label: "Egyptian cat", score: _}
+             ]
+           } = Nx.Serving.run(serving, image)
   end
 end
diff --git a/test/bumblebee/vision/image_embedding_test.exs b/test/bumblebee/vision/image_embedding_test.exs
index 7770e932..712442dc 100644
--- a/test/bumblebee/vision/image_embedding_test.exs
+++ b/test/bumblebee/vision/image_embedding_test.exs
@@ -3,54 +3,52 @@ defmodule Bumblebee.Vision.ImageEmbeddingTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
   @images_dir Path.expand("../../fixtures/images", __DIR__)
 
-  describe "integration" do
-    test "returns CLIP Vision embedding (without projection head) for an image" do
-      {:ok, model_info} =
-        Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
-          module: Bumblebee.Vision.ClipVision
-        )
+  test "returns embedding for an image" do
+    {:ok, model_info} =
+      Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
+        module: Bumblebee.Vision.ClipVision
+      )
 
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})
 
-      serving = Bumblebee.Vision.ImageEmbedding.image_embedding(model_info, featurizer)
-      image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
+    serving = Bumblebee.Vision.ImageEmbedding.image_embedding(model_info, featurizer)
+    image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
 
-      assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, image)
-      assert Nx.shape(embedding) == {768}
+    assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, image)
+    assert Nx.shape(embedding) == {768}
 
-      assert_all_close(
-        embedding[1..3],
-        Nx.tensor([0.0978, -0.7233, -0.7707]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      embedding[1..3],
+      Nx.tensor([0.0978, -0.7233, -0.7707]),
+      atol: 1.0e-4
+    )
+  end
 
-    test "returns normalized CLIP Vision embedding (without projection head) for an image" do
-      {:ok, model_info} =
-        Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
-          module: Bumblebee.Vision.ClipVision
-        )
+  test "returns normalized embedding for an image" do
+    {:ok, model_info} =
+      Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
+        module: Bumblebee.Vision.ClipVision
+      )
 
-      {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})
+    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})
 
-      options = [
-        embedding_processor: :l2_norm
-      ]
+    options = [
+      embedding_processor: :l2_norm
+    ]
 
-      serving = Bumblebee.Vision.ImageEmbedding.image_embedding(model_info, featurizer, options)
-      image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
+    serving = Bumblebee.Vision.ImageEmbedding.image_embedding(model_info, featurizer, options)
+    image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
 
-      assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, image)
-      assert Nx.shape(embedding) == {768}
+    assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, image)
+    assert Nx.shape(embedding) == {768}
 
-      assert_all_close(
-        embedding[1..3],
-        Nx.tensor([0.0036, -0.0269, -0.0286]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      embedding[1..3],
+      Nx.tensor([0.0036, -0.0269, -0.0286]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/vision/image_to_text_test.exs b/test/bumblebee/vision/image_to_text_test.exs
index 7ff840ad..5d1c9be0 100644
--- a/test/bumblebee/vision/image_to_text_test.exs
+++ b/test/bumblebee/vision/image_to_text_test.exs
@@ -3,30 +3,28 @@ defmodule Bumblebee.Vision.ImageToTextTest do
 
   import Bumblebee.TestHelpers
 
-  @moduletag model_test_tags()
+  @moduletag serving_test_tags()
 
   @images_dir Path.expand("../../fixtures/images", __DIR__)
 
-  describe "integration" do
-    test "returns top scored labels" do
-      {:ok, blip} = Bumblebee.load_model({:hf, "Salesforce/blip-image-captioning-base"})
+  test "generates text describing an image" do
+    {:ok, blip} = Bumblebee.load_model({:hf, "Salesforce/blip-image-captioning-base"})
 
-      {:ok, featurizer} =
-        Bumblebee.load_featurizer({:hf, "Salesforce/blip-image-captioning-base"})
+    {:ok, featurizer} =
+      Bumblebee.load_featurizer({:hf, "Salesforce/blip-image-captioning-base"})
 
-      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Salesforce/blip-image-captioning-base"})
+    {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Salesforce/blip-image-captioning-base"})
 
-      {:ok, generation_config} =
-        Bumblebee.load_generation_config({:hf, "Salesforce/blip-image-captioning-base"})
+    {:ok, generation_config} =
+      Bumblebee.load_generation_config({:hf, "Salesforce/blip-image-captioning-base"})
 
-      serving =
-        Bumblebee.Vision.ImageToText.image_to_text(blip, featurizer, tokenizer, generation_config)
+    serving =
+      Bumblebee.Vision.ImageToText.image_to_text(blip, featurizer, tokenizer, generation_config)
 
-      image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
+    image = StbImage.read_file!(Path.join(@images_dir, "coco/39769.jpeg"))
 
-      assert %{
-               results: [%{text: "two cats sleeping on a couch"}]
-             } = Nx.Serving.run(serving, image)
-    end
+    assert %{
+             results: [%{text: "two cats sleeping on a couch"}]
+           } = Nx.Serving.run(serving, image)
   end
 end
diff --git a/test/bumblebee/vision/resnet_test.exs b/test/bumblebee/vision/resnet_test.exs
index 87ab670f..f237fefe 100644
--- a/test/bumblebee/vision/resnet_test.exs
+++ b/test/bumblebee/vision/resnet_test.exs
@@ -1,45 +1,62 @@
 defmodule Bumblebee.Vision.ResNetTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/resnet-50"}, architecture: :base)
+  test ":base" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-ResNetModel"})
 
-      assert %Bumblebee.Vision.ResNet{architecture: :base} = spec
+    assert %Bumblebee.Vision.ResNet{architecture: :base} = spec
 
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})
+    }
 
-      assert Nx.shape(outputs.pooled_state) == {1, 1, 1, 2048}
+    outputs = Axon.predict(model, params, inputs)
 
-      assert_all_close(
-        Nx.sum(outputs.pooled_state),
-        Nx.tensor(14.5119),
-        atol: 1.0e-4
-      )
-    end
+    assert Nx.shape(outputs.hidden_state) == {1, 7, 7, 40}
+    assert Nx.shape(outputs.pooled_state) == {1, 1, 1, 40}
 
-    test "image classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "microsoft/resnet-50"})
+    assert_all_close(
+      to_channels_first(outputs.hidden_state)[[.., 2..3, 2..3, 2..3]],
+      Nx.tensor([[[[0.0000, 0.0000], [0.0000, 0.0000]], [[0.9835, 0.9835], [0.9835, 0.9835]]]]),
+      atol: 1.0e-4
+    )
 
-      assert %Bumblebee.Vision.ResNet{architecture: :for_image_classification} = spec
+    assert_all_close(Nx.sum(outputs.hidden_state), Nx.tensor(209.6328), atol: 1.0e-4)
 
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
+    assert_all_close(
+      to_channels_first(outputs.pooled_state)[[.., 1..3, .., ..]],
+      Nx.tensor([[[[0.0275]], [[0.0095]], [[0.8921]]]]),
+      atol: 1.0e-4
+    )
 
-      assert Nx.shape(outputs.logits) == {1, 1000}
+    assert_all_close(Nx.sum(outputs.pooled_state), Nx.tensor(4.2782), atol: 1.0e-4)
+  end
+
+  test ":for_image_classification" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model(
+               {:hf, "hf-internal-testing/tiny-random-ResNetForImageClassification"}
+             )
+
+    assert %Bumblebee.Vision.ResNet{architecture: :for_image_classification} = spec
+
+    inputs = %{
+      "pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 3}
 
-      assert_all_close(
-        outputs.logits[[0, 0..2]],
-        Nx.tensor([-6.6223, -6.2090, -5.8592]),
-        atol: 1.0e-4
-      )
-    end
+    assert_all_close(
+      outputs.logits,
+      Nx.tensor([[-0.1053, 0.2160, -0.0331]]),
+      atol: 1.0e-4
+    )
   end
 end
diff --git a/test/bumblebee/vision/vit_featurizer_test.exs b/test/bumblebee/vision/vit_featurizer_test.exs
index 4a62a38f..3587ccb7 100644
--- a/test/bumblebee/vision/vit_featurizer_test.exs
+++ b/test/bumblebee/vision/vit_featurizer_test.exs
@@ -1,17 +1,15 @@
 defmodule Bumblebee.Vision.VitFeaturizerTest do
   use ExUnit.Case, async: true
 
-  describe "integration" do
-    test "encoding model input" do
-      assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "google/vit-base-patch16-224"})
+  test "encodes image" do
+    assert {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "google/vit-base-patch16-224"})
 
-      assert %Bumblebee.Vision.VitFeaturizer{} = featurizer
+    assert %Bumblebee.Vision.VitFeaturizer{} = featurizer
 
-      image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
+    image = Nx.tensor([[[50], [100]], [[150], [200]]]) |> Nx.broadcast({2, 2, 3})
 
-      inputs = Bumblebee.apply_featurizer(featurizer, image)
+    inputs = Bumblebee.apply_featurizer(featurizer, image)
 
-      assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
-    end
+    assert Nx.shape(inputs["pixel_values"]) == {1, 224, 224, 3}
   end
 end
diff --git a/test/bumblebee/vision/vit_test.exs b/test/bumblebee/vision/vit_test.exs
index d6e24deb..2d75e5eb 100644
--- a/test/bumblebee/vision/vit_test.exs
+++ b/test/bumblebee/vision/vit_test.exs
@@ -1,83 +1,83 @@
 defmodule Bumblebee.Vision.VitTest do
-  use ExUnit.Case, async: false
+  use ExUnit.Case, async: true
 
   import Bumblebee.TestHelpers
 
   @moduletag model_test_tags()
 
-  describe "integration" do
-    test "base model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "google/vit-base-patch16-224"}, architecture: :base)
-
-      assert %Bumblebee.Vision.Vit{architecture: :base} = spec
-
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
-
-      # Pre-trained checkpoints by default do not use
-      # the pooler layers
-      assert Nx.shape(outputs.hidden_state) == {1, 197, 768}
-
-      assert_all_close(
-        outputs.hidden_state[[0, 0, 0..2]],
-        Nx.tensor([0.4435, 0.4302, -0.1585]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "image classification model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "google/vit-base-patch16-224"})
-
-      assert %Bumblebee.Vision.Vit{architecture: :for_image_classification} = spec
-
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 1000}
-
-      assert_all_close(
-        outputs.logits[[0, 0..2]],
-        Nx.tensor([0.0112, -0.5065, -0.7792]),
-        atol: 1.0e-4
-      )
-    end
-
-    test "masked image modeling model" do
-      assert {:ok, %{model: model, params: params, spec: spec}} =
-               Bumblebee.load_model({:hf, "google/vit-base-patch16-224-in21k"},
-                 architecture: :for_masked_image_modeling
-               )
-
-      assert %Bumblebee.Vision.Vit{architecture: :for_masked_image_modeling} = spec
-
-      # There is no pre-trained version on Hugging Face, so we use a fixed parameter
-      params =
-        update_in(params["masked_image_modeling_head.output"]["kernel"], fn x ->
-          # We use iota in the order of the pytorch kernel
-          x
-          |> Nx.transpose(axes: [3, 2, 1, 0])
-          |> Nx.shape()
-          |> Nx.iota(type: :f32)
-          |> Nx.divide(Nx.size(x))
-          |> Nx.transpose(axes: [2, 3, 1, 0])
-        end)
-
-      inputs = %{"pixel_values" => Nx.broadcast(0.5, {1, 224, 224, 3})}
-      outputs = Axon.predict(model, params, inputs)
-
-      assert Nx.shape(outputs.logits) == {1, 224, 224, 3}
-
-      assert_all_close(
-        to_channels_first(outputs.logits)[[0, 0, 0..2, 0..2]],
-        Nx.tensor([
-          [-0.0103, -0.0275, -0.0447],
-          [-0.2853, -0.3025, -0.3197],
-          [-0.5603, -0.5774, -0.5946]
-        ]),
-        atol: 1.0e-4
-      )
-    end
+  test ":base" do
+    # No :architecture option is given here; the spec that comes back from
+    # loading the checkpoint is asserted to be :base below
+    repo = {:hf, "hf-internal-testing/tiny-random-ViTModel"}
+    assert {:ok, %{model: model, params: params, spec: spec}} = Bumblebee.load_model(repo)
+
+    assert %Bumblebee.Vision.Vit{architecture: :base} = spec
+
+    # Constant-valued batch holding one 30x30 input with 3 channels (channels-last assumed)
+    pixel_values = Nx.broadcast(0.5, {1, 30, 30, 3})
+
+    outputs = Axon.predict(model, params, %{"pixel_values" => pixel_values})
+
+    assert Nx.shape(outputs.hidden_state) == {1, 226, 32}
+    assert Nx.shape(outputs.pooled_state) == {1, 32}
+
+    # Only a small slice of each output is compared against the reference values
+    expected_hidden_state =
+      Nx.tensor([
+        [[-0.2075, 2.7865, 0.2361], [-0.3014, 2.5312, -0.6127], [-0.3460, 2.8741, 0.1988]]
+      ])
+
+    hidden_state_slice = outputs.hidden_state[[.., 1..3, 1..3]]
+    assert_all_close(hidden_state_slice, expected_hidden_state, atol: 1.0e-4)
+
+    expected_pooled_state = Nx.tensor([[-0.0244, -0.0515, -0.1584]])
+    pooled_state_slice = outputs.pooled_state[[.., 1..3]]
+    assert_all_close(pooled_state_slice, expected_pooled_state, atol: 1.0e-4)
+  end
+
+  test ":for_image_classification" do
+    # No :architecture option is given here; the spec that comes back from
+    # loading the checkpoint is asserted to be :for_image_classification below
+    repo = {:hf, "hf-internal-testing/tiny-random-ViTForImageClassification"}
+
+    assert {:ok, %{model: model, params: params, spec: spec}} = Bumblebee.load_model(repo)
+
+    assert %Bumblebee.Vision.Vit{architecture: :for_image_classification} = spec
+
+    # Constant-valued batch holding one 30x30 input with 3 channels (channels-last assumed)
+    pixel_values = Nx.broadcast(0.5, {1, 30, 30, 3})
+
+    outputs = Axon.predict(model, params, %{"pixel_values" => pixel_values})
+
+    assert Nx.shape(outputs.logits) == {1, 2}
+
+    # The logits tensor is small enough to be compared in full, so no slicing
+    # is done before checking it against the reference values
+    expected_logits = Nx.tensor([[-0.1596, 0.1818]])
+
+    assert_all_close(outputs.logits, expected_logits, atol: 1.0e-4)
+  end
+
+  test ":for_masked_image_modeling" do
+    # No :architecture option is given here; the spec that comes back from
+    # loading the checkpoint is asserted to be :for_masked_image_modeling below
+    repo = {:hf, "hf-internal-testing/tiny-random-ViTForMaskedImageModeling"}
+    assert {:ok, %{model: model, params: params, spec: spec}} = Bumblebee.load_model(repo)
+
+    assert %Bumblebee.Vision.Vit{architecture: :for_masked_image_modeling} = spec
+
+    # Constant-valued batch holding one 30x30 input with 3 channels (channels-last assumed)
+    pixel_values = Nx.broadcast(0.5, {1, 30, 30, 3})
+    outputs = Axon.predict(model, params, %{"pixel_values" => pixel_values})
+
+    assert Nx.shape(outputs.logits) == {1, 30, 30, 3}
+
+    expected_logits =
+      Nx.tensor([
+        [[[0.0752, -0.0192], [-0.0252, 0.0232]], [[0.0548, -0.0216], [0.0728, -0.1687]]]
+      ])
+
+    logits_slice = to_channels_first(outputs.logits)[[.., 1..2, 1..2, 1..2]]
+    assert_all_close(logits_slice, expected_logits, atol: 1.0e-4)
   end
 end
diff --git a/test/bumblebee_test.exs b/test/bumblebee_test.exs
index c689d938..a94aaad6 100644
--- a/test/bumblebee_test.exs
+++ b/test/bumblebee_test.exs
@@ -12,21 +12,22 @@ defmodule BumblebeeTest do
 
     @tag :capture_log
     test "supports sharded models" do
-      assert {:ok, %{params: params}} = Bumblebee.load_model({:hf, "sshleifer/tiny-gpt2"})
+      repo = {:hf, "hf-internal-testing/tiny-random-GPT2Model"}
+      assert {:ok, %{params: params}} = Bumblebee.load_model(repo)
 
       assert {:ok, %{params: sharded_params}} =
-               Bumblebee.load_model({:hf, "jonatanklosko/test-tiny-gpt2-sharded"})
+               Bumblebee.load_model({:hf, "bumblebee-testing/tiny-random-GPT2Model-sharded"})
 
       assert Enum.sort(Map.keys(params)) == Enum.sort(Map.keys(sharded_params))
     end
 
     test "supports .safetensors params file" do
-      assert {:ok, %{params: params}} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
+      assert {:ok, %{params: params}} =
+               Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-GPT2Model"})
 
       assert {:ok, %{params: safetensors_params}} =
                Bumblebee.load_model(
-                 {:hf, "openai/whisper-tiny"},
-                 params_filename: "model.safetensors"
+                 {:hf, "bumblebee-testing/tiny-random-GPT2Model-safetensors-only"}
                )
 
       assert Enum.sort(Map.keys(params)) == Enum.sort(Map.keys(safetensors_params))
diff --git a/test/support/test_helpers.ex b/test/support/test_helpers.ex
index 8e9b0602..57c1e7ff 100644
--- a/test/support/test_helpers.ex
+++ b/test/support/test_helpers.ex
@@ -39,7 +39,24 @@ defmodule Bumblebee.TestHelpers do
   end
 
+  # Tags for model tests. There is no `slow: true` entry, so the `:slow`
+  # exclusion passed to `ExUnit.start/1` in test/test_helper.exs does not apply.
   def model_test_tags() do
-    [slow: true, capture_log: true, timeout: 600_000]
+    [
+      model: true,
+      capture_log: true,
+      timeout: 60_000
+    ]
+  end
+
+  # Tags meant for serving tests (going by the name). `slow: true` matches the
+  # `:slow` exclusion passed to `ExUnit.start/1` in test/test_helper.exs.
+  def serving_test_tags() do
+    [
+      serving: true,
+      slow: true,
+      capture_log: true,
+      timeout: 600_000
+    ]
   end
 
   def to_channels_first(tensor) do
diff --git a/test/test_helper.exs b/test/test_helper.exs
index ebe9b439..27370b09 100644
--- a/test/test_helper.exs
+++ b/test/test_helper.exs
@@ -22,4 +22,8 @@ Application.put_env(:exla, :preferred_clients, [:tpu, :cuda, :rocm, :other_host,
 
 Application.put_env(:nx, :default_backend, {EXLA.Backend, client: :host})
 
+# The hint is printed only when BUMBLEBEE_OFFLINE is absent from the environment
+if is_nil(System.get_env("BUMBLEBEE_OFFLINE")),
+  do: IO.puts("To run tests without hitting the network: BUMBLEBEE_OFFLINE=true mix test")
+
 ExUnit.start(exclude: [:slow] ++ exclude_multi_device)