Rewrite tests to use tiny model checkpoints (#297)
jonatanklosko authored Nov 30, 2023
1 parent cf4dff6 commit 21832f3
Showing 81 changed files with 3,680 additions and 3,810 deletions.
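Most of the changed files are test modules rewritten to load tiny random checkpoints instead of full pretrained weights, which keeps CI downloads and memory small. A minimal sketch of the resulting test pattern, assuming one of the commonly used hf-internal-testing/tiny-random-* repositories and a made-up hidden size of 32 (the actual tests may pin different repositories and also assert on expected output values):

# Hypothetical test shape after the rewrite; the repo name and hidden size are assumptions.
defmodule Bumblebee.Text.BertTest do
  use ExUnit.Case, async: true

  test ":base architecture" do
    assert {:ok, %{model: model, params: params, spec: spec}} =
             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BertModel"})

    assert %Bumblebee.Text.Bert{architecture: :base} = spec

    inputs = %{
      "input_ids" => Nx.tensor([[10, 20, 30]]),
      "attention_mask" => Nx.tensor([[1, 1, 1]])
    }

    outputs = Axon.predict(model, params, inputs)

    # With random weights the meaningful assertions are on output shapes.
    assert Nx.shape(outputs.hidden_state) == {1, 3, 32}
  end
end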
14 changes: 14 additions & 0 deletions .github/workflows/test.yaml
@@ -46,7 +46,21 @@ jobs:
- run: mix deps.compile
- run: mix compile --warnings-as-errors
if: ${{ matrix.lint }}
- name: Restore bumblebee cache
id: cache-bumblebee-restore
uses: actions/cache/restore@v3
with:
path: bumblebee_cache
key: ${{ runner.os }}-bumblebee-cache-${{ matrix.pair.elixir }}-${{ matrix.pair.otp }}
- run: mix test
env:
BUMBLEBEE_CACHE_DIR: ${{ github.workspace }}/bumblebee_cache
- name: Save bumblebee cache
id: cache-bumblebee-save
uses: actions/cache/save@v3
with:
path: bumblebee_cache
key: ${{ steps.cache-bumblebee-restore.outputs.cache-primary-key }}
- uses: technote-space/get-diff-action@v6
with:
PATTERNS: test/**/*_test.exs
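The restore/save pair caches the directory pointed to by BUMBLEBEE_CACHE_DIR across CI runs, so the tiny checkpoints are fetched from the Hub only when the cache key misses. Roughly how the library resolves that directory, as a simplified sketch rather than the actual implementation:

# Simplified sketch of cache directory resolution (see Bumblebee.cache_dir/0).
cache_dir =
  case System.get_env("BUMBLEBEE_CACHE_DIR") do
    nil -> :filename.basedir(:user_cache, "bumblebee")
    dir -> Path.expand(dir)
  end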
6 changes: 3 additions & 3 deletions lib/bumblebee.ex
@@ -124,10 +124,11 @@ defmodule Bumblebee do
{Bumblebee.Text.Distilbert, :for_sequence_classification},
"DistilBertForQuestionAnswering" => {Bumblebee.Text.Distilbert, :for_question_answering},
"DistilBertForTokenClassification" => {Bumblebee.Text.Distilbert, :for_token_classification},
"DistilBertForMultipleChoice" => {Bumblebee.Text.Distilbert, :for_multiple_choice},
"GPT2ForSequenceClassification" => {Bumblebee.Text.Gpt2, :for_sequence_classification},
"GPT2ForTokenClassification" => {Bumblebee.Text.Gpt2, :for_token_classification},
"GPT2LMHeadModel" => {Bumblebee.Text.Gpt2, :for_causal_language_modeling},
"GPT2Model" => {BumbleBee.Text.Gpt2, :base},
"GPT2Model" => {Bumblebee.Text.Gpt2, :base},
"GPTBigCodeModel" => {Bumblebee.Text.GptBigCode, :base},
"GPTBigCodeForCausalLM" => {Bumblebee.Text.GptBigCode, :for_causal_language_modeling},
"GPTBigCodeForSequenceClassification" =>
@@ -137,8 +138,7 @@
"GPTNeoXForCausalLM" => {Bumblebee.Text.GptNeoX, :for_causal_language_modeling},
"GPTNeoXForSequenceClassification" => {Bumblebee.Text.GptNeoX, :for_sequence_classification},
"GPTNeoXForTokenClassification" => {Bumblebee.Text.GptNeoX, :for_token_classification},
"LayoutLMForMaskedLanguageModeling" =>
{Bumblebee.Multimodal.LayoutLm, :for_masked_language_modeling},
"LayoutLMForMaskedLM" => {Bumblebee.Multimodal.LayoutLm, :for_masked_language_modeling},
"LayoutLMForQuestionAnswering" => {Bumblebee.Multimodal.LayoutLm, :for_question_answering},
"LayoutLMForSequenceClassification" =>
{Bumblebee.Multimodal.LayoutLm, :for_sequence_classification},
2 changes: 1 addition & 1 deletion lib/bumblebee/huggingface/hub.ex
@@ -51,7 +51,7 @@ defmodule Bumblebee.HuggingFace.Hub do
@spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()}
def cached_download(url, opts \\ []) do
cache_dir = opts[:cache_dir] || Bumblebee.cache_dir()
offline = opts[:offline] || bumblebee_offline?()
offline = Keyword.get(opts, :offline, bumblebee_offline?())
auth_token = opts[:auth_token]

dir = Path.join(cache_dir, "huggingface")
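The switch from || to Keyword.get/3 matters when a caller passes offline: false explicitly: opts[:offline] || bumblebee_offline?() treats false like a missing option and still falls back to the environment setting, whereas Keyword.get/3 applies the default only when the key is absent. A quick illustration, with true standing in for the env-derived default:

opts = [offline: false]

# Old behaviour: an explicit false is discarded and the default wins.
opts[:offline] || true            #=> true

# New behaviour: the default applies only when :offline is not given at all.
Keyword.get(opts, :offline, true) #=> false
Keyword.get([], :offline, true)   #=> true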
9 changes: 2 additions & 7 deletions lib/bumblebee/multimodal/layout_lm.ex
@@ -255,12 +255,7 @@ defmodule Bumblebee.Multimodal.LayoutLm do
outputs = core(inputs, spec)

logits =
outputs.hidden_state
|> Axon.dropout(
rate: classifier_dropout_rate(spec),
name: "question_answering_head.dropout"
)
|> Axon.dense(2,
Axon.dense(outputs.hidden_state, 2,
kernel_initializer: kernel_initializer(spec),
name: "question_answering_head.output"
)
@@ -542,7 +537,7 @@ defmodule Bumblebee.Multimodal.LayoutLm do
"language_modeling_head.norm" => "cls.predictions.transform.LayerNorm",
"language_modeling_head.output" => "cls.predictions.decoder",
"language_modeling_head.bias" => "cls.predictions",
"sequence_classification_head.output" => "cls.seq_relationship",
"sequence_classification_head.output" => "classifier",
"token_classification_head.output" => "classifier",
"multiple_choice_head.output" => "classifier",
"question_answering_head.output" => "qa_outputs"
8 changes: 6 additions & 2 deletions lib/bumblebee/text/bart.ex
@@ -652,7 +652,7 @@ defmodule Bumblebee.Text.Bart do
end

defimpl Bumblebee.HuggingFace.Transformers.Model do
def params_mapping(_spec) do
def params_mapping(spec) do
%{
"encoder_embedder.token_embedding" => "model.encoder.embed_tokens",
"encoder_embedder.position_embedding" => "model.encoder.embed_positions",
@@ -690,7 +690,11 @@ defmodule Bumblebee.Text.Bart do
"decoder.blocks.{n}.ffn.intermediate" => "model.decoder.layers.{n}.fc1",
"decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2",
"decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm",
"language_modeling_head.output" => "model.shared",
"language_modeling_head.output" =>
case spec.architecture do
:for_causal_language_modeling -> "lm_head"
_other -> "model.shared"
end,
"language_modeling_head.logits_bias" => %{
"bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end}
},
7 changes: 1 addition & 6 deletions lib/bumblebee/text/bert.ex
@@ -272,12 +272,7 @@ defmodule Bumblebee.Text.Bert do
outputs = core(inputs, spec)

logits =
outputs.hidden_state
|> Axon.dropout(
rate: classifier_dropout_rate(spec),
name: "question_answering_head.dropout"
)
|> Axon.dense(2,
Axon.dense(outputs.hidden_state, 2,
kernel_initializer: kernel_initializer(spec),
name: "question_answering_head.output"
)
6 changes: 3 additions & 3 deletions lib/bumblebee/text/blip_text.ex
@@ -387,8 +387,8 @@ defmodule Bumblebee.Text.BlipText do
end

defimpl Bumblebee.HuggingFace.Transformers.Config do
# Support loading from the entire Clip configuration
def load(spec, %{"model_type" => "clip", "text_config" => data}) do
# Support loading from the entire Blip configuration
def load(spec, %{"model_type" => "blip", "text_config" => data}) do
load(spec, data)
end

@@ -419,7 +419,7 @@ defmodule Bumblebee.Text.BlipText do
def params_mapping(spec) do
prefix =
case spec.architecture do
:base -> "text_encoder."
:base -> "text_model."
:for_causal_language_modeling -> "text_decoder.bert."
end

27 changes: 11 additions & 16 deletions lib/bumblebee/text/distilbert.ex
@@ -52,10 +52,6 @@ defmodule Bumblebee.Text.Distilbert do
doc:
"the dropout rate for the classification head. If not specified, the value of `:dropout_rate` is used instead"
],
layer_norm_epsilon: [
default: 1.0e-12,
doc: "the epsilon used by the layer normalization layers"
],
initializer_scale: [
default: 0.02,
doc:
@@ -361,7 +357,7 @@ defmodule Bumblebee.Text.Distilbert do
)

Axon.add([inputs_embeddings, position_embeddings])
|> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
|> Axon.layer_norm(epsilon: 1.0e-12, name: join(name, "norm"))
|> Axon.dropout(rate: spec.dropout_rate, name: join(name, "dropout"))
end

dropout_rate: spec.dropout_rate,
attention_dropout_rate: spec.attention_dropout_rate,
layer_norm: [
epsilon: spec.layer_norm_epsilon
epsilon: 1.0e-12
],
ffn: [
intermediate_size: spec.intermediate_size,
Expand Down Expand Up @@ -421,7 +417,7 @@ defmodule Bumblebee.Text.Distilbert do
name: join(name, "dense")
)
|> Layers.activation(spec.activation, name: join(name, "activation"))
|> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
|> Axon.layer_norm(epsilon: 1.0e-12, name: join(name, "norm"))
# We reuse the kernel of input embeddings and add bias for each token
|> Layers.dense_transposed(spec.vocab_size,
kernel_initializer: kernel_initializer(spec),
convert!(data,
vocab_size: {"vocab_size", number()},
max_positions: {"max_position_embeddings", number()},
hidden_size: {"hidden_size", number()},
num_blocks: {"num_hidden_layers", number()},
num_attention_heads: {"num_attention_heads", number()},
intermediate_size: {"intermediate_size", number()},
activation: {"hidden_act", activation()},
dropout_rate: {"hidden_dropout_prob", number()},
attention_dropout_rate: {"attention_probs_dropout_prob", number()},
classifier_dropout_rate: {"classifier_dropout", optional(number())},
layer_norm_epsilon: {"layer_norm_eps", number()},
hidden_size: {"dim", number()},
num_blocks: {"n_layers", number()},
num_attention_heads: {"n_heads", number()},
intermediate_size: {"hidden_dim", number()},
activation: {"activation", activation()},
dropout_rate: {"dropout", number()},
attention_dropout_rate: {"attention_dropout", number()},
classifier_dropout_rate: {"seq_classif_dropout", optional(number())},
initializer_scale: {"initializer_range", number()}
) ++ Shared.common_options_from_transformers(data, spec)

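DistilBERT checkpoints use their own configuration field names rather than the BERT-style ones, so the previous conversion keys never matched and the spec silently kept its defaults. For reference, an approximate excerpt of a distilbert-base-uncased config.json written as an Elixir map; the values are quoted from memory and should be treated as illustrative:

# Approximate DistilBERT configuration fields and their BERT-style counterparts.
%{
  "dim" => 768,                # hidden_size
  "n_layers" => 6,             # num_hidden_layers
  "n_heads" => 12,             # num_attention_heads
  "hidden_dim" => 3072,        # intermediate_size
  "activation" => "gelu",      # hidden_act
  "dropout" => 0.1,            # hidden_dropout_prob
  "attention_dropout" => 0.1,  # attention_probs_dropout_prob
  "seq_classif_dropout" => 0.2 # classifier_dropout
}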
3 changes: 2 additions & 1 deletion lib/bumblebee/text/llama.ex
@@ -209,7 +209,8 @@ defmodule Bumblebee.Text.Llama do
logits =
Axon.dense(outputs.hidden_state, spec.num_labels,
kernel_initializer: kernel_initializer(spec),
name: "sequence_classification_head.output"
name: "sequence_classification_head.output",
use_bias: false
)

pooled_logits =
8 changes: 6 additions & 2 deletions lib/bumblebee/text/mbart.ex
@@ -690,7 +690,7 @@ defmodule Bumblebee.Text.Mbart do
end

defimpl Bumblebee.HuggingFace.Transformers.Model do
def params_mapping(_spec) do
def params_mapping(spec) do
%{
"encoder_embedder.token_embedding" => "model.encoder.embed_tokens",
"encoder_embedder.position_embedding" => "model.encoder.embed_positions",
@@ -730,7 +730,11 @@ defmodule Bumblebee.Text.Mbart do
"decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2",
"decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm",
"decoder.norm" => "model.decoder.layer_norm",
"language_modeling_head.output" => "model.shared",
"language_modeling_head.output" =>
case spec.architecture do
:for_causal_language_modeling -> "lm_head"
_other -> "model.shared"
end,
"language_modeling_head.logits_bias" => %{
"bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end}
},
3 changes: 2 additions & 1 deletion lib/bumblebee/text/mistral.ex
@@ -202,7 +202,8 @@ defmodule Bumblebee.Text.Mistral do
logits =
Axon.dense(outputs.hidden_state, spec.num_labels,
kernel_initializer: kernel_initializer(spec),
name: "sequence_classification_head.output"
name: "sequence_classification_head.output",
use_bias: false
)

pooled_logits =
7 changes: 1 addition & 6 deletions lib/bumblebee/text/roberta.ex
@@ -262,12 +262,7 @@ defmodule Bumblebee.Text.Roberta do
outputs = core(inputs, spec)

logits =
outputs.hidden_state
|> Axon.dropout(
rate: classifier_dropout_rate(spec),
name: "question_answering_head.dropout"
)
|> Axon.dense(2,
Axon.dense(outputs.hidden_state, 2,
kernel_initializer: kernel_initializer(spec),
name: "question_answering_head.output"
)
10 changes: 3 additions & 7 deletions lib/bumblebee/text/t5.ex
@@ -580,9 +580,8 @@ defmodule Bumblebee.Text.T5 do
defimpl Bumblebee.HuggingFace.Transformers.Model do
def params_mapping(spec) do
%{
# encoder
"encoder_embedder.token_embedding" =>
if(spec.tie_word_embeddings, do: "shared", else: "encoder.embed_tokens"),
# Encoder and decoder embeddings are always shared
"encoder_embedder.token_embedding" => "shared",
"encoder.blocks.{n}.self_attention_norm" => "encoder.block.{n}.layer.0.layer_norm",
"encoder.blocks.{n}.self_attention.query" => "encoder.block.{n}.layer.0.SelfAttention.q",
"encoder.blocks.{n}.self_attention.key" => "encoder.block.{n}.layer.0.SelfAttention.k",
),
"encoder.blocks.{n}.ffn.output" => "encoder.block.{n}.layer.1.DenseReluDense.wo",
"encoder.output_norm" => "encoder.final_layer_norm",
# decoder
"decoder_embedder.token_embedding" =>
if(spec.tie_word_embeddings, do: "shared", else: "decoder.embed_tokens"),
"decoder_embedder.token_embedding" => "shared",
"decoder.blocks.{n}.self_attention_norm" => "decoder.block.{n}.layer.0.layer_norm",
"decoder.blocks.{n}.self_attention.query" => "decoder.block.{n}.layer.0.SelfAttention.q",
"decoder.blocks.{n}.self_attention.key" => "decoder.block.{n}.layer.0.SelfAttention.k",
),
"decoder.blocks.{n}.ffn.output" => "decoder.block.{n}.layer.2.DenseReluDense.wo",
"decoder.output_norm" => "decoder.final_layer_norm",
# language modeling
"language_modeling_head.output" =>
if(spec.tie_word_embeddings, do: "shared", else: "lm_head")
}
2 changes: 1 addition & 1 deletion lib/bumblebee/vision/resnet.ex
@@ -196,7 +196,7 @@ defmodule Bumblebee.Vision.ResNet do
name: join(name, "blocks.0")
)

for idx <- 1..(depth - 1), reduce: hidden_state do
for idx <- 1..(depth - 1)//1, reduce: hidden_state do
hidden_state ->
residual_block.(hidden_state, out_channels, out_channels,
activation: spec.activation,
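The //1 step most likely guards the depth == 1 case: 1..(depth - 1) then becomes the decreasing range 1..0, which would run the reducer an extra time, while 1..0//1 is empty and leaves hidden_state unchanged. In plain Elixir:

depth = 1

Enum.to_list(1..(depth - 1))     #=> [1, 0] (default step is -1 when the bounds decrease)
Enum.to_list(1..(depth - 1)//1)  #=> [] (empty, so the reduce returns hidden_state untouched)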