Rewrite tests to use tiny model checkpoints (#297)
jonatanklosko authored Nov 30, 2023
1 parent cf4dff6 commit 21832f3
Showing 81 changed files with 3,680 additions and 3,810 deletions.
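Most of the changed files are test modules rewritten to load tiny random checkpoints instead of full pretrained weights, which keeps CI downloads and memory small. A minimal sketch of the resulting test pattern, assuming one of the commonly used hf-internal-testing/tiny-random-* repositories and a made-up hidden size of 32 (the actual tests may pin different repositories and also assert on expected output values):

# Hypothetical test shape after the rewrite; the repo name and hidden size are assumptions.
defmodule Bumblebee.Text.BertTest do
  use ExUnit.Case, async: true

  test ":base architecture" do
    assert {:ok, %{model: model, params: params, spec: spec}} =
             Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-BertModel"})

    assert %Bumblebee.Text.Bert{architecture: :base} = spec

    inputs = %{
      "input_ids" => Nx.tensor([[10, 20, 30]]),
      "attention_mask" => Nx.tensor([[1, 1, 1]])
    }

    outputs = Axon.predict(model, params, inputs)

    # With random weights the meaningful assertions are on output shapes.
    assert Nx.shape(outputs.hidden_state) == {1, 3, 32}
  end
end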
14 changes: 14 additions & 0 deletions .github/workflows/test.yaml
@@ -46,7 +46,21 @@ jobs:
- run: mix deps.compile
- run: mix compile --warnings-as-errors
if: ${{ matrix.lint }}
- name: Restore bumblebee cache
id: cache-bumblebee-restore
uses: actions/cache/restore@v3
with:
path: bumblebee_cache
key: ${{ runner.os }}-bumblebee-cache-${{ matrix.pair.elixir }}-${{ matrix.pair.otp }}
- run: mix test
env:
BUMBLEBEE_CACHE_DIR: ${{ github.workspace }}/bumblebee_cache
- name: Save bumblebee cache
id: cache-bumblebee-save
uses: actions/cache/save@v3
with:
path: bumblebee_cache
key: ${{ steps.cache-bumblebee-restore.outputs.cache-primary-key }}
- uses: technote-space/get-diff-action@v6
with:
PATTERNS: test/**/*_test.exs
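The restore/save pair caches the directory pointed to by BUMBLEBEE_CACHE_DIR across CI runs, so the tiny checkpoints are fetched from the Hub only when the cache key misses. Roughly how the library resolves that directory, as a simplified sketch rather than the actual implementation:

# Simplified sketch of cache directory resolution (see Bumblebee.cache_dir/0).
cache_dir =
  case System.get_env("BUMBLEBEE_CACHE_DIR") do
    nil -> :filename.basedir(:user_cache, "bumblebee")
    dir -> Path.expand(dir)
  end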
6 changes: 3 additions & 3 deletions lib/bumblebee.ex
@@ -124,10 +124,11 @@ defmodule Bumblebee do
{Bumblebee.Text.Distilbert, :for_sequence_classification},
"DistilBertForQuestionAnswering" => {Bumblebee.Text.Distilbert, :for_question_answering},
"DistilBertForTokenClassification" => {Bumblebee.Text.Distilbert, :for_token_classification},
"DistilBertForMultipleChoice" => {Bumblebee.Text.Distilbert, :for_multiple_choice},
"GPT2ForSequenceClassification" => {Bumblebee.Text.Gpt2, :for_sequence_classification},
"GPT2ForTokenClassification" => {Bumblebee.Text.Gpt2, :for_token_classification},
"GPT2LMHeadModel" => {Bumblebee.Text.Gpt2, :for_causal_language_modeling},
"GPT2Model" => {BumbleBee.Text.Gpt2, :base},
"GPT2Model" => {Bumblebee.Text.Gpt2, :base},
"GPTBigCodeModel" => {Bumblebee.Text.GptBigCode, :base},
"GPTBigCodeForCausalLM" => {Bumblebee.Text.GptBigCode, :for_causal_language_modeling},
"GPTBigCodeForSequenceClassification" =>
@@ -137,8 +138,7 @@
"GPTNeoXForCausalLM" => {Bumblebee.Text.GptNeoX, :for_causal_language_modeling},
"GPTNeoXForSequenceClassification" => {Bumblebee.Text.GptNeoX, :for_sequence_classification},
"GPTNeoXForTokenClassification" => {Bumblebee.Text.GptNeoX, :for_token_classification},
"LayoutLMForMaskedLanguageModeling" =>
{Bumblebee.Multimodal.LayoutLm, :for_masked_language_modeling},
"LayoutLMForMaskedLM" => {Bumblebee.Multimodal.LayoutLm, :for_masked_language_modeling},
"LayoutLMForQuestionAnswering" => {Bumblebee.Multimodal.LayoutLm, :for_question_answering},
"LayoutLMForSequenceClassification" =>
{Bumblebee.Multimodal.LayoutLm, :for_sequence_classification},
2 changes: 1 addition & 1 deletion lib/bumblebee/huggingface/hub.ex
@@ -51,7 +51,7 @@ defmodule Bumblebee.HuggingFace.Hub do
@spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()}
def cached_download(url, opts \\ []) do
cache_dir = opts[:cache_dir] || Bumblebee.cache_dir()
offline = opts[:offline] || bumblebee_offline?()
offline = Keyword.get(opts, :offline, bumblebee_offline?())
auth_token = opts[:auth_token]

dir = Path.join(cache_dir, "huggingface")
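The switch from || to Keyword.get/3 matters when a caller passes offline: false explicitly: opts[:offline] || bumblebee_offline?() treats false like a missing option and still falls back to the environment setting, whereas Keyword.get/3 applies the default only when the key is absent. A quick illustration, with true standing in for the env-derived default:

opts = [offline: false]

# Old behaviour: an explicit false is discarded and the default wins.
opts[:offline] || true            #=> true

# New behaviour: the default applies only when :offline is not given at all.
Keyword.get(opts, :offline, true) #=> false
Keyword.get([], :offline, true)   #=> true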
9 changes: 2 additions & 7 deletions lib/bumblebee/multimodal/layout_lm.ex
@@ -255,12 +255,7 @@ defmodule Bumblebee.Multimodal.LayoutLm do
outputs = core(inputs, spec)

logits =
outputs.hidden_state
|> Axon.dropout(
rate: classifier_dropout_rate(spec),
name: "question_answering_head.dropout"
)
|> Axon.dense(2,
Axon.dense(outputs.hidden_state, 2,
kernel_initializer: kernel_initializer(spec),
name: "question_answering_head.output"
)
@@ -542,7 +537,7 @@ defmodule Bumblebee.Multimodal.LayoutLm do
"language_modeling_head.norm" => "cls.predictions.transform.LayerNorm",
"language_modeling_head.output" => "cls.predictions.decoder",
"language_modeling_head.bias" => "cls.predictions",
"sequence_classification_head.output" => "cls.seq_relationship",
"sequence_classification_head.output" => "classifier",
"token_classification_head.output" => "classifier",
"multiple_choice_head.output" => "classifier",
"question_answering_head.output" => "qa_outputs"
8 changes: 6 additions & 2 deletions lib/bumblebee/text/bart.ex
@@ -652,7 +652,7 @@ defmodule Bumblebee.Text.Bart do
end

defimpl Bumblebee.HuggingFace.Transformers.Model do
def params_mapping(_spec) do
def params_mapping(spec) do
%{
"encoder_embedder.token_embedding" => "model.encoder.embed_tokens",
"encoder_embedder.position_embedding" => "model.encoder.embed_positions",
@@ -690,7 +690,11 @@ defmodule Bumblebee.Text.Bart do
"decoder.blocks.{n}.ffn.intermediate" => "model.decoder.layers.{n}.fc1",
"decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2",
"decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm",
"language_modeling_head.output" => "model.shared",
"language_modeling_head.output" =>
case spec.architecture do
:for_causal_language_modeling -> "lm_head"
_other -> "model.shared"
end,
"language_modeling_head.logits_bias" => %{
"bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end}
},
7 changes: 1 addition & 6 deletions lib/bumblebee/text/bert.ex
@@ -272,12 +272,7 @@ defmodule Bumblebee.Text.Bert do
outputs = core(inputs, spec)

logits =
outputs.hidden_state
|> Axon.dropout(
rate: classifier_dropout_rate(spec),
name: "question_answering_head.dropout"
)
|> Axon.dense(2,
Axon.dense(outputs.hidden_state, 2,
kernel_initializer: kernel_initializer(spec),
name: "question_answering_head.output"
)
6 changes: 3 additions & 3 deletions lib/bumblebee/text/blip_text.ex
@@ -387,8 +387,8 @@ defmodule Bumblebee.Text.BlipText do
end

defimpl Bumblebee.HuggingFace.Transformers.Config do
# Support loading from the entire Clip configuration
def load(spec, %{"model_type" => "clip", "text_config" => data}) do
# Support loading from the entire Blip configuration
def load(spec, %{"model_type" => "blip", "text_config" => data}) do
load(spec, data)
end

@@ -419,7 +419,7 @@ defmodule Bumblebee.Text.BlipText do
def params_mapping(spec) do
prefix =
case spec.architecture do
:base -> "text_encoder."
:base -> "text_model."
:for_causal_language_modeling -> "text_decoder.bert."
end

27 changes: 11 additions & 16 deletions lib/bumblebee/text/distilbert.ex
@@ -52,10 +52,6 @@ defmodule Bumblebee.Text.Distilbert do
doc:
"the dropout rate for the classification head. If not specified, the value of `:dropout_rate` is used instead"
],
layer_norm_epsilon: [
default: 1.0e-12,
doc: "the epsilon used by the layer normalization layers"
],
initializer_scale: [
default: 0.02,
doc:
@@ -361,7 +357,7 @@ defmodule Bumblebee.Text.Distilbert do
)

Axon.add([inputs_embeddings, position_embeddings])
|> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
|> Axon.layer_norm(epsilon: 1.0e-12, name: join(name, "norm"))
|> Axon.dropout(rate: spec.dropout_rate, name: join(name, "dropout"))
end

dropout_rate: spec.dropout_rate,
attention_dropout_rate: spec.attention_dropout_rate,
layer_norm: [
epsilon: spec.layer_norm_epsilon
epsilon: 1.0e-12
],
ffn: [
intermediate_size: spec.intermediate_size,
Expand Down Expand Up @@ -421,7 +417,7 @@ defmodule Bumblebee.Text.Distilbert do
name: join(name, "dense")
)
|> Layers.activation(spec.activation, name: join(name, "activation"))
|> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "norm"))
|> Axon.layer_norm(epsilon: 1.0e-12, name: join(name, "norm"))
# We reuse the kernel of input embeddings and add bias for each token
|> Layers.dense_transposed(spec.vocab_size,
kernel_initializer: kernel_initializer(spec),
convert!(data,
vocab_size: {"vocab_size", number()},
max_positions: {"max_position_embeddings", number()},
hidden_size: {"hidden_size", number()},
num_blocks: {"num_hidden_layers", number()},
num_attention_heads: {"num_attention_heads", number()},
intermediate_size: {"intermediate_size", number()},
activation: {"hidden_act", activation()},
dropout_rate: {"hidden_dropout_prob", number()},
attention_dropout_rate: {"attention_probs_dropout_prob", number()},
classifier_dropout_rate: {"classifier_dropout", optional(number())},
layer_norm_epsilon: {"layer_norm_eps", number()},
hidden_size: {"dim", number()},
num_blocks: {"n_layers", number()},
num_attention_heads: {"n_heads", number()},
intermediate_size: {"hidden_dim", number()},
activation: {"activation", activation()},
dropout_rate: {"dropout", number()},
attention_dropout_rate: {"attention_dropout", number()},
classifier_dropout_rate: {"seq_classif_dropout", optional(number())},
initializer_scale: {"initializer_range", number()}
) ++ Shared.common_options_from_transformers(data, spec)

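DistilBERT checkpoints use their own configuration field names rather than the BERT-style ones, so the previous conversion keys never matched and the spec silently kept its defaults. For reference, an approximate excerpt of a distilbert-base-uncased config.json written as an Elixir map; the values are quoted from memory and should be treated as illustrative:

# Approximate DistilBERT configuration fields and their BERT-style counterparts.
%{
  "dim" => 768,                # hidden_size
  "n_layers" => 6,             # num_hidden_layers
  "n_heads" => 12,             # num_attention_heads
  "hidden_dim" => 3072,        # intermediate_size
  "activation" => "gelu",      # hidden_act
  "dropout" => 0.1,            # hidden_dropout_prob
  "attention_dropout" => 0.1,  # attention_probs_dropout_prob
  "seq_classif_dropout" => 0.2 # classifier_dropout
}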
3 changes: 2 additions & 1 deletion lib/bumblebee/text/llama.ex
@@ -209,7 +209,8 @@ defmodule Bumblebee.Text.Llama do
logits =
Axon.dense(outputs.hidden_state, spec.num_labels,
kernel_initializer: kernel_initializer(spec),
name: "sequence_classification_head.output"
name: "sequence_classification_head.output",
use_bias: false
)

pooled_logits =
8 changes: 6 additions & 2 deletions lib/bumblebee/text/mbart.ex
@@ -690,7 +690,7 @@ defmodule Bumblebee.Text.Mbart do
end

defimpl Bumblebee.HuggingFace.Transformers.Model do
def params_mapping(_spec) do
def params_mapping(spec) do
%{
"encoder_embedder.token_embedding" => "model.encoder.embed_tokens",
"encoder_embedder.position_embedding" => "model.encoder.embed_positions",
@@ -730,7 +730,11 @@ defmodule Bumblebee.Text.Mbart do
"decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2",
"decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm",
"decoder.norm" => "model.decoder.layer_norm",
"language_modeling_head.output" => "model.shared",
"language_modeling_head.output" =>
case spec.architecture do
:for_causal_language_modeling -> "lm_head"
_other -> "model.shared"
end,
"language_modeling_head.logits_bias" => %{
"bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end}
},
3 changes: 2 additions & 1 deletion lib/bumblebee/text/mistral.ex
@@ -202,7 +202,8 @@ defmodule Bumblebee.Text.Mistral do
logits =
Axon.dense(outputs.hidden_state, spec.num_labels,
kernel_initializer: kernel_initializer(spec),
name: "sequence_classification_head.output"
name: "sequence_classification_head.output",
use_bias: false
)

pooled_logits =
7 changes: 1 addition & 6 deletions lib/bumblebee/text/roberta.ex
@@ -262,12 +262,7 @@ defmodule Bumblebee.Text.Roberta do
outputs = core(inputs, spec)

logits =
outputs.hidden_state
|> Axon.dropout(
rate: classifier_dropout_rate(spec),
name: "question_answering_head.dropout"
)
|> Axon.dense(2,
Axon.dense(outputs.hidden_state, 2,
kernel_initializer: kernel_initializer(spec),
name: "question_answering_head.output"
)
10 changes: 3 additions & 7 deletions lib/bumblebee/text/t5.ex
@@ -580,9 +580,8 @@ defmodule Bumblebee.Text.T5 do
defimpl Bumblebee.HuggingFace.Transformers.Model do
def params_mapping(spec) do
%{
# encoder
"encoder_embedder.token_embedding" =>
if(spec.tie_word_embeddings, do: "shared", else: "encoder.embed_tokens"),
# Encoder and decoder embeddings are always shared
"encoder_embedder.token_embedding" => "shared",
"encoder.blocks.{n}.self_attention_norm" => "encoder.block.{n}.layer.0.layer_norm",
"encoder.blocks.{n}.self_attention.query" => "encoder.block.{n}.layer.0.SelfAttention.q",
"encoder.blocks.{n}.self_attention.key" => "encoder.block.{n}.layer.0.SelfAttention.k",
),
"encoder.blocks.{n}.ffn.output" => "encoder.block.{n}.layer.1.DenseReluDense.wo",
"encoder.output_norm" => "encoder.final_layer_norm",
# decoder
"decoder_embedder.token_embedding" =>
if(spec.tie_word_embeddings, do: "shared", else: "decoder.embed_tokens"),
"decoder_embedder.token_embedding" => "shared",
"decoder.blocks.{n}.self_attention_norm" => "decoder.block.{n}.layer.0.layer_norm",
"decoder.blocks.{n}.self_attention.query" => "decoder.block.{n}.layer.0.SelfAttention.q",
"decoder.blocks.{n}.self_attention.key" => "decoder.block.{n}.layer.0.SelfAttention.k",
),
"decoder.blocks.{n}.ffn.output" => "decoder.block.{n}.layer.2.DenseReluDense.wo",
"decoder.output_norm" => "decoder.final_layer_norm",
# language modeling
"language_modeling_head.output" =>
if(spec.tie_word_embeddings, do: "shared", else: "lm_head")
}
2 changes: 1 addition & 1 deletion lib/bumblebee/vision/resnet.ex
@@ -196,7 +196,7 @@ defmodule Bumblebee.Vision.ResNet do
name: join(name, "blocks.0")
)

for idx <- 1..(depth - 1), reduce: hidden_state do
for idx <- 1..(depth - 1)//1, reduce: hidden_state do
hidden_state ->
residual_block.(hidden_state, out_channels, out_channels,
activation: spec.activation,
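The //1 step most likely guards the depth == 1 case: 1..(depth - 1) then becomes the decreasing range 1..0, which would run the reducer an extra time, while 1..0//1 is empty and leaves hidden_state unchanged. In plain Elixir:

depth = 1

Enum.to_list(1..(depth - 1))     #=> [1, 0] (default step is -1 when the bounds decrease)
Enum.to_list(1..(depth - 1)//1)  #=> [] (empty, so the reduce returns hidden_state untouched)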