Release 3.1.1 · huggingface/transformers.js

🤖 New models

Add support for Idefics3 (SmolVLM) in #1059

import {
  AutoProcessor,
  AutoModelForVision2Seq,
  load_image,
} from "@huggingface/transformers";

// Example: ask SmolVLM (Idefics3) to describe two images and print the generated text.

// Initialize processor and model.
// The two loads do not depend on each other, so they are started together
// instead of being awaited one after the other.
const model_id = "HuggingFaceTB/SmolVLM-Instruct";
const [processor, model] = await Promise.all([
  AutoProcessor.from_pretrained(model_id),
  AutoModelForVision2Seq.from_pretrained(model_id, {
    // Per-module precision / quantization; the trailing comments list the other options.
    dtype: {
      embed_tokens: "fp16", // "fp32", "fp16", "q8"
      vision_encoder: "q4", // "fp32", "fp16", "q8", "q4", "q4f16"
      decoder_model_merged: "q4", // "q8", "q4", "q4f16"
    },
  }),
]);

// Load images (fetched concurrently; the result order matches the URL order).
const [image1, image2] = await Promise.all([
  load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"),
  load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg"),
]);

// Create input messages.
// One `{ type: "image" }` entry per image passed to the processor below,
// followed by the text prompt.
const messages = [
  {
    role: "user",
    content: [
      { type: "image" },
      { type: "image" },
      { type: "text", text: "Can you describe the two images?" },
    ],
  },
];

// Prepare inputs
const text = processor.apply_chat_template(messages, { add_generation_prompt: true });
const inputs = await processor(text, [image1, image2], {
  // Set `do_image_splitting: true` to split images into multiple patches.
  // NOTE: This uses more memory, but can provide more accurate results.
  do_image_splitting: false,
});

// Generate outputs
const generated_ids = await model.generate({
  ...inputs,
  max_new_tokens: 500,
});

// Decode only the positions after the first `input_ids.dims.at(-1)` entries of the
// last axis — presumably the prompt tokens, so the printed text holds just the
// newly generated answer (confirm against the Tensor.slice documentation).
const generated_texts = processor.batch_decode(
  generated_ids.slice(null, [inputs.input_ids.dims.at(-1), null]),
  { skip_special_tokens: true },
);
console.log(generated_texts[0]);
// ' In the first image, there is a green statue of liberty on a pedestal in the middle of the water. The water is surrounded by trees and buildings in the background. In the second image, there are pink and red flowers with a bee on the pink flower.'

🐛 Bug fixes

Fix repetition penalty logits processor in #1062
Fix optional chaining for batch size calculation in PreTrainedModel by @emojiiii in #1063

📝 Documentation improvements

Add an example and type enhancement for TextStreamer by @seonglae in #1066
The smallest typo fix for webgpu.md by @JoramMillenaar in #1068

🛠️ Other improvements

Only log warning if type not explicitly set to "custom" in #1061
Improve browser vs. webworker detection in #1067

🤗 New contributors

@emojiiii made their first contribution in #1063
@seonglae made their first contribution in #1066
@JoramMillenaar made their first contribution in #1068

Full Changelog: 3.1.0...3.1.1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

3.1.1

🤖 New models

🐛 Bug fixes

📝 Documentation improvements

🛠️ Other improvements

🤗 New contributors

Contributors