From e28ac7a106ac7ebf625e6d347286b0a7bd40c104 Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Sun, 24 Nov 2024 23:58:22 +0000
Subject: [PATCH] Add `Qwen2VLForConditionalGeneration` unit tests

---
 tests/tiny_random.test.js | 91 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/tests/tiny_random.test.js b/tests/tiny_random.test.js
index be87f555d..12f2d797f 100644
--- a/tests/tiny_random.test.js
+++ b/tests/tiny_random.test.js
@@ -52,6 +52,7 @@ import {
   WhisperForConditionalGeneration,
   VisionEncoderDecoderModel,
   Florence2ForConditionalGeneration,
+  Qwen2VLForConditionalGeneration,
   MarianMTModel,
 
   // Pipelines
@@ -833,6 +834,96 @@ describe("Tiny random models", () => {
     });
   });
 
+  describe("qwen2_vl", () => {
+    const CONVERSATION = [
+      {
+        role: "user",
+        content: [{ type: "text", text: "Hello" }],
+      },
+    ];
+
+    // Example adapted from https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
+    const CONVERSATION_WITH_IMAGE = [
+      {
+        role: "user",
+        content: [{ type: "image" }, { type: "text", text: "Describe this image." }],
+      },
+    ];
+    // Empty white image
+    const dims = [224, 224, 3];
+    const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims);
+
+    describe("Qwen2VLForConditionalGeneration", () => {
+      const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration";
+
+      /** @type {Qwen2VLForConditionalGeneration} */
+      let model;
+      /** @type {Qwen2VLProcessor} */
+      let processor;
+      beforeAll(async () => {
+        model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, {
+          // TODO move to config
+          ...DEFAULT_MODEL_OPTIONS,
+        });
+        processor = await AutoProcessor.from_pretrained(model_id);
+      }, MAX_MODEL_LOAD_TIME);
+
+      it(
+        "forward",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text, image);
+          const { logits } = await model(inputs);
+          expect(logits.dims).toEqual([1, 89, 152064]);
+          expect(logits.mean().item()).toBeCloseTo(-0.0011299321195110679, 5);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      it(
+        "text-only (batch_size=1)",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text);
+          const generate_ids = await model.generate({
+            ...inputs,
+            max_new_tokens: 10,
+          });
+
+          const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
+          expect(new_tokens.tolist()).toEqual([[24284n, 63986n, 108860n, 84530n, 8889n, 23262n, 128276n, 64948n, 136757n, 138348n]]);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      it(
+        "text + image (batch_size=1)",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text, image);
+          const generate_ids = await model.generate({
+            ...inputs,
+            max_new_tokens: 10,
+          });
+
+          const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
+          expect(new_tokens.tolist()).toEqual([[24284n, 35302n, 60575n, 38679n, 113390n, 115118n, 137596n, 38241n, 96726n, 142301n]]);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      afterAll(async () => {
+        await model?.dispose();
+      }, MAX_MODEL_DISPOSE_TIME);
+    });
+  });
+
   describe("vision-encoder-decoder", () => {
     describe("VisionEncoderDecoderModel", () => {
       const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2";
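
Note for reviewers: the generation tests above exercise the same call sequence a downstream user would write. Below is a minimal standalone sketch of that flow, not part of the patch; it assumes the `@huggingface/transformers` package entry point and a `processor.batch_decode` helper, and reuses the tiny-random checkpoint from the tests.

import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";

// Same tiny-random checkpoint as in the tests above (assumed available on the Hub).
const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration";
const processor = await AutoProcessor.from_pretrained(model_id);
const model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id);

// Render the chat template to a prompt string, then tokenize text and image together.
const conversation = [
  { role: "user", content: [{ type: "image" }, { type: "text", text: "Describe this image." }] },
];
const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
const image = new RawImage(new Uint8ClampedArray(224 * 224 * 3).fill(255), 224, 224, 3); // white 224x224 RGB
const inputs = await processor(text, image);

// Generate, then strip the prompt tokens exactly as the tests do.
const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 });
const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
console.log(processor.batch_decode(new_tokens, { skip_special_tokens: true }));

With a tiny-random model the decoded text is meaningless, which is why the tests pin the raw BigInt token ids rather than decoded strings.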