Qwen2 VL Integration (#72)
* initial integration

* update nb

* add qwen gptq and awq models

* add 72b models

* update readme

* update nb

* update nb
dnth authored Nov 20, 2024
1 parent 88e6181 commit e6c32cf
Showing 4 changed files with 345 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -442,6 +442,10 @@ See an example implementation of the Molmo model [here](https://github.com/dnth/
<td><a href="https://huggingface.co/microsoft/Florence-2-base-ft">Florence-2 Series</a></td>
<td><pre lang="python"><code>xinfer.create_model("microsoft/Florence-2-base-ft")</code></pre></td>
</tr>
<tr>
<td><a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct">Qwen2-VL Series</a></td>
<td><pre lang="python"><code>xinfer.create_model("Qwen/Qwen2-VL-2B-Instruct")</code></pre></td>
</tr>
</tbody>
</table>
</body>
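
Any of the Qwen2-VL variants listed further down can be passed to the same call. As a minimal sketch (the model ID comes from this commit's model table; loading an AWQ checkpoint additionally assumes the autoawq package is installed):

import xinfer

# Same API as the README row above, pointed at the AWQ-quantized 2B build.
model = xinfer.create_model("Qwen/Qwen2-VL-2B-Instruct-AWQ", device="cuda", dtype="auto")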
164 changes: 164 additions & 0 deletions nbs/qwen2vl.ipynb
@@ -0,0 +1,164 @@
In [1]:

import xinfer

xinfer.list_models("qwen2")

                               Available Models
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Implementation ┃ Model ID                             ┃ Input --> Output    ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8 │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct-AWQ       │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct           │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct-AWQ        │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct            │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct-AWQ        │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct            │ image-text --> text │
└────────────────┴──────────────────────────────────────┴─────────────────────┘
In [2]:

model = xinfer.create_model("Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4", device="cuda", dtype="auto")

stderr:

2024-11-20 23:07:01.171 | INFO     | xinfer.models:__init__:63 - Model: Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
2024-11-20 23:07:01.171 | INFO     | xinfer.models:__init__:64 - Device: cuda
2024-11-20 23:07:01.172 | INFO     | xinfer.models:__init__:65 - Dtype: auto
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/auto_gptq/nn_modules/triton_utils/kernels.py:411: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/auto_gptq/nn_modules/triton_utils/kernels.py:419: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/auto_gptq/nn_modules/triton_utils/kernels.py:461: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
CUDA extension not installed.
CUDA extension not installed.
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/transformers/modeling_utils.py:4779: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
In [3]:

image1 = "https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/000b9c365c9e307a.jpg"
image2 = "https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/00aa2580828a9009.jpg"

prompt = "Describe the image."

model.infer(image1, prompt).text

Out[3]:

'The image features a plush toy resembling a rabbit. The toy is dressed in a light-colored outfit with green and yellow floral patterns. It has a blue hat with a bow on top and is sitting on a fluffy, pink surface. The background is a solid pink color, providing a simple and clean backdrop that highlights the toy.'
In [4]:

model.infer_batch([image1, image2], prompt)

Out[4]:

[Result(categories=None, boxes=None, masks=None, poses=None, text='The image features a plush toy resembling a rabbit. The toy is dressed in a light-colored outfit with green and yellow floral patterns. It has a blue hat with a bow on top and is sitting on a fluffy, pink surface. The background is a solid pink color, providing a simple and clean backdrop that highlights the toy.'),
 Result(categories=None, boxes=None, masks=None, poses=None, text='The image depicts a lively street scene with a parade taking place. In the foreground, a man is clapping, while another person is gesturing with their hand. The street is lined with spectators, and there are several flags and vehicles participating in the parade. The background features a clock tower and a church with a steeple, indicating a town or city setting. The weather appears overcast, and the overall atmosphere is festive and community-oriented.')]
The notebook ends with an empty code cell; it was run on the xinfer kernel with Python 3.10.15 (nbformat 4).
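
For readers who want to see what the one-line infer call abstracts away, below is a minimal sketch of the Qwen2-VL generation flow in raw transformers. It is not code from this commit: the model class and processor calls follow the public Qwen2-VL usage recipe for transformers, and the actual wrapper lives in xinfer/transformers/qwen2_vl.py (not shown in this diff).

import requests
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load the checkpoint and its processor; dtype="auto" mirrors the
# notebook's create_model arguments.
model_id = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto").to("cuda")
processor = AutoProcessor.from_pretrained(model_id)

# Build a chat-template prompt with one image placeholder.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]
chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Fetch the same demo image used in the notebook and run generation.
url = "https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/000b9c365c9e307a.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=[chat_text], images=[image], return_tensors="pt").to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the prompt tokens so only the newly generated answer is decoded.
answer = processor.batch_decode(
    output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
)[0]
print(answer)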
1 change: 1 addition & 0 deletions xinfer/transformers/__init__.py
@@ -5,6 +5,7 @@
from .joycaption import JoyCaption
from .llama32 import Llama32Vision, Llama32VisionInstruct
from .moondream import Moondream
from .qwen2_vl import Qwen2VL
from .vision2seq import Vision2SeqModel
from .vlrm_blip2 import VLRMBlip2
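
The imported Qwen2VL class lives in xinfer/transformers/qwen2_vl.py, which this diff view does not expand. As an illustration only, here is a hypothetical skeleton of its public surface, inferred from the notebook usage above; the method bodies and internals are assumptions, not the committed implementation:

from typing import List


class Qwen2VL:
    """Hypothetical sketch of the wrapper's shape; not the actual source."""

    def __init__(self, model_id: str, device: str = "cpu", dtype: str = "auto"):
        # Would load the checkpoint and processor here, e.g. via transformers'
        # Qwen2VLForConditionalGeneration.from_pretrained(model_id).
        ...

    def infer(self, image: str, prompt: str):
        # Single-image convenience call; assumed to delegate to infer_batch.
        return self.infer_batch([image], prompt)[0]

    def infer_batch(self, images: List[str], prompt: str):
        # Would build chat-template inputs per image, run model.generate,
        # and wrap the decoded strings in Result objects.
        ...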
