Qwen2 VL Integration (#72)
* initial integration

* update nb

* add qwen gptq and awq models

* add 72b models

* update readme

* update nb

* update nb
dnth authored Nov 20, 2024
1 parent 88e6181 commit e6c32cf
Showing 4 changed files with 345 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -442,6 +442,10 @@ See an example implementation of the Molmo model [here](https://github.com/dnth/
<td><a href="https://huggingface.co/microsoft/Florence-2-base-ft">Florence-2 Series</a></td>
<td><pre lang="python"><code>xinfer.create_model("microsoft/Florence-2-base-ft")</code></pre></td>
</tr>
<tr>
<td><a href="https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct">Qwen2-VL Series</a></td>
<td><pre lang="python"><code>xinfer.create_model("Qwen/Qwen2-VL-2B-Instruct")</code></pre></td>
</tr>
</tbody>
</table>
</body>
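
Any of the Qwen2-VL variants listed further down can be passed to the same call. As a minimal sketch (the model ID comes from this commit's model table; loading an AWQ checkpoint additionally assumes the autoawq package is installed):

import xinfer

# Same API as the README row above, pointed at the AWQ-quantized 2B build.
model = xinfer.create_model("Qwen/Qwen2-VL-2B-Instruct-AWQ", device="cuda", dtype="auto")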
164 changes: 164 additions & 0 deletions nbs/qwen2vl.ipynb
@@ -0,0 +1,164 @@
In [1]:

import xinfer

xinfer.list_models("qwen2")

                               Available Models
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
┃ Implementation ┃ Model ID                             ┃ Input --> Output    ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8 │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4 │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct-AWQ       │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-72B-Instruct           │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct-AWQ        │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-7B-Instruct            │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4  │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct-AWQ        │ image-text --> text │
│ transformers   │ Qwen/Qwen2-VL-2B-Instruct            │ image-text --> text │
└────────────────┴──────────────────────────────────────┴─────────────────────┘
In [2]:

model = xinfer.create_model("Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4", device="cuda", dtype="auto")

stderr:

2024-11-20 23:07:01.171 | INFO     | xinfer.models:__init__:63 - Model: Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
2024-11-20 23:07:01.171 | INFO     | xinfer.models:__init__:64 - Device: cuda
2024-11-20 23:07:01.172 | INFO     | xinfer.models:__init__:65 - Dtype: auto
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/auto_gptq/nn_modules/triton_utils/kernels.py:411: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/auto_gptq/nn_modules/triton_utils/kernels.py:419: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/auto_gptq/nn_modules/triton_utils/kernels.py:461: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
CUDA extension not installed.
CUDA extension not installed.
/home/dnth/mambaforge-pypy3/envs/xinfer/lib/python3.10/site-packages/transformers/modeling_utils.py:4779: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
In [3]:

image1 = "https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/000b9c365c9e307a.jpg"
image2 = "https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/00aa2580828a9009.jpg"

prompt = "Describe the image."

model.infer(image1, prompt).text

Out[3]:

'The image features a plush toy resembling a rabbit. The toy is dressed in a light-colored outfit with green and yellow floral patterns. It has a blue hat with a bow on top and is sitting on a fluffy, pink surface. The background is a solid pink color, providing a simple and clean backdrop that highlights the toy.'
In [4]:

model.infer_batch([image1, image2], prompt)

Out[4]:

[Result(categories=None, boxes=None, masks=None, poses=None, text='The image features a plush toy resembling a rabbit. The toy is dressed in a light-colored outfit with green and yellow floral patterns. It has a blue hat with a bow on top and is sitting on a fluffy, pink surface. The background is a solid pink color, providing a simple and clean backdrop that highlights the toy.'),
 Result(categories=None, boxes=None, masks=None, poses=None, text='The image depicts a lively street scene with a parade taking place. In the foreground, a man is clapping, while another person is gesturing with their hand. The street is lined with spectators, and there are several flags and vehicles participating in the parade. The background features a clock tower and a church with a steeple, indicating a town or city setting. The weather appears overcast, and the overall atmosphere is festive and community-oriented.')]
The notebook ends with an empty code cell; it was run on the xinfer kernel with Python 3.10.15 (nbformat 4).
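
For readers who want to see what the one-line infer call abstracts away, below is a minimal sketch of the Qwen2-VL generation flow in raw transformers. It is not code from this commit: the model class and processor calls follow the public Qwen2-VL usage recipe for transformers, and the actual wrapper lives in xinfer/transformers/qwen2_vl.py (not shown in this diff).

import requests
from PIL import Image
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

# Load the checkpoint and its processor; dtype="auto" mirrors the
# notebook's create_model arguments.
model_id = "Qwen/Qwen2-VL-2B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto").to("cuda")
processor = AutoProcessor.from_pretrained(model_id)

# Build a chat-template prompt with one image placeholder.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]
chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Fetch the same demo image used in the notebook and run generation.
url = "https://raw.githubusercontent.com/dnth/x.infer/main/assets/demo/000b9c365c9e307a.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=[chat_text], images=[image], return_tensors="pt").to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=128)

# Drop the prompt tokens so only the newly generated answer is decoded.
answer = processor.batch_decode(
    output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
)[0]
print(answer)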
1 change: 1 addition & 0 deletions xinfer/transformers/__init__.py
@@ -5,6 +5,7 @@
from .joycaption import JoyCaption
from .llama32 import Llama32Vision, Llama32VisionInstruct
from .moondream import Moondream
from .qwen2_vl import Qwen2VL
from .vision2seq import Vision2SeqModel
from .vlrm_blip2 import VLRMBlip2
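
The imported Qwen2VL class lives in xinfer/transformers/qwen2_vl.py, which this diff view does not expand. As an illustration only, here is a hypothetical skeleton of its public surface, inferred from the notebook usage above; the method bodies and internals are assumptions, not the committed implementation:

from typing import List


class Qwen2VL:
    """Hypothetical sketch of the wrapper's shape; not the actual source."""

    def __init__(self, model_id: str, device: str = "cpu", dtype: str = "auto"):
        # Would load the checkpoint and processor here, e.g. via transformers'
        # Qwen2VLForConditionalGeneration.from_pretrained(model_id).
        ...

    def infer(self, image: str, prompt: str):
        # Single-image convenience call; assumed to delegate to infer_batch.
        return self.infer_batch([image], prompt)[0]

    def infer_batch(self, images: List[str], prompt: str):
        # Would build chat-template inputs per image, run model.generate,
        # and wrap the decoded strings in Result objects.
        ...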
