diff --git a/.gitignore b/.gitignore
index fbc703a..4db8e17 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,8 +4,13 @@ test
build/
package-lock.json
*.egg-info
-*.onnx
__pycache__
.build
.swiftpm
-node_modules
\ No newline at end of file
+.hf_token
+node_modules
+
+# Tensors & ML Models
+*.onnx
+*.pt
+*.safetensors
diff --git a/README.md b/README.md
index 031c484..32957e7 100755
--- a/README.md
+++ b/README.md
@@ -20,9 +20,11 @@ For Content Understanding and Generation
Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+
+Python • JavaScript • Swift
---
@@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or
The exact behavior is controlled by prompts.
```python
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 0000000..5626d39
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,10 @@
+# UForm for JavaScript
+
+Install the `uform` package with your preferred package manager:
+
+```bash
+pnpm add uform
+npm add uform
+yarn add uform
+```
+
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..7331231
--- /dev/null
+++ b/package.json
@@ -0,0 +1,11 @@
+{
+ "name": "uform",
+ "private": true,
+ "version": "2.0.2",
+ "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+ "dependencies": {
+ "@huggingface/hub": "^0.14.8",
+ "@xenova/transformers": "^2.17.0",
+ "onnxruntime-web": "^1.17.3"
+ }
+}
diff --git a/python/scripts/bench.py b/python/scripts/bench.py
index 49c7004..8bcaf37 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench.py
@@ -13,7 +13,7 @@
)
from uform import get_model
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
dtype = torch.bfloat16
low_cpu_mem_usage = False
diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb
deleted file mode 100644
index ce8cf10..0000000
--- a/python/scripts/export.ipynb
+++ /dev/null
@@ -1,666 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --upgrade \"uform[torch]\" coremltools"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n",
- " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n",
- " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n",
- " warn(f\"Failed to load image Python extension: {e}\")\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "fadffc0299c04e249fd4f7a5b40ba0af",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Fetching 5 files: 0%| | 0/5 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "(torch.Size([1, 197, 384]),\n",
- " torch.Size([1, 64, 768]),\n",
- " torch.Size([1, 256]),\n",
- " torch.Size([1, 256]))"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import uform\n",
- "from PIL import Image\n",
- "\n",
- "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
- "text = 'a small red panda in a zoo'\n",
- "image = Image.open('../../assets/unum.png')\n",
- "\n",
- "image_data = processor.preprocess_image(image)\n",
- "text_data = processor.preprocess_text(text)\n",
- "\n",
- "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
- "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
- "\n",
- "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(model_type='bert', dim=768, context_dim=384, vocab_size=30522, padding_idx=0, num_layers=4, num_heads=12, embedding_dim=256, multimodal_layers_ids=[2, 3], head_one_neuron=False, pooling='cls', max_position_embeddings=64, dropout_prob=0.1)"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.text_encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(dim=384, patch_size=16, image_size=224, num_layers=12, num_heads=6, embedding_dim=256, pooling='cls', num_reg_tokens=0)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.image_encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "First layer of image_encoder: patch_embed\n",
- "First layer of text_encoder: word_embeddings\n"
- ]
- }
- ],
- "source": [
- "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
- "for name, module in model.image_encoder.named_children():\n",
- " print(f\"First layer of image_encoder: {name}\")\n",
- " break # We break after the first layer\n",
- "\n",
- "for name, module in model.text_encoder.named_children():\n",
- " print(f\"First layer of text_encoder: {name}\")\n",
- " break # We break after the first layer"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ONNX"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n",
- "Torch version 2.1.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.\n"
- ]
- }
- ],
- "source": [
- "import coremltools as ct\n",
- "import torch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
- "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
- "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
- "text_features = ct.TensorType(name=\"features\")\n",
- "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
- "image_features = ct.TensorType(name=\"features\")\n",
- "image_embeddings = ct.TensorType(name=\"embeddings\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(\n",
- " original_name=VisualEncoder\n",
- " (patch_embed): Conv2d(original_name=Conv2d)\n",
- " (blocks): Sequential(\n",
- " original_name=Sequential\n",
- " (0): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (1): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (2): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (3): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (4): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (5): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (6): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (7): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (8): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (9): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (10): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (11): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " )\n",
- " (norm): LayerNorm(original_name=LayerNorm)\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "module = model.image_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n"
- ]
- }
- ],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(\n",
- " original_name=TextEncoder\n",
- " (word_embeddings): Embedding(original_name=Embedding)\n",
- " (position_embeddings): Embedding(original_name=Embedding)\n",
- " (layer_norm): LayerNorm(original_name=LayerNorm)\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " (blocks): ModuleList(\n",
- " original_name=ModuleList\n",
- " (0): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (1): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (2): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (3): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " )\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- " (matching_head): Linear(original_name=Linear)\n",
- " (context_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "module = model.text_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n"
- ]
- }
- ],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "base",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
new file mode 100644
index 0000000..369c938
--- /dev/null
+++ b/python/scripts/export_encoders.ipynb
@@ -0,0 +1,436 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install --upgrade \"uform[torch]\" coremltools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import uform\n",
+ "from PIL import Image\n",
+ "\n",
+ "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
+ "text = 'a small red panda in a zoo'\n",
+ "image = Image.open('../../assets/unum.png')\n",
+ "\n",
+ "image_data = processor.preprocess_image(image)\n",
+ "text_data = processor.preprocess_text(text)\n",
+ "\n",
+ "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
+ "for name, module in model.image_encoder.named_children():\n",
+ " print(f\"First layer of image_encoder: {name}\")\n",
+ " break # We break after the first layer\n",
+ "\n",
+ "for name, module in model.text_encoder.named_children():\n",
+ " print(f\"First layer of text_encoder: {name}\")\n",
+ " break # We break after the first layer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CoreML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import coremltools as ct\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+ "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
+ "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+ "text_features = ct.TensorType(name=\"features\")\n",
+ "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
+ "image_features = ct.TensorType(name=\"features\")\n",
+ "image_embeddings = ct.TensorType(name=\"embeddings\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PyTorch\n",
+ "\n",
+ "Let's ensure that the input layers and the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.image_encoder.eval()\n",
+ "model.image_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.image_encoder.state_dict(), 'image.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.image_encoder.state_dict(), \"image.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.text_encoder.eval()\n",
+ "model.text_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(model.text_encoder.state_dict(), 'text.pt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(model.text_encoder.state_dict(), \"text.safetensors\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## ONNX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install onnx onnxconverter-common"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.onnx import export as onnx_export"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "onnx_export(\n",
+ " module,\n",
+ " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+ " \"text.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input_ids', 'attention_mask'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input_ids' : {0 : 'batch_size'}, \n",
+ " 'attention_mask' : {0 : 'batch_size'}, \n",
+ " 'features' : {0 : 'batch_size'}, \n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"text.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"text.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now repeat the same for images."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = model.image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "torch.onnx.export(\n",
+ " module,\n",
+ " image_data, \n",
+ " \"image.onnx\", \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input' : {0 : 'batch_size'},\n",
+ " 'features' : {0 : 'batch_size'},\n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16\n",
+ "\n",
+ "module = onnx.load(\"image.onnx\")\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, \"image.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n",
+ "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py
similarity index 100%
rename from python/scripts/test_generative.py
rename to python/scripts/test_decoders.py
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py
similarity index 99%
rename from python/scripts/test_embeddings.py
rename to python/scripts/test_encoders.py
index 9cdd4c5..e7541c1 100644
--- a/python/scripts/test_embeddings.py
+++ b/python/scripts/test_encoders.py
@@ -22,7 +22,7 @@
onnx_available = False
torch_models = [
- "unum-cloud/uform-vl2-english-small",
+ "unum-cloud/uform2-vl-english-small",
"unum-cloud/uform-vl-english",
"unum-cloud/uform-vl-multilingual-v2",
]
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index cdb1250..f5a15c2 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -7,8 +7,8 @@
class Modality(Enum):
- TEXT = "text"
- IMAGE = "image"
+ TEXT_ENCODER = "text_encoder"
+ IMAGE_ENCODER = "image_encoder"
def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]:
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..c9f8dc3 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -5,7 +5,7 @@
from PIL import Image
from transformers import TextStreamer
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
EOS_TOKEN = 32001
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
new file mode 100644
index 0000000..6792120
--- /dev/null
+++ b/python/uform/gen_model.py
@@ -0,0 +1 @@
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift
index 5efb87f..889cdb6 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EmbeddingsTests.swift
@@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase {
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
@@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase {
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
let api = HubApi(hfToken: "xxx")
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform2-vl-english-small",
hubApi: api
)
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..1eebf29
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,44 @@
+# UForm for Swift
+
+UForm offers first-party support for Swift.
+To get started, add UForm to your project using Swift Package Manager.
+
+```bash
+swift package init --type executable
+```
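+
+The `init` command only scaffolds an executable package; the UForm dependency itself is declared in `Package.swift`. Below is a minimal sketch of that manifest, assuming the package is fetched straight from the `unum-cloud/uform` repository and that the library product is named `UForm`, as the import below suggests (pin the version to the latest tagged release):
+
+```swift
+// swift-tools-version:5.9
+import PackageDescription
+
+let package = Package(
+    name: "MyApp",
+    dependencies: [
+        // Assumed source location; "2.0.2" mirrors the version in this release's package.json.
+        .package(url: "https://github.com/unum-cloud/uform", from: "2.0.2")
+    ],
+    targets: [
+        .executableTarget(
+            name: "MyApp",
+            dependencies: ["UForm"]  // resolved by product name
+        )
+    ]
+)
+```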
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.forward(with: text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
+
+### Image Embeddings
+
+```swift
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+// CGImageSource APIs come from the ImageIO framework.
+guard let url = URL(string: imageURL),
+    let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
+else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.forward(with: cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
+
+### Computing Distances
+
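+The text and image encoders project into a shared embedding space, so cross-modal relevance can be scored with cosine similarity over the raw vectors. A minimal sketch, reusing the `textVector` and `imageVector` arrays from above; the helper function is illustrative and not part of the UForm API:
+
+```swift
+// Illustrative helper, not part of the UForm API.
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+    precondition(a.count == b.count, "Embeddings must have the same dimensionality")
+    var dot: Float32 = 0, normA: Float32 = 0, normB: Float32 = 0
+    for (x, y) in zip(a, b) {
+        dot += x * y
+        normA += x * x
+        normB += y * y
+    }
+    return dot / (normA.squareRoot() * normB.squareRoot())
+}
+
+let similarity = cosineSimilarity(textVector, imageVector)
+print("Text-to-image similarity: \(similarity)")
+```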