diff --git a/.gitignore b/.gitignore
index fbc703a..4db8e17 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,8 +4,13 @@ test
 build/
 package-lock.json
 *.egg-info
-*.onnx
 __pycache__
 .build
 .swiftpm
-node_modules
\ No newline at end of file
+.hf_token
+node_modules
+
+# Tensors & ML Model
+*.onnx
+*.pt
+*.safetensors
diff --git a/README.md b/README.md
index 031c484..32957e7 100755
--- a/README.md
+++ b/README.md
@@ -20,9 +20,11 @@ For Content Understanding and Generation

Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+
+Python • JavaScript • Swift

--- @@ -279,7 +281,7 @@ The generative model can be used to caption images, summarize their content, or The exact behavior is controlled by prompts. ```python -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen') processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen') diff --git a/javascript/README.md b/javascript/README.md new file mode 100644 index 0000000..5626d39 --- /dev/null +++ b/javascript/README.md @@ -0,0 +1,10 @@ +# UForm for JavaScript + + + +```bash +pnpm add uform +npm add uform +yarn add uform +``` + diff --git a/package.json b/package.json new file mode 100644 index 0000000..7331231 --- /dev/null +++ b/package.json @@ -0,0 +1,11 @@ +{ + "name": "uform", + "private": true, + "version": "2.0.2", + "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", + "dependencies": { + "@huggingface/hub": "^0.14.8", + "@xenova/transformers": "^2.17.0", + "onnxruntime-web": "^1.17.3" + } +} diff --git a/python/scripts/bench.py b/python/scripts/bench.py index 49c7004..8bcaf37 100644 --- a/python/scripts/bench.py +++ b/python/scripts/bench.py @@ -13,7 +13,7 @@ ) from uform import get_model -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor dtype = torch.bfloat16 low_cpu_mem_usage = False diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb deleted file mode 100644 index ce8cf10..0000000 --- a/python/scripts/export.ipynb +++ /dev/null @@ -1,666 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scripts for Exporting PyTorch Models to ONNX and CoreML" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --upgrade \"uform[torch]\" coremltools" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", - " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", - " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", - " warn(f\"Failed to load image Python extension: {e}\")\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fadffc0299c04e249fd4f7a5b40ba0af", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 5 files: 0%| | 0/5 [00:00 MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n" - ] - } - ], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", - 
"\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "TextEncoder(\n", - " original_name=TextEncoder\n", - " (word_embeddings): Embedding(original_name=Embedding)\n", - " (position_embeddings): Embedding(original_name=Embedding)\n", - " (layer_norm): LayerNorm(original_name=LayerNorm)\n", - " (dropout): Dropout(original_name=Dropout)\n", - " (blocks): ModuleList(\n", - " original_name=ModuleList\n", - " (0): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (1): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (2): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (3): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): 
Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " )\n", - " (embedding_projection): Linear(original_name=Linear)\n", - " (matching_head): Linear(original_name=Linear)\n", - " (context_projection): Linear(original_name=Linear)\n", - ")" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "module = model.text_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", - "traced_script_module" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Tuple detected at graph output. This will be flattened in the converted model.\n", - "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00 MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n" - ] - } - ], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", - "\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb new file mode 100644 index 0000000..369c938 --- /dev/null +++ b/python/scripts/export_encoders.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scripts for Exporting PyTorch Models to ONNX and CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade \"uform[torch]\" coremltools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uform\n", + "from PIL import Image\n", + "\n", + "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n", + "text = 'a small red panda in a zoo'\n", + "image = 
Image.open('../../assets/unum.png')\n", + "\n", + "image_data = processor.preprocess_image(image)\n", + "text_data = processor.preprocess_text(text)\n", + "\n", + "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n", + "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.image_encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n", + "for name, module in model.image_encoder.named_children():\n", + " print(f\"First layer of image_encoder: {name}\")\n", + " break # We break after the first layer\n", + "\n", + "for name, module in model.text_encoder.named_children():\n", + " print(f\"First layer of text_encoder: {name}\")\n", + " break # We break after the first layer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import coremltools as ct\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n", + "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n", + "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n", + "text_features = ct.TensorType(name=\"features\")\n", + "text_embeddings = ct.TensorType(name=\"embeddings\")\n", + "image_features = ct.TensorType(name=\"features\")\n", + "image_embeddings = ct.TensorType(name=\"embeddings\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[image_input], outputs=[image_features, image_embeddings],\n", + " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = 
ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", + " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch\n", + "\n", + "Let's ensure that the input layers and the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.image_encoder.eval()\n", + "model.image_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.image_encoder.state_dict(), 'image.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.image_encoder.state_dict(), \"image.safetensors\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.text_encoder.eval()\n", + "model.text_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(model.text_encoder.state_dict(), 'text.pt')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(model.text_encoder.state_dict(), \"text.safetensors\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_features, image_embedding = model.encode_image(image_data.to(dtype=torch.bfloat16), return_features=True)\n", + "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.safetensors image.safetensors\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.safetensors text.safetensors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.pt image.pt\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.pt text.pt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install onnx onnxconverter-common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.onnx import export as onnx_export" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can't immediately export 
to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "onnx_export(\n", + " module,\n", + " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", + " \"text.onnx\", \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input_ids', 'attention_mask'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input_ids' : {0 : 'batch_size'}, \n", + " 'attention_mask' : {0 : 'batch_size'}, \n", + " 'features' : {0 : 'batch_size'}, \n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16\n", + "\n", + "module = onnx.load(\"text.onnx\")\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, \"text.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now repeat the same for images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = model.image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "torch.onnx.export(\n", + " module,\n", + " image_data, \n", + " \"image.onnx\", \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input' : {0 : 'batch_size'},\n", + " 'features' : {0 : 'batch_size'},\n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16\n", + "\n", + "module = onnx.load(\"image.onnx\")\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, \"image.onnx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform2-vl-english-small image.onnx image.onnx\n", + "!huggingface-cli upload unum-cloud/uform2-vl-english-small text.onnx text.onnx" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py similarity index 100% rename from python/scripts/test_generative.py rename to python/scripts/test_decoders.py diff --git 
a/python/scripts/test_embeddings.py b/python/scripts/test_encoders.py
similarity index 99%
rename from python/scripts/test_embeddings.py
rename to python/scripts/test_encoders.py
index 9cdd4c5..e7541c1 100644
--- a/python/scripts/test_embeddings.py
+++ b/python/scripts/test_encoders.py
@@ -22,7 +22,7 @@
     onnx_available = False
 
 torch_models = [
-    "unum-cloud/uform-vl2-english-small",
+    "unum-cloud/uform2-vl-english-small",
     "unum-cloud/uform-vl-english",
     "unum-cloud/uform-vl-multilingual-v2",
 ]
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index cdb1250..f5a15c2 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -7,8 +7,8 @@
 
 
 class Modality(Enum):
-    TEXT = "text"
-    IMAGE = "image"
+    TEXT_ENCODER = "text_encoder"
+    IMAGE_ENCODER = "image_encoder"
 
 
 def get_checkpoint(model_name: str, token: Optional[str], modalities: Tuple[str, Modality]) -> Tuple[str, Mapping, str]:
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..c9f8dc3 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -5,7 +5,7 @@
 from PIL import Image
 from transformers import TextStreamer
 
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
 
 EOS_TOKEN = 32001
 
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
new file mode 100644
index 0000000..6792120
--- /dev/null
+++ b/python/uform/gen_model.py
@@ -0,0 +1 @@
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift
index 5efb87f..889cdb6 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EmbeddingsTests.swift
@@ -27,7 +27,7 @@ final class TokenizerTests: XCTestCase {
         let api = HubApi(hfToken: "xxx")
 
         let textModel = try await TextEncoder(
-            modelName: "unum-cloud/uform-vl2-english-small",
+            modelName: "unum-cloud/uform2-vl-english-small",
             hubApi: api
         )
 
@@ -78,11 +78,11 @@ final class TokenizerTests: XCTestCase {
         // A better option is to fetch directly from HuggingFace, similar to how users would do that:
         let api = HubApi(hfToken: "xxx")
         let textModel = try await TextEncoder(
-            modelName: "unum-cloud/uform-vl2-english-small",
+            modelName: "unum-cloud/uform2-vl-english-small",
             hubApi: api
         )
         let imageModel = try await ImageEncoder(
-            modelName: "unum-cloud/uform-vl2-english-small",
+            modelName: "unum-cloud/uform2-vl-english-small",
             hubApi: api
         )
 
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..1eebf29
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,44 @@
+# UForm for Swift
+
+UForm offers first-party support for Swift.
+To get started, add UForm to your project using Swift Package Manager.
+
+```bash
+swift package init --type executable
+# then add https://github.com/unum-cloud/uform as a package dependency in Package.swift
+```
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.forward(with: text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
+
+### Image Embeddings
+
+```swift
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform2-vl-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+guard let url = URL(string: imageURL),
+    let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.forward(with: cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
+
+
+### Computing Distances
\ No newline at end of file
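
The new `### Computing Distances` section closes the Swift README without a body. Below is a minimal sketch of what it could contain, reusing the `textVector` and `imageVector` arrays produced by the snippets above; the `cosineSimilarity` helper is defined inline here rather than taken from the UForm Swift API.

```swift
// Cosine similarity between two equal-length embedding vectors.
// Values close to 1 indicate that the caption and the image match.
func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
    precondition(a.count == b.count, "Embeddings must have the same dimensionality")
    var dot: Float32 = 0
    var normA: Float32 = 0
    var normB: Float32 = 0
    for i in 0..<a.count {
        dot += a[i] * b[i]
        normA += a[i] * a[i]
        normB += b[i] * b[i]
    }
    return dot / (normA.squareRoot() * normB.squareRoot())
}

let similarity = cosineSimilarity(textVector, imageVector)
print("Text-to-image cosine similarity: \(similarity)")
```

If the embeddings are already L2-normalized, a plain dot product gives the same ranking at slightly lower cost.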