From 4f1568fb799b3d150e459a633ad8705efd0ca089 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 20 Apr 2024 00:47:58 +0000
Subject: [PATCH] Fix: Rename image inputs

---
 python/scripts/export_encoders.ipynb | 20 +++++++-------------
 swift/Encoders.swift                 |  4 ++--
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
index a8d2ac3..029e60a 100644
--- a/python/scripts/export_encoders.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -8,7 +8,7 @@
     "\n",
     "Depending on the backend, we prefer different qunatization schemes.\n",
     "\n",
-    "- For ONNX we use `int8` quantization.\n",
+    "- For ONNX we use `uint8` quantization.\n",
     "- For PyTorch we use `bfloat16` quantization.\n",
     "- For CoreML we use `float32` representation."
    ]
   },
@@ -19,6 +19,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "!pip uninstall -y uform\n",
     "!pip install --upgrade \"uform[torch]\" coremltools"
    ]
   },
@@ -42,7 +43,7 @@
     "import uform\n",
     "from PIL import Image\n",
     "\n",
-    "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
+    "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
     "text = 'a small red panda in a zoo'\n",
     "image = Image.open('../../assets/unum.png')\n",
     "\n",
@@ -122,7 +123,7 @@
     "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
     "\n",
     "```python\n",
-    "    image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+    "    image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n",
     "    text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
     "    text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
     "```\n",
@@ -155,7 +156,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
+    "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
     "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
     "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
     "text_features = ct.TensorType(name=\"features\")\n",
@@ -403,10 +404,10 @@
     "    export_params=True,\n",
     "    opset_version=15,\n",
     "    do_constant_folding=True,\n",
-    "    input_names = ['input'], \n",
+    "    input_names = ['images'], \n",
     "    output_names = ['features', 'embeddings'],\n",
     "    dynamic_axes={\n",
-    "        'input' : {0 : 'batch_size'},\n",
+    "        'images' : {0 : 'batch_size'},\n",
     "        'features' : {0 : 'batch_size'},\n",
     "        'embeddings' : {0 : 'batch_size'}})"
    ]
@@ -632,13 +633,6 @@
     "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
     "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
 "metadata": {
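For context beyond the patch itself: after this rename, anything that feeds the exported ONNX image encoder has to address the tensor by its new name. Below is a minimal sketch, not part of the patch, assuming the encoder was saved as `image_encoder.onnx` with a float32 image input of shape `(1, 3, 224, 224)` and the two outputs declared above; the file name and shape are assumptions, and only the `"images"` key comes from the diff.

```python
# Minimal sketch: run the exported ONNX image encoder with onnxruntime.
# Assumptions: file name "image_encoder.onnx", float32 input of shape (1, 3, 224, 224).
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("image_encoder.onnx")
image_batch = np.zeros((1, 3, 224, 224), dtype=np.float32)  # stand-in for a preprocessed image

# Feeds are keyed by input name, so the renamed "images" key must be used here;
# the old "input" key would be rejected by the runtime.
features, embeddings = session.run(None, {"images": image_batch})
print(features.shape, embeddings.shape)
```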
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index 44c6e71..3582e91 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -402,14 +402,14 @@ class ImageInput: MLFeatureProvider {
     }
 
     var featureNames: Set<String> {
-        return Set(["input"])
+        return Set(["images"])
     }
 
     // The model expects the input IDs to be an array of integers
     // of length `sequenceLength`, padded with `paddingID` if necessary
     func featureValue(for featureName: String) -> MLFeatureValue? {
         switch featureName {
-        case "input":
+        case "images":
             return precomputedFeature
         default:
             return nil
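On the Apple side, the Swift `ImageInput` provider above now advertises `"images"`, which has to match the input name baked into the exported Core ML package. A quick way to confirm that is to inspect the model spec; this is a sketch only (macOS, and the `image_encoder.mlpackage` file name is an assumption, not taken from the patch).

```python
# Sketch: verify that the exported Core ML image encoder expects the renamed input.
# Assumption: the package was saved as "image_encoder.mlpackage" by the notebook above.
import coremltools as ct

model = ct.models.MLModel("image_encoder.mlpackage")
for feature in model.get_spec().description.input:
    print(feature.name)  # should print "images", matching ImageInput.featureNames
```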