From 4f1568fb799b3d150e459a633ad8705efd0ca089 Mon Sep 17 00:00:00 2001
From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com>
Date: Sat, 20 Apr 2024 00:47:58 +0000
Subject: [PATCH] Fix: Rename image inputs

---
 python/scripts/export_encoders.ipynb | 20 +++++++-------------
 swift/Encoders.swift                 |  4 ++--
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
index a8d2ac3..029e60a 100644
--- a/python/scripts/export_encoders.ipynb
+++ b/python/scripts/export_encoders.ipynb
@@ -8,7 +8,7 @@
     "\n",
     "Depending on the backend, we prefer different qunatization schemes.\n",
     "\n",
-    "- For ONNX we use `int8` quantization.\n",
+    "- For ONNX we use `uint8` quantization.\n",
     "- For PyTorch we use `bfloat16` quantization.\n",
     "- For CoreML we use `float32` representation."
    ]
   },
@@ -19,6 +19,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "!pip uninstall -y uform\n",
     "!pip install --upgrade \"uform[torch]\" coremltools"
    ]
   },
@@ -42,7 +43,7 @@
     "import uform\n",
     "from PIL import Image\n",
     "\n",
-    "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
+    "model, processor = uform.get_model('unum-cloud/' + model_name)\n",
     "text = 'a small red panda in a zoo'\n",
     "image = Image.open('../../assets/unum.png')\n",
     "\n",
@@ -122,7 +123,7 @@
     "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
     "\n",
     "```python\n",
-    "    image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
+    "    image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n",
     "    text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
     "    text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
     "```\n",
@@ -155,7 +156,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "image_input = ct.TensorType(name=\"input\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
+    "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data.shape, 1))\n",
     "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
     "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
     "text_features = ct.TensorType(name=\"features\")\n",
@@ -403,10 +404,10 @@
     "    export_params=True,\n",
     "    opset_version=15,\n",
     "    do_constant_folding=True,\n",
-    "    input_names = ['input'], \n",
+    "    input_names = ['images'], \n",
     "    output_names = ['features', 'embeddings'],\n",
     "    dynamic_axes={\n",
-    "        'input' : {0 : 'batch_size'},\n",
+    "        'images' : {0 : 'batch_size'},\n",
     "        'features' : {0 : 'batch_size'},\n",
     "        'embeddings' : {0 : 'batch_size'}})"
    ]
@@ -632,13 +633,6 @@
     "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
     "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
 "metadata": {
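For context beyond the patch itself: after this rename, anything that feeds the exported ONNX image encoder has to address the tensor by its new name. Below is a minimal sketch, not part of the patch, assuming the encoder was saved as `image_encoder.onnx` with a float32 image input of shape `(1, 3, 224, 224)` and the two outputs declared above; the file name and shape are assumptions, and only the `"images"` key comes from the diff.

```python
# Minimal sketch: run the exported ONNX image encoder with onnxruntime.
# Assumptions: file name "image_encoder.onnx", float32 input of shape (1, 3, 224, 224).
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("image_encoder.onnx")
image_batch = np.zeros((1, 3, 224, 224), dtype=np.float32)  # stand-in for a preprocessed image

# Feeds are keyed by input name, so the renamed "images" key must be used here;
# the old "input" key would be rejected by the runtime.
features, embeddings = session.run(None, {"images": image_batch})
print(features.shape, embeddings.shape)
```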
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
index 44c6e71..3582e91 100644
--- a/swift/Encoders.swift
+++ b/swift/Encoders.swift
@@ -402,14 +402,14 @@ class ImageInput: MLFeatureProvider {
     }
 
     var featureNames: Set<String> {
-        return Set(["input"])
+        return Set(["images"])
     }
 
     // The model expects the input IDs to be an array of integers
     // of length `sequenceLength`, padded with `paddingID` if necessary
     func featureValue(for featureName: String) -> MLFeatureValue? {
         switch featureName {
-        case "input":
+        case "images":
             return precomputedFeature
         default:
             return nil
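On the Apple side, the Swift `ImageInput` provider above now advertises `"images"`, which has to match the input name baked into the exported Core ML package. A quick way to confirm that is to inspect the model spec; this is a sketch only (macOS, and the `image_encoder.mlpackage` file name is an assumption, not taken from the patch).

```python
# Sketch: verify that the exported Core ML image encoder expects the renamed input.
# Assumption: the package was saved as "image_encoder.mlpackage" by the notebook above.
import coremltools as ct

model = ct.models.MLModel("image_encoder.mlpackage")
for feature in model.get_spec().description.input:
    print(feature.name)  # should print "images", matching ImageInput.featureNames
```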