From b5c728c82163ec5479f717c55e9be9d3913ef961 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 00:08:22 +0000 Subject: [PATCH 01/15] Add slice tensor operator --- src/ops/registry.js | 11 +++++++++++ src/utils/tensor.js | 23 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/ops/registry.js b/src/ops/registry.js index 9b65fa4a8..f641fe878 100644 --- a/src/ops/registry.js +++ b/src/ops/registry.js @@ -100,4 +100,15 @@ export class TensorOpRegistry { } return this._top_k; } + + static get slice() { + if (!this._slice) { + this._slice = wrap( + [8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13], + this.session_options, + 'y', + ) + } + return this._slice; + } } diff --git a/src/utils/tensor.js b/src/utils/tensor.js index 8b8133770..6c5c85bff 100644 --- a/src/utils/tensor.js +++ b/src/utils/tensor.js @@ -971,6 +971,29 @@ export async function topk(x, k) { }); } + +const arrayToIndexTensor = (array) => new Tensor('int64', array, [array.length]); +/** + * Slice a multidimensional float32 tensor. + * @param {Tensor} data: Tensor of data to extract slices from + * @param {number[]} starts: 1-D array of starting indices of corresponding axis in axes + * @param {number[]} ends: 1-D array of ending indices (exclusive) of corresponding axis in axes + * @param {number[]} axes: 1-D array of axes that starts and ends apply to + * @param {number[]} [steps]: 1-D array of slice step of corresponding axis in axes. + * @returns {Promise} Sliced data tensor. + */ +export async function slice(data, starts, ends, axes, steps) { + const op = await TensorOpRegistry.slice; + return await op({ + x: data, + s: arrayToIndexTensor(starts), + e: arrayToIndexTensor(ends), + a: arrayToIndexTensor(axes), + t: arrayToIndexTensor(steps ?? new Array(axes.length).fill(1)), + }); +} + + /** * Perform mean pooling of the last hidden state followed by a normalization step. 
* @param {Tensor} last_hidden_state Tensor of shape [batchSize, seqLength, embedDim] From f1f93ab5d5af90161ba5c3017ac53fe73d152fc6 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 00:08:44 +0000 Subject: [PATCH 02/15] Add slice unit test --- tests/utils/tensor.test.js | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/utils/tensor.test.js b/tests/utils/tensor.test.js index 622a14281..6fe0881c3 100644 --- a/tests/utils/tensor.test.js +++ b/tests/utils/tensor.test.js @@ -1,6 +1,9 @@ -import { Tensor, cat, mean, stack, layer_norm } from "../../src/transformers.js"; +import { Tensor, cat, mean, stack, layer_norm, slice } from "../../src/transformers.js"; +import { init } from "../init.js"; import { compare } from "../test_utils.js"; +init(); + describe("Tensor operations", () => { describe("cat", () => { it("should concatenate on dim=0", async () => { @@ -204,4 +207,27 @@ describe("Tensor operations", () => { compare(norm, target, 1e-3); }); }); + + describe("slice", () => { + it("should slice", async () => { + const input = new Tensor('float32', [ + 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + ], [3, 3]); + + const target = new Tensor('float32', [ + 1, 2, + 4, 5, + ], [2, 2]); + + const starts = [0, 0]; + const ends = [2, 2]; + const axes = [0, 1]; + const steps = [1, 1]; + + const sliced = await slice(input, starts, ends, axes, steps); + compare(sliced, target, 1e-3); + }); + }) }); From 98ed2e9ca332af947473cdae4997b218fb0faa48 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 00:17:15 +0000 Subject: [PATCH 03/15] Add support for Phi3V & Phi3.5V --- README.md | 3 +- docs/snippets/6_supported-models.snippet | 3 +- src/base/image_processors_utils.js | 4 +- src/configs.js | 5 +- src/models.js | 92 ++++++++++++++ src/models/image_processors.js | 1 + src/models/phi3_v/image_processing_phi3_v.js | 127 +++++++++++++++++++ src/models/phi3_v/processing_phi3_v.js | 52 ++++++++ src/models/processors.js | 1 + 9 files changed, 283 insertions(+), 5 deletions(-) create mode 100644 src/models/phi3_v/image_processing_phi3_v.js create mode 100644 src/models/phi3_v/processing_phi3_v.js diff --git a/README.md b/README.md index e56a7faaf..68d79e27b 100644 --- a/README.md +++ b/README.md @@ -381,7 +381,8 @@ You can refine your search by selecting the task you're interested in (e.g., [te 1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. 
**[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee. -1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v2) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. 
Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **Phi3V** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v4) by Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Qin Cai, Vishrav Chaudhary, Dong Chen, Dongdong Chen, Weizhu Chen, Yen-Chun Chen, Yi-Ling Chen, Hao Cheng, Parul Chopra, Xiyang Dai, Matthew Dixon, Ronen Eldan, Victor Fragoso, Jianfeng Gao, Mei Gao, Min Gao, Amit Garg, Allie Del Giorno, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Wenxiang Hu, Jamie Huynh, Dan Iter, Sam Ade Jacobs, Mojan Javaheripi, Xin Jin, Nikos Karampatziakis, Piero Kauffmann, Mahoud Khademi, Dongwoo Kim, Young Jin Kim, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Yunsheng Li, Chen Liang, Lars Liden, Xihui Lin, Zeqi Lin, Ce Liu, Liyuan Liu, Mengchen Liu, Weishung Liu, Xiaodong Liu, Chong Luo, Piyush Madan, Ali Mahmoudzadeh, David Majercak, Matt Mazzola, Caio César Teodoro Mendes, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Liliang Ren, Gustavo de Rosa, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Yelong Shen, Swadheen Shukla, Xia Song, Masahiro Tanaka, Andrea Tupini, Praneetha Vaddamanu, Chunyu Wang, Guanhua Wang, Lijuan Wang , Shuohang Wang, Xin Wang, Yu Wang, Rachel Ward, Wen Wen, Philipp Witte, Haiping Wu, Xiaoxia Wu, Michael Wyatt, Bin Xiao, Can Xu, Jiahang Xu, Weijian Xu, Jilong Xue, Sonali Yadav, Fan Yang, Jianwei Yang, Yifan Yang, Ziyi Yang, Donghan Yu, Lu Yuan, Chenruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/main/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **PyAnnote** released in the repository [pyannote/pyannote-audio](https://github.com/pyannote/pyannote-audio) by Hervé Bredin. 1. 
**[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet index ad4f6cdc4..aa971793e 100644 --- a/docs/snippets/6_supported-models.snippet +++ b/docs/snippets/6_supported-models.snippet @@ -96,7 +96,8 @@ 1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from IBM) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/abs/2306.09364) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from Princeton University, IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. 1. **[Phi](https://huggingface.co/docs/transformers/main/model_doc/phi)** (from Microsoft) released with the papers - [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li, [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee. -1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. 
Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **[Phi3](https://huggingface.co/docs/transformers/main/model_doc/phi3)** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v2) by Marah Abdin, Sam Ade Jacobs, Ammar Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Caio César Teodoro Mendes, Weizhu Chen, Vishrav Chaudhary, Parul Chopra, Allie Del Giorno, Gustavo de Rosa, Matthew Dixon, Ronen Eldan, Dan Iter, Amit Garg, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Jamie Huynh, Mojan Javaheripi, Xin Jin, Piero Kauffmann, Nikos Karampatziakis, Dongwoo Kim, Mahoud Khademi, Lev Kurilenko, James R. Lee, Yin Tat Lee, Yuanzhi Li, Chen Liang, Weishung Liu, Eric Lin, Zeqi Lin, Piyush Madan, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Xia Song, Masahiro Tanaka, Xin Wang, Rachel Ward, Guanhua Wang, Philipp Witte, Michael Wyatt, Can Xu, Jiahang Xu, Sonali Yadav, Fan Yang, Ziyi Yang, Donghan Yu, Chengruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. +1. **Phi3V** (from Microsoft) released with the paper [Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone](https://arxiv.org/abs/2404.14219v4) by Marah Abdin, Jyoti Aneja, Hany Awadalla, Ahmed Awadallah, Ammar Ahmad Awan, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Jianmin Bao, Harkirat Behl, Alon Benhaim, Misha Bilenko, Johan Bjorck, Sébastien Bubeck, Martin Cai, Qin Cai, Vishrav Chaudhary, Dong Chen, Dongdong Chen, Weizhu Chen, Yen-Chun Chen, Yi-Ling Chen, Hao Cheng, Parul Chopra, Xiyang Dai, Matthew Dixon, Ronen Eldan, Victor Fragoso, Jianfeng Gao, Mei Gao, Min Gao, Amit Garg, Allie Del Giorno, Abhishek Goswami, Suriya Gunasekar, Emman Haider, Junheng Hao, Russell J. Hewett, Wenxiang Hu, Jamie Huynh, Dan Iter, Sam Ade Jacobs, Mojan Javaheripi, Xin Jin, Nikos Karampatziakis, Piero Kauffmann, Mahoud Khademi, Dongwoo Kim, Young Jin Kim, Lev Kurilenko, James R. 
Lee, Yin Tat Lee, Yuanzhi Li, Yunsheng Li, Chen Liang, Lars Liden, Xihui Lin, Zeqi Lin, Ce Liu, Liyuan Liu, Mengchen Liu, Weishung Liu, Xiaodong Liu, Chong Luo, Piyush Madan, Ali Mahmoudzadeh, David Majercak, Matt Mazzola, Caio César Teodoro Mendes, Arindam Mitra, Hardik Modi, Anh Nguyen, Brandon Norick, Barun Patra, Daniel Perez-Becker, Thomas Portet, Reid Pryzant, Heyang Qin, Marko Radmilac, Liliang Ren, Gustavo de Rosa, Corby Rosset, Sambudha Roy, Olatunji Ruwase, Olli Saarikivi, Amin Saied, Adil Salim, Michael Santacroce, Shital Shah, Ning Shang, Hiteshi Sharma, Yelong Shen, Swadheen Shukla, Xia Song, Masahiro Tanaka, Andrea Tupini, Praneetha Vaddamanu, Chunyu Wang, Guanhua Wang, Lijuan Wang , Shuohang Wang, Xin Wang, Yu Wang, Rachel Ward, Wen Wen, Philipp Witte, Haiping Wu, Xiaoxia Wu, Michael Wyatt, Bin Xiao, Can Xu, Jiahang Xu, Weijian Xu, Jilong Xue, Sonali Yadav, Fan Yang, Jianwei Yang, Yifan Yang, Ziyi Yang, Donghan Yu, Lu Yuan, Chenruidong Zhang, Cyril Zhang, Jianwen Zhang, Li Lyna Zhang, Yi Zhang, Yue Zhang, Yunan Zhang, Xiren Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/main/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **PyAnnote** released in the repository [pyannote/pyannote-audio](https://github.com/pyannote/pyannote-audio) by Hervé Bredin. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. diff --git a/src/base/image_processors_utils.js b/src/base/image_processors_utils.js index 6788258f6..30111ad0c 100644 --- a/src/base/image_processors_utils.js +++ b/src/base/image_processors_utils.js @@ -699,7 +699,7 @@ export class ImageProcessor extends Callable { * Pad the image by a certain amount. * @param {Float32Array} pixelData The pixel data to pad. * @param {number[]} imgDims The dimensions of the image (height, width, channels). - * @param {{width:number; height:number}|number} padSize The dimensions of the padded image. + * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image. * @param {Object} options The options for padding. * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add. * @param {boolean} [options.center=false] Whether to center the image. 
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable { if (typeof padSize === 'number') { paddedImageWidth = padSize; paddedImageHeight = padSize; + } else if (padSize === 'square') { + paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth); } else { paddedImageWidth = padSize.width; paddedImageHeight = padSize.height; diff --git a/src/configs.js b/src/configs.js index a40bb59d9..8964c6506 100644 --- a/src/configs.js +++ b/src/configs.js @@ -95,8 +95,6 @@ function getNormalizedConfig(config) { case 'gpt_neox': case 'stablelm': case 'opt': - case 'phi': - case 'phi3': case 'falcon': mapping['num_heads'] = 'num_attention_heads'; mapping['num_layers'] = 'num_hidden_layers'; @@ -112,6 +110,9 @@ function getNormalizedConfig(config) { case 'starcoder2': case 'qwen2': case 'qwen2_vl': + case 'phi': + case 'phi3': + case 'phi3_v': mapping['num_heads'] = 'num_key_value_heads'; mapping['num_layers'] = 'num_hidden_layers'; mapping['hidden_size'] = 'hidden_size'; diff --git a/src/models.js b/src/models.js index f8242b5f0..085feb8b6 100644 --- a/src/models.js +++ b/src/models.js @@ -131,6 +131,7 @@ const MODEL_TYPES = { ImageTextToText: 6, Musicgen: 7, MultiModality: 8, + Phi3V: 9, } ////////////////////////////////////////////////// @@ -906,6 +907,10 @@ export class PreTrainedModel extends Callable { this._forward = imageTextToTextForward; this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation; break; + case MODEL_TYPES.Phi3V: + this.can_generate = true; + this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation; + break; case MODEL_TYPES.MultiModality: this.can_generate = true; @@ -1070,6 +1075,18 @@ export class PreTrainedModel extends Callable { }, options), ]); + } else if (modelType === MODEL_TYPES.Phi3V) { + info = await Promise.all([ + constructSessions(pretrained_model_name_or_path, { + prepare_inputs_embeds: 'prepare_inputs_embeds', + model: 'model', + vision_encoder: 'vision_encoder', + }, options), + getOptionalConfigs(pretrained_model_name_or_path, { + generation_config: 'generation_config.json', + }, options), + ]); + } else { // should be MODEL_TYPES.EncoderOnly if (modelType !== MODEL_TYPES.EncoderOnly) { const type = modelName ?? config?.model_type; @@ -3612,6 +3629,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel { } ////////////////////////////////////////////////// +export class Phi3VPreTrainedModel extends PreTrainedModel { + forward_params = [ + 'input_ids', + 'inputs_embeds', + 'attention_mask', + 'position_ids', + 'pixel_values', + 'image_sizes', + 'past_key_values', + ]; +} +export class Phi3VForCausalLM extends Phi3VPreTrainedModel { + + async forward({ + // Produced by the tokenizer/processor: + input_ids = null, + attention_mask = null, + pixel_values = null, + image_sizes = null, + + // Used during generation: + position_ids = null, + inputs_embeds = null, + past_key_values = null, + + // Generic generation parameters + generation_config = null, + logits_processor = null, + + // TODO: needed? 
+ ...kwargs + }) { + if (!inputs_embeds) { + let image_features; + if (pixel_values && input_ids.dims[1] !== 1) { + if (!image_sizes) { + throw new Error('`image_sizes` must be provided when `pixel_values` is provided.'); + } + + // Encode the image + ({ image_features } = await sessionRun(this.sessions['vision_encoder'], { + pixel_values, + image_sizes, + })); + } else { + const hidden_size = this.config.normalized_config.hidden_size; + image_features = new Tensor( + 'float32', + [], + [0, hidden_size], + ); + } + + ({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], { + input_ids, + image_features, + })); + } + + const outputs = await decoderForward(this, { + inputs_embeds, + past_key_values, + attention_mask, + position_ids, + generation_config, + logits_processor, + }, false); + return outputs; + } +} + ////////////////////////////////////////////////// export class CLIPPreTrainedModel extends PreTrainedModel { } @@ -6994,6 +7082,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([ ['falcon', ['FalconForCausalLM', FalconForCausalLM]], ['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]], ['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]], + + // Also image-text-to-text + ['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]], ]); const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([ @@ -7231,6 +7322,7 @@ const CUSTOM_MAPPING = [ // OVERRIDE: // TODO: Refactor to allow class to specify model ['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen], + ['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V], ['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly], ['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly], diff --git a/src/models/image_processors.js b/src/models/image_processors.js index 02815771c..fd002c81c 100644 --- a/src/models/image_processors.js +++ b/src/models/image_processors.js @@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js' export * from './nougat/image_processing_nougat.js' export * from './owlv2/image_processing_owlv2.js' export * from './owlvit/image_processing_owlvit.js' +export * from './phi3_v/image_processing_phi3_v.js' export * from './pvt/image_processing_pvt.js' export * from './qwen2_vl/image_processing_qwen2_vl.js' export * from './rt_detr/image_processing_rt_detr.js' diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/phi3_v/image_processing_phi3_v.js new file mode 100644 index 000000000..c84fcc710 --- /dev/null +++ b/src/models/phi3_v/image_processing_phi3_v.js @@ -0,0 +1,127 @@ +import { + ImageProcessor, +} from "../../base/image_processors_utils.js"; +import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js"; + +const IMAGE_SIZE = 336; +const SLICE_AXES = [2, 3]; // axes to slice on +const { floor, sqrt } = Math; + +export class Phi3VImageProcessor extends ImageProcessor { + constructor(config) { + super({ + ...config, + do_normalize: true, + do_pad: true, + pad_size: 'square', + do_convert_rgb: true, + }); + } + calc_num_image_tokens_from_image_size(width, height) { + // @ts-expect-error + const { num_img_tokens } = this.config; + return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens)); + } + + /** @type {ImageProcessor['pad_image']} */ + pad_image(pixelData, imgDims, padSize, options = {}) { + // Pad with white pixels + // NOTE: Since padding is done after 
normalization, we need to fill with the normalized values + const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]); + return super.pad_image(pixelData, imgDims, padSize, { + center: true, + constant_values, + ...options, + }); + } + + async _call(images, { + num_crops = null, + } = {}) { + num_crops ??= this.config.num_crops; + if (num_crops === 1 || sqrt(num_crops) % 1 !== 0) { + // Disallow num_crops==1 since it won't add extra information + throw new Error("num_crops must be a square number not equal to 1"); + } + + if (!Array.isArray(images)) { + images = [images]; + } + + const num_images = images.length; + const imageData = await Promise.all(images.map(x => this.preprocess(x))); + + const original_sizes = imageData.map(x => x.original_size); + const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size); + + // Process each image in batch + const all_pixel_values = []; + for (const { pixel_values } of imageData) { + pixel_values.unsqueeze_(0); // Easier processing as 4D tensor + + const [height, width] = pixel_values.dims.slice(-2); + + // Global image (Tensor of shape [num_channels, height, width]) + const batch_pixel_values = await interpolate_4d(pixel_values, { + size: [IMAGE_SIZE, IMAGE_SIZE], + mode: 'bicubic', + }); + + if (num_crops > 0) { + const patches = []; + const sqrt_patches = sqrt(num_crops); + const patch_width = floor(width / sqrt_patches); + const patch_height = floor(height / sqrt_patches); + for (let y = 0; y < sqrt_patches; ++y) { + for (let x = 0; x < sqrt_patches; ++x) { + let start_x, start_y, end_x, end_y; + if (y === sqrt_patches - 1) { // At bottom + start_y = height - patch_height; + end_y = height; + } else { + start_y = y * patch_height; + end_y = (y + 1) * patch_height; + } + if (x === sqrt_patches - 1) { // At right + start_x = width - patch_width; + end_x = width; + } else { + start_x = x * patch_width; + end_x = (x + 1) * patch_width; + } + + const starts = [start_y, start_x]; + const ends = [end_y, end_x]; + const patch = await slice(pixel_values, starts, ends, SLICE_AXES); + patches.push(patch); + } + } + + const resized_tensors = await interpolate_4d(cat(patches, 0), { + size: [IMAGE_SIZE, IMAGE_SIZE], + mode: 'bicubic', + }); // [num_crops, 3, 336, 336] + + // Concatenate the global image with the patches + all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0)); + } else { + // Only use the global image + all_pixel_values.push(batch_pixel_values); + } + } + + // [num_images, 1 + num_crops, num_channels=3, height, width] + const pixel_values = stack(all_pixel_values, 0); + + const image_sizes = new Tensor( + 'int64', + new Array(num_images).fill([IMAGE_SIZE, IMAGE_SIZE]).flat(), + [num_images, 2], + ) + + const image_tokens = this.calc_num_image_tokens_from_image_size(IMAGE_SIZE, IMAGE_SIZE); + const num_img_tokens = new Array(num_images).fill(image_tokens); + + return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens }; + } +} diff --git a/src/models/phi3_v/processing_phi3_v.js b/src/models/phi3_v/processing_phi3_v.js new file mode 100644 index 000000000..36a8bd06f --- /dev/null +++ b/src/models/phi3_v/processing_phi3_v.js @@ -0,0 +1,52 @@ +import { Processor } from "../../base/processing_utils.js"; +import { AutoImageProcessor } from "../auto/image_processing_auto.js"; +import { AutoTokenizer } from "../../tokenizers.js"; +import { RawImage } from "../../utils/image.js"; + +const IMAGE_TOKEN = "<|image|>"; +const IMAGE_TOKEN_PATTERN = 
/<\|image_\d+\|>/g; + +export class Phi3VProcessor extends Processor { + static image_processor_class = AutoImageProcessor + static tokenizer_class = AutoTokenizer + + /** + * + * @param {string|string[]} text + * @param {RawImage|RawImage[]} images + * @param {...any} args + * @returns {Promise} + */ + async _call(text, images = null, ...args) { + + if (!Array.isArray(text)) { + text = [text]; + } + + let text_inputs, image_inputs; + if (images) { + image_inputs = await this.image_processor(images); + const { num_img_tokens } = image_inputs; + + // The original implementation adds a bos_token before the image tokens + // TODO: Check if this affects performance, since it looks like a bug in the original implementation + const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i]))); + + text_inputs = this.tokenizer(prompt_chunks, { + padding: true, + truncation: true, + }); + + // The model expects image tokens to be negative, so we negate the image token ids + const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0]; + text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id); + } else { + text_inputs = this.tokenizer(text); + } + + return { + ...text_inputs, + ...image_inputs, + } + } +} diff --git a/src/models/processors.js b/src/models/processors.js index ee388851c..d254ad118 100644 --- a/src/models/processors.js +++ b/src/models/processors.js @@ -4,6 +4,7 @@ export * from './idefics3/processing_idefics3.js'; export * from './janus/processing_janus.js'; export * from './jina_clip/processing_jina_clip.js'; export * from './owlvit/processing_owlvit.js'; +export * from './phi3_v/processing_phi3_v.js'; export * from './paligemma/processing_paligemma.js'; export * from './pyannote/processing_pyannote.js'; export * from './qwen2_vl/processing_qwen2_vl.js'; From 72735c4d811d0fb648c68a41b63a3f20f121222e Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 13:48:43 +0000 Subject: [PATCH 04/15] Improve padding and use smart resizing --- src/models/phi3_v/image_processing_phi3_v.js | 43 ++++++++++++++++++-- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/phi3_v/image_processing_phi3_v.js index c84fcc710..26d354548 100644 --- a/src/models/phi3_v/image_processing_phi3_v.js +++ b/src/models/phi3_v/image_processing_phi3_v.js @@ -5,7 +5,7 @@ import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js const IMAGE_SIZE = 336; const SLICE_AXES = [2, 3]; // axes to slice on -const { floor, sqrt } = Math; +const { ceil, floor, sqrt } = Math; export class Phi3VImageProcessor extends ImageProcessor { constructor(config) { @@ -13,8 +13,9 @@ export class Phi3VImageProcessor extends ImageProcessor { ...config, do_normalize: true, do_pad: true, - pad_size: 'square', + pad_size: 'custom', do_convert_rgb: true, + do_resize: true, // Smart resizing "hd_transform" }); } calc_num_image_tokens_from_image_size(width, height) { @@ -23,12 +24,46 @@ export class Phi3VImageProcessor extends ImageProcessor { return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens)); } + /** @type {ImageProcessor['get_resize_output_image_size']} */ + get_resize_output_image_size(image, size) { + // @ts-expect-error + const hd_num = this.config.num_crops; + const [width, height] = image.size + + let ratio = width / height; + let scale = 1; + + // 
Calculate the scaling factor + while (scale * Math.ceil(scale / ratio) <= hd_num) { + scale += 1; + } + scale -= 1; + + // Compute the new dimensions + const new_w = Math.floor(scale * 336); + const new_h = Math.floor(new_w / ratio); + + return [new_w, new_h] + } + + /** @type {ImageProcessor['pad_image']} */ pad_image(pixelData, imgDims, padSize, options = {}) { - // Pad with white pixels + // Phi3V uses a custom padding strategy: + // - Pad the shortest edge to a multiple of 336 + // - Longest edge remains unchanged + // - Pad with white pixels + const [imageHeight, imageWidth] = imgDims; + let height = imageHeight, width = imageWidth; + if (imageHeight < imageWidth) { + height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE); + } else { + width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE); + } + // NOTE: Since padding is done after normalization, we need to fill with the normalized values const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]); - return super.pad_image(pixelData, imgDims, padSize, { + return super.pad_image(pixelData, imgDims, { width, height }, { center: true, constant_values, ...options, From f40f47ac497b138c0f360aa38e26bb8d25fdca5a Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 13:51:03 +0000 Subject: [PATCH 05/15] Formatting --- tests/utils/tensor.test.js | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/utils/tensor.test.js b/tests/utils/tensor.test.js index 672109985..a412f8c2b 100644 --- a/tests/utils/tensor.test.js +++ b/tests/utils/tensor.test.js @@ -210,16 +210,9 @@ describe("Tensor operations", () => { describe("slice", () => { it("should slice", async () => { - const input = new Tensor('float32', [ - 1, 2, 3, - 4, 5, 6, - 7, 8, 9, - ], [3, 3]); + const input = new Tensor("float32", [1, 2, 3, 4, 5, 6, 7, 8, 9], [3, 3]); - const target = new Tensor('float32', [ - 1, 2, - 4, 5, - ], [2, 2]); + const target = new Tensor("float32", [1, 2, 4, 5], [2, 2]); const starts = [0, 0]; const ends = [2, 2]; @@ -229,7 +222,7 @@ describe("Tensor operations", () => { const sliced = await slice(input, starts, ends, axes, steps); compare(sliced, target, 1e-3); }); - }) + }); describe("to", () => { it("float32 to int32 (number to number)", async () => { const t1 = new Tensor("float32", [1, 2, 3, 4, 5, 6], [2, 3]); From 6d989a095acc5974396692be8229de62e3eae858 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 16:40:54 +0000 Subject: [PATCH 06/15] Relax precision for musicgen test --- tests/models/musicgen/test_modeling_musicgen.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/musicgen/test_modeling_musicgen.js b/tests/models/musicgen/test_modeling_musicgen.js index 7ebf808ed..e16cf022b 100644 --- a/tests/models/musicgen/test_modeling_musicgen.js +++ b/tests/models/musicgen/test_modeling_musicgen.js @@ -27,7 +27,7 @@ export default () => { const decoder_input_ids = full([inputs.input_ids.dims[0] * model.config.decoder.num_codebooks, 1], pad_token_id); const { logits } = await model({ ...inputs, decoder_input_ids }); expect(logits.dims).toEqual([8, 1, 99]); - expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 5); + expect(logits.mean().item()).toBeCloseTo(-0.0018370470497757196, 4); }, MAX_TEST_EXECUTION_TIME, ); From 3faee12b5ce84649f7c54ff1f3cbff3aeb46271f Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 16:41:21 +0000 Subject: [PATCH 07/15] Add interpolate upscale unit test --- 
tests/utils/tensor_ops.test.js | 276 ++++++++++++++++++++++++--------- 1 file changed, 204 insertions(+), 72 deletions(-) diff --git a/tests/utils/tensor_ops.test.js b/tests/utils/tensor_ops.test.js index 3227d5f58..a0ad1f076 100644 --- a/tests/utils/tensor_ops.test.js +++ b/tests/utils/tensor_ops.test.js @@ -1,4 +1,4 @@ -import { Tensor, interpolate_4d, matmul, rfft } from "../../src/transformers.js"; +import { Tensor, interpolate_4d, matmul, rfft, slice } from "../../src/transformers.js"; import { init } from "../init.js"; // Initialise the testing environment @@ -6,7 +6,7 @@ init(); function expectToBeCloseToArray(actual, expected) { expect(actual.length).toEqual(expected.length); - actual.forEach((x, i) => expect(x).toBeCloseTo(expected[i])); + actual.forEach((x, i) => expect(x).toBeCloseTo(expected[i]), 6); } function range(start, stop = undefined, step = 1) { @@ -24,100 +24,211 @@ function range(start, stop = undefined, step = 1) { describe("Tensor operations", () => { describe("interpolate", () => { - const input = new Tensor( - "float32", - new Float32Array(2 * 3 * 4 * 5).map((_, i) => i), - [2, 3, 4, 5], - ); - - const size = [2, 3, 3, 2]; - it("bilinear", async () => { - const resized = await interpolate_4d(input, { mode: "bilinear", size }); - const target = new Float32Array( - [ + describe("downscale", () => { + const input = new Tensor( + "float32", + new Float32Array(2 * 3 * 4 * 5).map((_, i) => i), + [2, 3, 4, 5], + ); + + const size = [2, 3, 3, 2]; + it("bilinear", async () => { + const resized = await interpolate_4d(input, { mode: "bilinear", size }); + const target = new Float32Array( [ [ - [1.5833335, 4.0833335], - [8.25, 10.75], - [14.916668, 17.416668], - ], - [ - [21.583332, 24.083334], - [28.25, 30.75], - [34.916668, 37.416668], + [ + [1.5833335, 4.0833335], + [8.25, 10.75], + [14.916668, 17.416668], + ], + [ + [21.583332, 24.083334], + [28.25, 30.75], + [34.916668, 37.416668], + ], + [ + [41.583332, 44.083332], + [48.25, 50.75], + [54.916668, 57.416668], + ], ], [ - [41.583332, 44.083332], - [48.25, 50.75], - [54.916668, 57.416668], + [ + [61.583332, 64.083336], + [68.25, 70.75], + [74.916664, 77.41667], + ], + [ + [81.58333, 84.083336], + [88.25, 90.75], + [94.91667, 97.41667], + ], + [ + [101.583336, 104.08333], + [108.25, 110.75], + [114.916664, 117.416664], + ], ], - ], + ].flat(Infinity), + ); + + expectToBeCloseToArray(target, resized.data); + }); + + it("bicubic", async () => { + const resized = await interpolate_4d(input, { mode: "bicubic", size }); + + const target = new Float32Array( [ [ - [61.583332, 64.083336], - [68.25, 70.75], - [74.916664, 77.41667], + [ + [1.2987545, 3.9628172], + [8.167969, 10.832031], + [15.037184, 17.701244], + ], + [ + [21.298756, 23.962818], + [28.167969, 30.832031], + [35.037186, 37.701252], + ], + [ + [41.298756, 43.96282], + [48.16797, 50.83203], + [55.037193, 57.701256], + ], ], [ - [81.58333, 84.083336], - [88.25, 90.75], - [94.91667, 97.41667], + [ + [61.29875, 63.96282], + [68.16797, 70.83203], + [75.03719, 77.701256], + ], + [ + [81.29875, 83.96282], + [88.16797, 90.83203], + [95.03721, 97.70126], + ], + [ + [101.29875, 103.962814], + [108.16797, 110.83203], + [115.03721, 117.70127], + ], ], - [ - [101.583336, 104.08333], - [108.25, 110.75], - [114.916664, 117.416664], - ], - ], - ].flat(Infinity), - ); + ].flat(Infinity), + ); - expectToBeCloseToArray(target, resized.data); + expectToBeCloseToArray(target, resized.data); + }); }); + describe("upscale", () => { + const input = new Tensor( + "float32", + new 
Float32Array(2 * 3 * 3 * 2).map((_, i) => i), + [2, 3, 3, 2], + ); - it("bicubic", async () => { - const resized = await interpolate_4d(input, { mode: "bicubic", size }); - - const target = new Float32Array( - [ + const size = [2, 3, 4, 5]; + it("bilinear", async () => { + const resized = await interpolate_4d(input, { mode: "bilinear", size }); + const target = new Float32Array( [ [ - [1.2987545, 3.9628172], - [8.167969, 10.832031], - [15.037184, 17.701244], - ], - [ - [21.298756, 23.962818], - [28.167969, 30.832031], - [35.037186, 37.701252], + [ + [0.0, 0.1, 0.5, 0.9, 1.0], + [1.25, 1.35, 1.75, 2.15, 2.25], + [2.75, 2.85, 3.25, 3.65, 3.75], + [4.0, 4.1, 4.5, 4.9, 5.0], + ], + [ + [6.0, 6.1, 6.5, 6.9, 7.0], + [7.25, 7.35, 7.75, 8.15, 8.25], + [8.75, 8.85, 9.25, 9.65, 9.75], + [10.0, 10.1, 10.5, 10.9, 11.0], + ], + [ + [12.0, 12.1, 12.5, 12.9, 13.0], + [13.25, 13.35, 13.75, 14.15, 14.25], + [14.75, 14.85, 15.25, 15.65, 15.75], + [16.0, 16.1, 16.5, 16.9, 17.0], + ], ], [ - [41.298756, 43.96282], - [48.16797, 50.83203], - [55.037193, 57.701256], + [ + [18.0, 18.1, 18.5, 18.9, 19.0], + [19.25, 19.35, 19.75, 20.15, 20.25], + [20.75, 20.85, 21.25, 21.65, 21.75], + [22.0, 22.1, 22.5, 22.9, 23.0], + ], + [ + [24.0, 24.1, 24.5, 24.9, 25.0], + [25.25, 25.35, 25.75, 26.15, 26.25], + [26.75, 26.85, 27.25, 27.65, 27.75], + [28.0, 28.1, 28.5, 28.9, 29.0], + ], + [ + [30.0, 30.1, 30.5, 30.9, 31.0], + [31.25, 31.35, 31.75, 32.15, 32.25], + [32.75, 32.85, 33.25, 33.65, 33.75], + [34.0, 34.1, 34.5, 34.9, 35.0], + ], ], - ], + ].flat(Infinity), + ); + + expectToBeCloseToArray(target, resized.data); + }); + + it("bicubic", async () => { + const resized = await interpolate_4d(input, { mode: "bicubic", size }); + + const target = new Float32Array( [ [ - [61.29875, 63.96282], - [68.16797, 70.83203], - [75.03719, 77.701256], - ], - [ - [81.29875, 83.96282], - [88.16797, 90.83203], - [95.03721, 97.70126], + [ + [-0.253804475069046, -0.06155451014637947, 0.3564453125, 0.7744455337524414, 0.9666945934295654], + [0.9493208527565002, 1.1415706872940063, 1.5595703125, 1.977570652961731, 2.1698191165924072], + [2.8301806449890137, 3.022430181503296, 3.4404296875, 3.8584301471710205, 4.050677299499512], + [4.033306121826172, 4.225555419921875, 4.6435546875, 5.061554908752441, 5.253802299499512], + ], + [ + [5.746196269989014, 5.938446998596191, 6.3564453125, 6.774445533752441, 6.966691493988037], + [6.94932222366333, 7.14157247543335, 7.5595703125, 7.977570056915283, 8.169816970825195], + [8.830181121826172, 9.022432327270508, 9.4404296875, 9.858429908752441, 10.050675392150879], + [10.033307075500488, 10.225557327270508, 10.6435546875, 11.061556816101074, 11.253799438476562], + ], + [ + [11.746198654174805, 11.938446998596191, 12.3564453125, 12.774446487426758, 12.966689109802246], + [12.949322700500488, 13.141572952270508, 13.5595703125, 13.977571487426758, 14.16981315612793], + [14.830183029174805, 15.022432327270508, 15.4404296875, 15.858430862426758, 16.05067253112793], + [16.033309936523438, 16.225557327270508, 16.6435546875, 17.061555862426758, 17.25379753112793], + ], ], [ - [101.29875, 103.962814], - [108.16797, 110.83203], - [115.03721, 117.70127], + [ + [17.746200561523438, 17.938447952270508, 18.3564453125, 18.774446487426758, 18.966686248779297], + [18.949325561523438, 19.14157485961914, 19.5595703125, 19.977571487426758, 20.169809341430664], + [20.830184936523438, 21.02243423461914, 21.4404296875, 21.858430862426758, 22.050668716430664], + [22.03331184387207, 22.225557327270508, 22.6435546875, 
23.061555862426758, 23.25379180908203], + ], + [ + [23.746200561523438, 23.93844985961914, 24.3564453125, 24.77444839477539, 24.96668243408203], + [24.949325561523438, 25.141576766967773, 25.5595703125, 25.977571487426758, 26.1698055267334], + [26.830184936523438, 27.022436141967773, 27.4404296875, 27.858430862426758, 28.05066680908203], + [28.033313751220703, 28.225557327270508, 28.6435546875, 29.061555862426758, 29.25379180908203], + ], + [ + [29.74620246887207, 29.93844985961914, 30.3564453125, 30.77444839477539, 30.96668243408203], + [30.949325561523438, 31.141578674316406, 31.5595703125, 31.977571487426758, 32.16980743408203], + [32.8301887512207, 33.022438049316406, 33.4404296875, 33.858428955078125, 34.050662994384766], + [34.03330993652344, 34.22556686401367, 34.6435546875, 35.06155014038086, 35.253787994384766], + ], ], - ], - ].flat(Infinity), - ); + ].flat(Infinity), + ); - expectToBeCloseToArray(target, resized.data); + expectToBeCloseToArray(target, resized.data); + }); }); }); @@ -188,4 +299,25 @@ describe("Tensor operations", () => { expectToBeCloseToArray(target, result.data); }); }); + + describe("slice", () => { + it("should slice", async () => { + const input = new Tensor("float32", [1, 2, 3, 4, 5, 6, 7, 8, 9], [3, 3]); + + const target = new Float32Array( + [ + [1, 2], + [4, 5], + ].flat(Infinity), + ); + + const starts = [0, 0]; + const ends = [2, 2]; + const axes = [0, 1]; + const steps = [1, 1]; + + const result = await slice(input, starts, ends, axes, steps); + expectToBeCloseToArray(target, result.data); + }); + }); }); From 753782fe9991e4299371ffc6e30df1b9fc5f92e9 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 16:41:43 +0000 Subject: [PATCH 08/15] Move slice tensor op test --- tests/utils/tensor.test.js | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/utils/tensor.test.js b/tests/utils/tensor.test.js index a412f8c2b..684ded602 100644 --- a/tests/utils/tensor.test.js +++ b/tests/utils/tensor.test.js @@ -1,4 +1,4 @@ -import { Tensor, cat, mean, stack, layer_norm, slice } from "../../src/transformers.js"; +import { Tensor, cat, mean, stack, layer_norm } from "../../src/transformers.js"; import { init } from "../init.js"; import { compare } from "../test_utils.js"; @@ -208,21 +208,6 @@ describe("Tensor operations", () => { }); }); - describe("slice", () => { - it("should slice", async () => { - const input = new Tensor("float32", [1, 2, 3, 4, 5, 6, 7, 8, 9], [3, 3]); - - const target = new Tensor("float32", [1, 2, 4, 5], [2, 2]); - - const starts = [0, 0]; - const ends = [2, 2]; - const axes = [0, 1]; - const steps = [1, 1]; - - const sliced = await slice(input, starts, ends, axes, steps); - compare(sliced, target, 1e-3); - }); - }); describe("to", () => { it("float32 to int32 (number to number)", async () => { const t1 = new Tensor("float32", [1, 2, 3, 4, 5, 6], [2, 3]); From 535bb05ae72e0937cca492826c9ca5783ab2522e Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 17:09:21 +0000 Subject: [PATCH 09/15] Fix `image_sizes` and `num_img_tokens` --- src/models/phi3_v/image_processing_phi3_v.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/phi3_v/image_processing_phi3_v.js index 26d354548..1f70c58bd 100644 --- a/src/models/phi3_v/image_processing_phi3_v.js +++ b/src/models/phi3_v/image_processing_phi3_v.js @@ -150,12 +150,13 @@ export class Phi3VImageProcessor extends ImageProcessor { const image_sizes = new 
Tensor( 'int64', - new Array(num_images).fill([IMAGE_SIZE, IMAGE_SIZE]).flat(), + reshaped_input_sizes.flat(), [num_images, 2], ) - const image_tokens = this.calc_num_image_tokens_from_image_size(IMAGE_SIZE, IMAGE_SIZE); - const num_img_tokens = new Array(num_images).fill(image_tokens); + const num_img_tokens = reshaped_input_sizes.map( + ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height), + ) return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens }; } From a6000c0f7211ae272d8a8b6754642e1aaf6fd0b4 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 17:12:06 +0000 Subject: [PATCH 10/15] Add phi3v image processing unit tests --- .../phi3_v/test_image_processing_phi3_v.js | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 tests/models/phi3_v/test_image_processing_phi3_v.js diff --git a/tests/models/phi3_v/test_image_processing_phi3_v.js b/tests/models/phi3_v/test_image_processing_phi3_v.js new file mode 100644 index 000000000..ac12f732f --- /dev/null +++ b/tests/models/phi3_v/test_image_processing_phi3_v.js @@ -0,0 +1,101 @@ +import { AutoImageProcessor, Phi3VImageProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +const TARGET_IMAGE_SIZE = [3, 336, 336]; + +export default () => { + // Phi3VImageProcessor + // - custom image processing (patching) + describe("Phi3VImageProcessor", () => { + const model_id = "onnx-community/Phi-3.5-vision-instruct"; + + /** @type {Record} */ + const images = {}; + /** @type {Phi3VImageProcessor} */ + let processor; + beforeAll(async () => { + processor = await AutoImageProcessor.from_pretrained(model_id); + + // Load images + const gradient_image = await load_cached_image("gradient_1280x640"); + const white_image = await load_cached_image("white_image"); + + images.gradient_image = gradient_image; + images.white_image = white_image; + }, MAX_PROCESSOR_LOAD_TIME); + + it( + "num_crops=0", + async () => { + const { pixel_values } = await processor(images.gradient_image, { num_crops: 0 }); + expect(pixel_values.dims).toEqual([1, 1, 3, 336, 336]); + expect(pixel_values.mean().item()).toBeCloseTo(0.18679802119731903, 2); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "square image (num_crops=4)", + async () => { + const num_crops = 4; + const { pixel_values, image_sizes, num_img_tokens } = await processor(images.white_image, { num_crops }); + expect(pixel_values.dims).toEqual([1, 1 + num_crops, ...TARGET_IMAGE_SIZE]); + expect(pixel_values.flatten(2).mean(2).tolist()).toBeCloseToNested([[2.050372362136841, 2.050372362136841, 2.050372362136841, 2.050372362136841, 2.050372362136841]], 1); + expect(pixel_values.mean().item()).toBeCloseTo(2.050372362136841, 1); + + expect(image_sizes.tolist()).toEqual([[672n, 672n]]); + expect(num_img_tokens).toEqual([757]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "non-square image (num_crops=4)", + async () => { + const num_crops = 4; + const { pixel_values, image_sizes, num_img_tokens } = await processor(images.gradient_image, { num_crops }); + expect(pixel_values.dims).toEqual([1, 1 + num_crops, ...TARGET_IMAGE_SIZE]); + + // NOTE: We use a slightly different cropping strategy to the python implementation, + // meaning the following tests would fail.
+ // expect(pixel_values.flatten(2).mean(2).tolist()).toBeCloseToNested([[ + // 0.18679802119731903, -0.5585645437240601, 0.9321606755256653, 0.0, 0.0, + // ]], 1); + // expect(pixel_values.mean().item()).toBeCloseTo(0.11207880824804306, 6); + + expect(image_sizes.tolist()).toEqual([[336n, 672n]]); + expect(num_img_tokens).toEqual([457]); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "multiple images (num_crops=0)", + async () => { + const { pixel_values, image_sizes, num_img_tokens } = await processor([images.gradient_image, images.white_image], { num_crops: 0 }); + expect(pixel_values.dims).toEqual([2, 1, ...TARGET_IMAGE_SIZE]); + expect(image_sizes.tolist()).toEqual([ + [336n, 672n], + [672n, 672n], + ]); + expect(num_img_tokens).toEqual([457, 757]); + }, + MAX_TEST_EXECUTION_TIME, + ); + it( + "multiple images (num_crops=4)", + async () => { + const num_crops = 4; + const { pixel_values, image_sizes, num_img_tokens } = await processor([images.gradient_image, images.white_image], { num_crops }); + expect(pixel_values.dims).toEqual([2, 1 + num_crops, ...TARGET_IMAGE_SIZE]); + expect(image_sizes.tolist()).toEqual([ + [336n, 672n], + [672n, 672n], + ]); + expect(num_img_tokens).toEqual([457, 757]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +}; From 0ab878ee72fd63f5c78741efd22ce3f6b24bed8b Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 17:24:51 +0000 Subject: [PATCH 11/15] Use ONNX slice op for improved performance --- .../idefics3/image_processing_idefics3.js | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/models/idefics3/image_processing_idefics3.js b/src/models/idefics3/image_processing_idefics3.js index 0da6c2cc7..8864661c9 100644 --- a/src/models/idefics3/image_processing_idefics3.js +++ b/src/models/idefics3/image_processing_idefics3.js @@ -3,7 +3,7 @@ import { ImageProcessor, } from "../../base/image_processors_utils.js"; -import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js"; +import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js"; export class Idefics3ImageProcessor extends ImageProcessor { constructor(config) { @@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor { const optimal_width = Math.ceil(width / num_splits_w); // Iterate through each row and column - for (let r = 0; r < num_splits_h; r++) { - for (let c = 0; c < num_splits_w; c++) { - // Calculate the starting point of the crop - const start_x = c * optimal_width; - const start_y = r * optimal_height; - - // Calculate the ending point of the crop - const end_x = Math.min(start_x + optimal_width, width); - const end_y = Math.min(start_y + optimal_height, height); - - // Crop the image - frames.push(pixel_values.slice(null, null, [start_y, end_y], [start_x, end_x])); + for (let r = 0; r < num_splits_h; ++r) { + for (let c = 0; c < num_splits_w; ++c) { + let start_x, start_y, end_x, end_y; + if (r === num_splits_h - 1) { // At bottom + start_y = height - optimal_height; + end_y = height; + } else { + start_y = r * optimal_height; + end_y = (r + 1) * optimal_height; + } + if (c === num_splits_w - 1) { // At right + start_x = width - optimal_width; + end_x = width; + } else { + start_x = c * optimal_width; + end_x = (c + 1) * optimal_width; + } + + const starts = [start_y, start_x]; + const ends = [end_y, end_x]; + + const patch = await slice(pixel_values, starts, ends, [2, 3]); + frames.push(patch); } } From 678c549ff1672a9594b5f172ae8d8bc21e98ff37 Mon Sep 17 00:00:00 2001 From: Joshua 
Lochner Date: Wed, 11 Dec 2024 20:25:48 +0000 Subject: [PATCH 12/15] Fix phi3v image processing --- src/models/phi3_v/image_processing_phi3_v.js | 36 ++++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/models/phi3_v/image_processing_phi3_v.js b/src/models/phi3_v/image_processing_phi3_v.js index 1f70c58bd..5e032b294 100644 --- a/src/models/phi3_v/image_processing_phi3_v.js +++ b/src/models/phi3_v/image_processing_phi3_v.js @@ -17,6 +17,8 @@ export class Phi3VImageProcessor extends ImageProcessor { do_convert_rgb: true, do_resize: true, // Smart resizing "hd_transform" }); + + this._num_crops = config.num_crops; } calc_num_image_tokens_from_image_size(width, height) { // @ts-expect-error @@ -26,8 +28,7 @@ export class Phi3VImageProcessor extends ImageProcessor { /** @type {ImageProcessor['get_resize_output_image_size']} */ get_resize_output_image_size(image, size) { - // @ts-expect-error - const hd_num = this.config.num_crops; + const hd_num = this._num_crops; const [width, height] = image.size let ratio = width / height; @@ -50,16 +51,11 @@ export class Phi3VImageProcessor extends ImageProcessor { /** @type {ImageProcessor['pad_image']} */ pad_image(pixelData, imgDims, padSize, options = {}) { // Phi3V uses a custom padding strategy: - // - Pad the shortest edge to a multiple of 336 - // - Longest edge remains unchanged + // - Pad to a multiple of 336 // - Pad with white pixels const [imageHeight, imageWidth] = imgDims; - let height = imageHeight, width = imageWidth; - if (imageHeight < imageWidth) { - height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE); - } else { - width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE); - } + const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE); + const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE); // NOTE: Since padding is done after normalization, we need to fill with the normalized values const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]); @@ -73,10 +69,10 @@ export class Phi3VImageProcessor extends ImageProcessor { async _call(images, { num_crops = null, } = {}) { - num_crops ??= this.config.num_crops; - if (num_crops === 1 || sqrt(num_crops) % 1 !== 0) { - // Disallow num_crops==1 since it won't add extra information - throw new Error("num_crops must be a square number not equal to 1"); + // @ts-expect-error + this._num_crops = num_crops ??= this.config.num_crops; + if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) { + throw new Error("num_crops must be a square number >= 4"); } if (!Array.isArray(images)) { @@ -141,6 +137,7 @@ export class Phi3VImageProcessor extends ImageProcessor { all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0)); } else { // Only use the global image + // NOTE: Not currently supported in modelling code all_pixel_values.push(batch_pixel_values); } } @@ -148,15 +145,18 @@ export class Phi3VImageProcessor extends ImageProcessor { // [num_images, 1 + num_crops, num_channels=3, height, width] const pixel_values = stack(all_pixel_values, 0); + // Calculate padded image sizes + const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE))); + const image_sizes = new Tensor( 'int64', - reshaped_input_sizes.flat(), + sizes.flat(), [num_images, 2], - ) + ); - const num_img_tokens = reshaped_input_sizes.map( + const num_img_tokens = sizes.map( ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height), - ) + ); return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, 
num_img_tokens }; } From 907cb076295284744b49f680604e5cf509d71705 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 20:26:03 +0000 Subject: [PATCH 13/15] Support passing parameters to phi3v processor call function --- src/models/phi3_v/processing_phi3_v.js | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/models/phi3_v/processing_phi3_v.js b/src/models/phi3_v/processing_phi3_v.js index 36a8bd06f..d07e9b176 100644 --- a/src/models/phi3_v/processing_phi3_v.js +++ b/src/models/phi3_v/processing_phi3_v.js @@ -17,7 +17,11 @@ export class Phi3VProcessor extends Processor { * @param {...any} args * @returns {Promise} */ - async _call(text, images = null, ...args) { + async _call(text, images = null, { + padding = true, + truncation = true, + num_crops = null, + } = {}) { if (!Array.isArray(text)) { text = [text]; @@ -25,17 +29,14 @@ export class Phi3VProcessor extends Processor { let text_inputs, image_inputs; if (images) { - image_inputs = await this.image_processor(images); + image_inputs = await this.image_processor(images, { num_crops }); const { num_img_tokens } = image_inputs; // The original implementation adds a bos_token before the image tokens // TODO: Check if this affects performance, since it looks like a bug in the original implementation const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i]))); - text_inputs = this.tokenizer(prompt_chunks, { - padding: true, - truncation: true, - }); + text_inputs = this.tokenizer(prompt_chunks, { padding, truncation }); // The model expects image tokens to be negative, so we negate the image token ids const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0]; From 7832324dd506ba86b0f8f69fa52412dc2374cbec Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 11 Dec 2024 20:26:35 +0000 Subject: [PATCH 14/15] Update phi3v processor unit tests --- .../phi3_v/test_image_processing_phi3_v.js | 28 +++----- tests/processors.test.js | 69 ++++++++++++++++++- 2 files changed, 78 insertions(+), 19 deletions(-) diff --git a/tests/models/phi3_v/test_image_processing_phi3_v.js b/tests/models/phi3_v/test_image_processing_phi3_v.js index ac12f732f..cbf21e897 100644 --- a/tests/models/phi3_v/test_image_processing_phi3_v.js +++ b/tests/models/phi3_v/test_image_processing_phi3_v.js @@ -26,16 +26,6 @@ export default () => { images.white_image = white_image; }, MAX_PROCESSOR_LOAD_TIME); - it( - "num_crops=0", - async () => { - const { pixel_values } = await processor(images.gradient_image, { num_crops: 0 }); - expect(pixel_values.dims).toEqual([1, 1, 3, 336, 336]); - expect(pixel_values.mean().item()).toBeCloseTo(0.18679802119731903, 2); - }, - MAX_TEST_EXECUTION_TIME, - ); - it( "square image (num_crops=4)", async () => { @@ -70,19 +60,21 @@ export default () => { }, MAX_TEST_EXECUTION_TIME, ); + it( - "multiple images (num_crops=0)", + "single image (num_crops=16)", async () => { - const { pixel_values, image_sizes, num_img_tokens } = await processor([images.gradient_image, images.white_image], { num_crops: 0 }); - expect(pixel_values.dims).toEqual([2, 1, ...TARGET_IMAGE_SIZE]); - expect(image_sizes.tolist()).toEqual([ - [336n, 672n], - [672n, 672n], - ]); - expect(num_img_tokens).toEqual([457, 757]); + const num_crops = 16; + const { pixel_values, image_sizes, num_img_tokens } = await processor(images.gradient_image, { num_crops }); + expect(pixel_values.dims).toEqual([1, 1 + num_crops, 3, 336, 336]); + 
expect(pixel_values.mean().item()).toBeCloseTo(0.4677375257015228, 1); + + expect(image_sizes.tolist()).toEqual([[1008n, 1680n]]); + expect(num_img_tokens).toEqual([2353]); }, MAX_TEST_EXECUTION_TIME, ); + it( "multiple images (num_crops=4)", async () => { diff --git a/tests/processors.test.js b/tests/processors.test.js index e35e555d2..2b727eeb9 100644 --- a/tests/processors.test.js +++ b/tests/processors.test.js @@ -37,8 +37,8 @@ const avg = (array) => sum(array) / array.length; const MODELS = { florence2: "Xenova/tiny-random-Florence2ForConditionalGeneration", qwen2_vl: "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration", - idefics3: "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration", paligemma: "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration", + phi3_v: "onnx-community/Phi-3.5-vision-instruct", }; describe("Processors", () => { @@ -517,5 +517,72 @@ describe("Processors", () => { }, MAX_TEST_TIME, ); + + describe( + "Phi3VProcessor", + () => { + /** @type {import('../src/transformers.js').Phi3VProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(MODELS.phi3_v, { + // Use legacy to match python version + legacy: true, + }); + images = { + white_image: await load_cached_image("white_image"), + }; + }); + + const create_prompt = (text, images = []) => { + const placeholder = images.map((_, i) => `<|image_${i + 1}|>\n`).join(""); + const messages = [{ role: "user", content: placeholder + text }]; + const prompt = processor.tokenizer.apply_chat_template(messages, { tokenize: false, add_generation_prompt: true }); + return prompt; + }; + + it("Text-only", async () => { + const prompt = create_prompt("Hi there."); + const { input_ids, pixel_values } = await processor(prompt); + expect(input_ids.dims).toEqual([1, 11]); + expect(pixel_values).toBeUndefined(); + }); + + it("Single image & text", async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 773 */ 770]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[672n, 672n]]); + }); + + it("Single image (num_crops=16) & text", async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs, { num_crops: 16 }); + expect(input_ids.dims).toEqual([1, /* 2525 */ 2522]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 17, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[1344n, 1344n]]); + }); + + it("Multiple images & text", async () => { + const imgs = [images.white_image, images.white_image]; + const prompt = create_prompt("Describe these images.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 1533 */ 1527]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([2, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([ + [672n, 672n], + [672n, 672n], + ]); + }); + }, + MAX_TEST_TIME, + ); }); }); From 7a58f424f890a47c03fea2c17fe79e330fd1da98 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Sun, 15 
Dec 2024 13:20:39 +0000 Subject: [PATCH 15/15] Move phi3_v processor unit test to folder --- tests/models/phi3_v/test_processor_phi3_v.js | 87 ++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 tests/models/phi3_v/test_processor_phi3_v.js diff --git a/tests/models/phi3_v/test_processor_phi3_v.js b/tests/models/phi3_v/test_processor_phi3_v.js new file mode 100644 index 000000000..6896046ef --- /dev/null +++ b/tests/models/phi3_v/test_processor_phi3_v.js @@ -0,0 +1,87 @@ +import { AutoProcessor, Phi3VProcessor } from "../../../src/transformers.js"; + +import { load_cached_image } from "../../asset_cache.js"; +import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js"; + +export default () => { + const model_id = "onnx-community/Phi-3.5-vision-instruct"; + + describe("Phi3VProcessor", () => { + /** @type {Phi3VProcessor} */ + let processor; + let images = {}; + + beforeAll(async () => { + processor = await AutoProcessor.from_pretrained(model_id, { + // Use legacy to match python version + legacy: true, + }); + images = { + white_image: await load_cached_image("white_image"), + }; + }, MAX_PROCESSOR_LOAD_TIME); + + const create_prompt = (text, images = []) => { + const placeholder = images.map((_, i) => `<|image_${i + 1}|>\n`).join(""); + const messages = [{ role: "user", content: placeholder + text }]; + const prompt = processor.tokenizer.apply_chat_template(messages, { tokenize: false, add_generation_prompt: true }); + return prompt; + }; + + it( + "Text-only", + async () => { + const prompt = create_prompt("Hi there."); + const { input_ids, pixel_values } = await processor(prompt); + expect(input_ids.dims).toEqual([1, 11]); + expect(pixel_values).toBeUndefined(); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image & text", + async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 773 */ 770]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[672n, 672n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Single image (num_crops=16) & text", + async () => { + const imgs = [images.white_image]; + const prompt = create_prompt("Describe this image.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs, { num_crops: 16 }); + expect(input_ids.dims).toEqual([1, /* 2525 */ 2522]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([1, 17, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([[1344n, 1344n]]); + }, + MAX_TEST_EXECUTION_TIME, + ); + + it( + "Multiple images & text", + async () => { + const imgs = [images.white_image, images.white_image]; + const prompt = create_prompt("Describe these images.", imgs); + const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, imgs); + expect(input_ids.dims).toEqual([1, /* 1533 */ 1527]); + expect(attention_mask.dims).toEqual(input_ids.dims); + expect(pixel_values.dims).toEqual([2, 5, 3, 336, 336]); + expect(image_sizes.tolist()).toEqual([ + [672n, 672n], + [672n, 672n], + ]); + }, + MAX_TEST_EXECUTION_TIME, + ); + }); +};
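
Note (illustrative, not part of the patch series): the crop logic introduced in PATCH 11 above computes fixed-size patch boxes and anchors the last row and column to the image edge, so every patch keeps the full optimal_height x optimal_width size (the final patches may overlap their neighbours rather than shrink). A minimal standalone sketch of that box computation follows; computeCropBoxes is a hypothetical helper name used only for this example.

function computeCropBoxes(height, width, num_splits_h, num_splits_w) {
  // Patch size, rounded up so num_splits patches always cover the full extent.
  const optimal_height = Math.ceil(height / num_splits_h);
  const optimal_width = Math.ceil(width / num_splits_w);

  const boxes = [];
  for (let r = 0; r < num_splits_h; ++r) {
    // Anchor the last row to the bottom edge so the box keeps its full height.
    const start_y = r === num_splits_h - 1 ? height - optimal_height : r * optimal_height;
    for (let c = 0; c < num_splits_w; ++c) {
      // Anchor the last column to the right edge so the box keeps its full width.
      const start_x = c === num_splits_w - 1 ? width - optimal_width : c * optimal_width;
      boxes.push({
        starts: [start_y, start_x],
        ends: [start_y + optimal_height, start_x + optimal_width],
      });
    }
  }
  return boxes;
}

// Example: splitting a 700x500 image into 3x2 patches gives 234x250 boxes; the last row
// starts at 466 (700 - 234) rather than 468, overlapping its neighbour by 2 px so that
// all six patches share the same shape.
// computeCropBoxes(700, 500, 3, 2);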
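
Usage sketch (illustrative, not part of the patch series): the snippet below shows how the Phi3VProcessor added above could be driven end to end, using only calls exercised in the unit tests (AutoProcessor.from_pretrained, apply_chat_template with the <|image_1|> placeholder, and the num_crops option). The package import path, the image URL, and the choice of num_crops = 4 are assumptions made for this example.

import { AutoProcessor, RawImage } from "@huggingface/transformers";

// Load the processor; legacy: true matches the Python reference behaviour, as in the tests.
const processor = await AutoProcessor.from_pretrained("onnx-community/Phi-3.5-vision-instruct", {
  legacy: true,
});

// Build a prompt containing one image placeholder, mirroring the create_prompt helper in the tests.
const image = await RawImage.fromURL("https://example.com/cat.jpg"); // hypothetical image URL
const messages = [{ role: "user", content: "<|image_1|>\nDescribe this image." }];
const prompt = processor.tokenizer.apply_chat_template(messages, {
  tokenize: false,
  add_generation_prompt: true,
});

// num_crops must be a square number >= 4 (enforced in PATCH 12); larger values trade speed for detail.
const { input_ids, attention_mask, pixel_values, image_sizes } = await processor(prompt, [image], {
  num_crops: 4,
});

// pixel_values has shape [num_images, 1 + num_crops, 3, 336, 336] and image_sizes holds the
// padded (height, width) of each input image, as asserted in the processor unit tests.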