From 73ab4d0b76b92426ae73cb74018f2662cdefd062 Mon Sep 17 00:00:00 2001 From: Lutz Roeder Date: Sat, 18 Jan 2025 13:51:13 -0800 Subject: [PATCH] Update executorch.js (#1175) --- source/executorch.js | 104 ++++++++++++++-------- source/python.js | 2 +- source/pytorch-metadata.json | 168 +++++++++++++++++++++++++++++++++++ tools/pytorch_script.py | 38 ++++++++ 4 files changed, 276 insertions(+), 36 deletions(-) diff --git a/source/executorch.js b/source/executorch.js index acafd213f2..b6f13f9188 100644 --- a/source/executorch.js +++ b/source/executorch.js @@ -49,10 +49,14 @@ executorch.Graph = class { this.outputs = []; this.nodes = []; const values = new Map(); - values.map = (arg) => { - if (!values.has(arg)) { - const v = plan.values[arg].val; - if (v instanceof executorch.schema.Tensor || v instanceof executorch.schema.TensorList) { + values.map = (index, output) => { + if (!values.has(index)) { + const v = plan.values[index].val; + const tensor = v instanceof executorch.schema.Tensor || v instanceof executorch.schema.TensorList; + if (output && !tensor) { + const value = [new executorch.Value(index.toString(), null, null)]; + values.set(index, { type: null, value }); + } else if (tensor) { const tensors = v instanceof executorch.schema.Tensor ? [v] : Array.from(v.items).map((arg) => plan.values[arg].val); const list = []; for (let i = 0; i < tensors.length; i++) { @@ -62,26 +66,27 @@ executorch.Graph = class { if (v.data_buffer_idx > 0) { initializer = new executorch.Tensor(tensor); } - const identifier = tensors.length > 1 ? `${arg}.${i}` : arg.toString(); - list.push(new executorch.Value(identifier, type, initializer)); + const identifier = tensors.length > 1 ? `${index}.${i}` : index.toString(); + const value = new executorch.Value(identifier, type, initializer); + list.push(value); } - values.set(arg, { type: null, value: list }); + values.set(index, { type: null, value: list }); } else if (v instanceof executorch.schema.Bool) { - values.set(arg, { type: 'int64', value: v.bool_val }); + values.set(index, { type: 'int64', value: v.bool_val }); } else if (v instanceof executorch.schema.Int) { - values.set(arg, { type: 'int64', value: v.int_val }); + values.set(index, { type: 'int64', value: v.int_val }); } else if (v instanceof executorch.schema.IntList) { const list = v.items.map((index) => plan.values[index].val.int_val); - values.set(arg, { type: 'int64[]', value: list }); + values.set(index, { type: 'int64[]', value: list }); } else if (v instanceof executorch.schema.Double) { - values.set(arg, { type: 'float64', value: v.double_val }); + values.set(index, { type: 'float64', value: v.double_val }); } else if (v instanceof executorch.schema.Null) { - values.set(arg, { type: 'attribute', value: null }); + values.set(index, { type: 'attribute', value: null }); } else { throw new Error('Value type not implemented.'); } } - return values.get(arg); + return values.get(index); }; for (const input of plan.inputs) { const value = values.map(input); @@ -128,6 +133,7 @@ executorch.Node = class { this.name = ''; this.inputs = []; this.outputs = []; + this.attributes = []; const instr_args = instruction.instr_args; if (instr_args instanceof executorch.schema.KernelCall) { const op = plan.operators[instr_args.op_index]; @@ -135,6 +141,9 @@ executorch.Node = class { const identifier = op.overload ? 
`${op.name}.${op.overload}` : op.name;
             const schemas = execution.invoke('torch._C._jit_get_schemas_for_operator', [op.name]);
             const schema = schemas.find((schema) => schema.name === op.name && schema.overload_name === op.overload);
+            if (!schema) {
+                throw new executorch.Error(`Operator schema for '${identifier}' not found.`);
+            }
             const category = schema && schema.category ? schema.category : '';
             const alias = (arg) => arg && arg.alias_info && arg.alias_info.before_set.length === 1 ? arg.alias_info.before_set[0] : null;
             const outputs = new Set(schema && Array.isArray(schema.returns) ? schema.returns.map((arg) => alias(arg)).filter((alias) => alias !== null) : []);
@@ -143,33 +152,53 @@
             let i = 0;
             const args = instr_args.args;
             for (; i < schema.arguments.length; i++) {
-                const v = args[i];
+                const index = args[i];
                 const arg = schema && i < schema.arguments.length ? schema.arguments[i] : null;
                 const output = arg ? alias(schema.arguments[i]) : null;
                 if (output && outputs.has(output)) {
-                    inputs.set(output, v);
+                    inputs.set(output, index);
                     continue;
                 }
                 const name = arg ? arg.name : i.toString();
-                const value = values.map(v);
+                const value = values.map(index);
                 const argument = new executorch.Argument(name, value.value, value.type);
                 this.inputs.push(argument);
             }
             for (let j = 0; j < schema.returns.length; j++) {
                 const ret = schema.returns[j];
                 const output = alias(ret);
-                const v = output && inputs.has(output) ? inputs.get(output) : args[i++];
+                let index = args[i++];
+                index = output && inputs.has(output) ? inputs.get(output) : index;
                 const name = ret.name;
-                const value = values.map(v);
+                const value = values.map(index, true);
                 const argument = new executorch.Argument(name || '', value.value, value.type);
                 this.outputs.push(argument);
             }
         } else if (instr_args instanceof executorch.schema.DelegateCall) {
             const delegate = plan.delegates[instr_args.delegate_index];
+            const args = instr_args.args;
             const name = delegate.id;
             this.type = { name };
+            switch (name) {
+                case 'XnnpackBackend':
+                case 'CoreMLBackend': {
+                    const input = values.map(args[0]);
+                    const output = values.map(args[1], true);
+                    this.inputs.push(new executorch.Argument('input', input.value, input.type));
+                    this.outputs.push(new executorch.Argument('output', output.value, output.type));
+                    break;
+                }
+                default: {
+                    throw new executorch.Error(`ExecuTorch delegate '${name}' not implemented.`);
+                }
+            }
+            for (const spec of delegate.compile_specs) {
+                const value = ArrayBuffer.isView(spec.value) ?
Array.from(spec.value) : spec.value;
+                const attribute = new executorch.Argument(spec.key, value);
+                this.attributes.push(attribute);
+            }
         } else {
-            throw new Error('Instruction argument not implemented.');
+            throw new executorch.Error(`Instruction type '${instr_args.constructor.name}' not implemented.`);
         }
     }
 };
@@ -177,23 +206,22 @@ executorch.Node = class {
 executorch.TensorType = class {
 
     constructor(tensor) {
-        const ScalarType = executorch.schema.ScalarType;
-        switch (tensor.scalar_type) {
-
-            case ScalarType.BOOL: this.dataType = 'boolean'; break;
-            case ScalarType.BYTE: this.dataType = 'uint8'; break;
-            case ScalarType.CHAR: this.dataType = 'int8'; break;
-            case ScalarType.SHORT: this.dataType = 'int16'; break;
-            case ScalarType.INT: this.dataType = 'int32'; break;
-            case ScalarType.LONG: this.dataType = 'int64'; break;
-            case ScalarType.HALF: this.dataType = 'float16'; break;
-            case ScalarType.FLOAT: this.dataType = 'float32'; break;
-            case ScalarType.DOUBLE: this.dataType = 'float64'; break;
-            case ScalarType.UINT16: this.dataType = 'uint16'; break;
-            case ScalarType.UINT32: this.dataType = 'uint32'; break;
-            case ScalarType.UINT64: this.dataType = 'uint64'; break;
-            default: throw new executorch.Error(`Unknown tensor data type '${tensor.scalar_type}'.`);
+        executorch.TensorType._types = executorch.TensorType._types || [
+            'uint8',
+            'int8', 'int16', 'int32', 'int64',
+            'float16', 'float32', 'float64',
+            'complex32', 'complex64', 'complex128',
+            'boolean',
+            'qint8', 'quint8', 'qint32',
+            'bfloat16',
+            'quint4x2', 'quint2x4', 'bits1x8', 'bits2x4', 'bits4x2', 'bits8', 'bits16',
+            'float8e5m2', 'float8e4m3fn', 'float8e5m2fnuz', 'float8e4m3fnuz',
+            'uint16', 'uint32', 'uint64'
+        ];
+        if (tensor.scalar_type >= executorch.TensorType._types.length) {
+            throw new executorch.Error(`Unknown tensor data type '${tensor.scalar_type}'.`);
         }
+        this.dataType = executorch.TensorType._types[tensor.scalar_type];
         this.shape = new executorch.TensorShape(Array.from(tensor.sizes));
     }
 
diff --git a/source/python.js b/source/python.js
index 43122e4b7d..22d21e5b67 100644
--- a/source/python.js
+++ b/source/python.js
@@ -18340,7 +18340,7 @@ python.Execution = class {
             torch.quint2x4 = new torch.dtype(17, 'quint2x4');
             torch.bits1x8 = new torch.dtype(18, 'bits1x8');
             torch.bits2x4 = new torch.dtype(19, 'bits2x4');
-            torch.bits2x4 = new torch.dtype(20, 'bits2x4');
+            torch.bits4x2 = new torch.dtype(20, 'bits4x2');
             torch.bits8 = new torch.dtype(21, 'bits8');
             torch.bits16 = new torch.dtype(22, 'bits16');
             torch.float8_e5m2 = new torch.dtype(23, 'float8_e5m2', 1);
diff --git a/source/pytorch-metadata.json b/source/pytorch-metadata.json
index c6c63ab853..b5f35c77b0 100755
--- a/source/pytorch-metadata.json
+++ b/source/pytorch-metadata.json
@@ -1172,6 +1172,24 @@
     {
         "name": "aten::unsqueeze_copy.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)"
     },
+    {
+        "name": "aten::squeeze_copy(Tensor self) -> Tensor"
+    },
+    {
+        "name": "aten::squeeze_copy.dim(Tensor self, int dim) -> Tensor"
+    },
+    {
+        "name": "aten::squeeze_copy.dims(Tensor self, int[] dim) -> Tensor"
+    },
+    {
+        "name": "aten::squeeze_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
+    },
+    {
+        "name": "aten::squeeze_copy.dim_out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)"
+    },
+    {
+        "name": "aten::squeeze_copy.dims_out(Tensor self, int[] dim, *, Tensor(a!)
out) -> Tensor(a!)" + }, { "name": "aten::split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]" }, @@ -1214,6 +1232,33 @@ { "name": "aten::view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" }, + { + "name": "aten::copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor" + }, + { + "name": "aten::copy.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)" + }, + { + "name": "aten::copy.t(t[](a) self) -> t[]" + }, + { + "name": "aten::copy.Dict_str(Dict(str, t)(a) self) -> Dict(str, t)" + }, + { + "name": "aten::copy.Dict_int(Dict(int, t)(a) self) -> Dict(int, t)" + }, + { + "name": "aten::copy.Dict_bool(Dict(bool, t)(a) self) -> Dict(bool, t)" + }, + { + "name": "aten::copy.Dict_float(Dict(float, t)(a) self) -> Dict(float, t)" + }, + { + "name": "aten::copy.Dict_complex(Dict(complex, t)(a) self) -> Dict(complex, t)" + }, + { + "name": "aten::copy.Dict_Tensor(Dict(Tensor, t)(a) self) -> Dict(Tensor, t)" + }, { "name": "aten::smooth_l1_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta, *, Tensor(a!) grad_input) -> Tensor(a!)" }, @@ -1797,6 +1842,12 @@ { "name": "aten::diag_embed.out(Tensor self, int offset=0, int dim1=-2, int dim2=-1, *, Tensor(a!) out) -> Tensor(a!)" }, + { + "name": "aten::expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor" + }, + { + "name": "aten::expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)" + }, { "name": "aten::view_copy(Tensor self, SymInt[] size) -> Tensor" }, @@ -2653,6 +2704,9 @@ { "name": "aten::masked_fill.Tensor_out(Tensor self, Tensor mask, Tensor value, *, Tensor(a!) out) -> Tensor(a!)" }, + { + "name": "aten::_local_scalar_dense(Tensor self) -> Scalar" + }, { "name": "aten::_unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor)" }, @@ -7352,6 +7406,120 @@ { "name": "executorch_prim::et_view.default(Tensor self, int[] size) -> (Tensor out)" }, + { + "name": "executorch_prim::sub.Scalar(Scalar a, Scalar b) -> Scalar" + }, + { + "name": "executorch_prim::mul.Scalar(Scalar a, Scalar b) -> Scalar" + }, + { + "name": "executorch_prim::floordiv.Scalar(Scalar a, Scalar b) -> Scalar" + }, + { + "name": "executorch_prim::add.Scalar(Scalar a, Scalar b) -> Scalar" + }, + { + "name": "llama::sdpa_with_kv_cache.out(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, Tensor(b!) value_cache, int start_pos, int seq_len, Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(c!) out) -> Tensor(c!)" + }, + { + "name": "llama::sdpa.out(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(a!) out) -> Tensor(a!)" + }, + { + "name": "quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)" + }, + { + "name": "quantized_decomposed::choose_qparams.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)" + }, + { + "name": "quantized_decomposed::embedding_4bit(Tensor weight, Tensor weight_scales, Tensor? 
weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor" + }, + { + "name": "quantized_decomposed::embedding_4bit.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::embedding_4bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)" + }, + { + "name": "quantized_decomposed::embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)" + }, + { + "name": "quantized_decomposed::dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::dequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::dequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::add(Tensor a, float a_scale, int a_zero_point, int a_quant_min, int a_quant_max, Tensor b, float b_scale, int b_zero_point, int b_quant_min, int b_quant_max, float out_scale, int out_zero_point, int out_quant_min, int out_quant_max) -> Tensor qc" + }, + { + "name": "quantized_decomposed::add.scalar(Tensor qa, float a_scale, int a_zero_point, int a_quant_min, int a_quant_max, ScalarType a_dtype, Scalar b, float out_scale, int out_zero_point, int out_quant_min, int out_quant_max, ScalarType out_dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::add_relu(Tensor a, float a_scale, int a_zero_point, int a_quant_min, int a_quant_max, Tensor b, float b_scale, int b_zero_point, int b_quant_min, int b_quant_max, float out_scale, int out_zero_point, int out_quant_min, int out_quant_max) -> Tensor qc" + }, + { + "name": "quantized_decomposed::dequantize_per_channel(Tensor input, Tensor scales, Tensor? zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::fake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max) -> Tensor" + }, + { + "name": "quantized_decomposed::quantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)" + }, + { + "name": "quantized_decomposed::mixed_linear(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? 
dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::dequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::quantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::quantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::choose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" + }, + { + "name": "quantized_decomposed::choose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" + }, + { + "name": "quantized_decomposed::quantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size) -> Tensor" + }, + { + "name": "quantized_decomposed::dequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensor" + }, + { + "name": "quantized_decomposed::embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor" + }, + { + "name": "quantized_decomposed::embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor" + }, + { + "name": "quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)" + }, + { + "name": "quantized_decomposed::mixed_mm(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points) -> Tensor" + }, + { + "name": "quantized_decomposed::_choose_qparams_per_token_asymmetric_impl(Tensor input, ScalarType dtype) -> (Tensor, Tensor)" + }, + { + "name": "quantized_decomposed::quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensor" + }, + { + "name": "llama::fast_hadamard_transform.out(Tensor mat, *, Tensor(a!) 
out) -> Tensor(a!)" + }, { "name": "__torch__.torch.classes.rnn.CellParamsBase", "inputs": [ diff --git a/tools/pytorch_script.py b/tools/pytorch_script.py index a48e96ee9a..7391a1d99d 100644 --- a/tools/pytorch_script.py +++ b/tools/pytorch_script.py @@ -59,6 +59,10 @@ def _write_metadata(metadata): 'aten::fft(Tensor self, int signal_ndim, bool normalized=False) -> Tensor', 'aten::grid_sampler.legacy(Tensor input, Tensor grid, int interpolation_mode, int padding_mode) -> Tensor', 'executorch_prim::et_view.default(Tensor self, int[] size) -> (Tensor out)', + 'executorch_prim::add.Scalar(Scalar a, Scalar b) -> Scalar', + 'executorch_prim::sub.Scalar(Scalar a, Scalar b) -> Scalar', + 'executorch_prim::mul.Scalar(Scalar a, Scalar b) -> Scalar', + 'executorch_prim::floordiv.Scalar(Scalar a, Scalar b) -> Scalar', 'neuron::_execute_neuron(__torch__.torch.classes.neuron.Model _0, Tensor[] _1) -> Tensor[] _0', 'neuron::_from_neuron(Tensor _0) -> Tensor _0', 'neuron::_init_neuron() -> ()', @@ -211,9 +215,43 @@ def _write_metadata(metadata): 'horizon::scale_quanti(Tensor x, Tensor scale, Tensor zero_point, int d, int min, int max, bool flag1, bool flat2, str str1, str str2) -> Tensor', 'prim::isinstance(Any to_check) -> bool', 'prim::shape(Tensor self) -> int[]', + 'llama::fast_hadamard_transform.out(Tensor mat, *, Tensor(a!) out) -> Tensor(a!)', + 'llama::sdpa_with_kv_cache.out(Tensor query, Tensor key, Tensor value, Tensor(a!) key_cache, Tensor(b!) value_cache, int start_pos, int seq_len, Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(c!) out) -> Tensor(c!)', + 'llama::sdpa.out(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, float? scale=None, *, Tensor(a!) out) -> Tensor(a!)', 'quantized_decomposed::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)', 'quantized_decomposed::dequantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None, Tensor(a!) out) -> Tensor(a!)', 'quantized_decomposed::dequantize_per_tensor.Tensor_out(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None, Tensor(a!) out) -> Tensor(a!)', + 'quantized_decomposed::choose_qparams.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)', + 'quantized_decomposed::embedding_4bit(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor', + 'quantized_decomposed::embedding_4bit.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor', + 'quantized_decomposed::embedding_4bit.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)', + 'quantized_decomposed::embedding_4bit.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)', + 'quantized_decomposed::dequantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? 
out_dtype=None) -> Tensor', + 'quantized_decomposed::dequantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor', + 'quantized_decomposed::dequantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor', + 'quantized_decomposed::add(Tensor a, float a_scale, int a_zero_point, int a_quant_min, int a_quant_max, Tensor b, float b_scale, int b_zero_point, int b_quant_min, int b_quant_max, float out_scale, int out_zero_point, int out_quant_min, int out_quant_max) -> Tensor qc', + 'quantized_decomposed::add.scalar(Tensor qa, float a_scale, int a_zero_point, int a_quant_min, int a_quant_max, ScalarType a_dtype, Scalar b, float out_scale, int out_zero_point, int out_quant_min, int out_quant_max, ScalarType out_dtype) -> Tensor', + 'quantized_decomposed::add_relu(Tensor a, float a_scale, int a_zero_point, int a_quant_min, int a_quant_max, Tensor b, float b_scale, int b_zero_point, int b_quant_min, int b_quant_max, float out_scale, int out_zero_point, int out_quant_min, int out_quant_max) -> Tensor qc', + 'quantized_decomposed::dequantize_per_channel(Tensor input, Tensor scales, Tensor? zero_points, int axis, int quant_min, int quant_max, ScalarType dtype, *, ScalarType? out_dtype=None) -> Tensor', + 'quantized_decomposed::fake_quant_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max) -> Tensor', + 'quantized_decomposed::quantize_per_channel(Tensor input, Tensor scales, Tensor zero_points, int axis, int quant_min, int quant_max, ScalarType dtype) -> Tensor', + 'quantized_decomposed::choose_qparams_symmetric.tensor(Tensor input, int quant_min, int quant_max, float eps, ScalarType dtype) -> (Tensor, Tensor)', + 'quantized_decomposed::mixed_linear(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, ScalarType? dtype=None) -> Tensor', + 'quantized_decomposed::dequantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, ScalarType output_dtype) -> Tensor', + 'quantized_decomposed::quantize_per_tensor(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor', + 'quantized_decomposed::quantize_per_tensor.tensor(Tensor input, Tensor scale, Tensor zero_point, int quant_min, int quant_max, ScalarType dtype) -> Tensor', + 'quantized_decomposed::quantize_per_tensor.tensor2(Tensor input, Tensor scale, Tensor zero_point, Tensor quant_min, Tensor quant_max, ScalarType dtype) -> Tensor', + 'quantized_decomposed::choose_qparams_per_token_asymmetric(Tensor input, ScalarType dtype) -> (Tensor, Tensor)', + 'quantized_decomposed::choose_qparams_per_token(Tensor input, ScalarType dtype) -> (Tensor, Tensor)', + 'quantized_decomposed::quantize_per_channel_group(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size) -> Tensor', + 'quantized_decomposed::dequantize_per_channel_group(Tensor input, Tensor scales, Tensor? zero_points, int quant_min, int quant_max, ScalarType dtype, int group_size, ScalarType output_dtype) -> Tensor', + 'quantized_decomposed::embedding_byte(Tensor weight, Tensor weight_scales, Tensor? 
weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor', + 'quantized_decomposed::embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor', + 'quantized_decomposed::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)', + 'quantized_decomposed::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)', + 'quantized_decomposed::mixed_mm(Tensor input, Tensor weight, Tensor weight_scales, Tensor? weight_zero_points) -> Tensor', + 'quantized_decomposed::_choose_qparams_per_token_asymmetric_impl(Tensor input, ScalarType dtype) -> (Tensor, Tensor)', + 'quantized_decomposed::quantize_per_token(Tensor input, Tensor scales, Tensor zero_points, int quant_min, int quant_max, ScalarType dtype) -> Tensor', 'torch_sparse::hgt_sample(Dict(str, Tensor) _0, Dict(str, Tensor) _1, Dict(str, Tensor) _2, Dict(str, int[]) _3, int _4) -> (Dict(str, Tensor) _0, Dict(str, Tensor) _1, Dict(str, Tensor) _2, Dict(str, Tensor) _3)', 'torch_sparse::cuda_version() -> int _0', 'torch_sparse::random_walk(Tensor _0, Tensor _1, Tensor _2, int _3) -> Tensor _0',
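
Reviewer note: after this change, values.map(index, output) is the single path for resolving an EValue index to a { type, value } pair, cached in the values map; passing output as true makes a non-tensor output materialize as a named placeholder instead of throwing. A minimal standalone sketch of that caching contract, with a simplified stand-in for the FlatBuffers-generated plan (the Tensor marker class and sample data below are illustrative, not part of the patch):

    // Stand-ins for the generated executorch.schema classes and the plan.
    class Tensor {}
    const plan = { values: [{ val: new Tensor() }, { val: {} }] };
    const values = new Map();
    values.map = (index, output) => {
        if (!values.has(index)) {
            const v = plan.values[index].val;
            if (v instanceof Tensor) {
                values.set(index, { type: null, value: [`tensor:${index}`] });
            } else if (output) {
                // Non-tensor outputs still get a placeholder value keyed by index.
                values.set(index, { type: null, value: [`value:${index}`] });
            } else {
                throw new Error('Value type not implemented.');
            }
        }
        return values.get(index);
    };
    values.map(0);       // { type: null, value: ['tensor:0'] } - cached on first use
    values.map(1, true); // { type: null, value: ['value:1'] } - placeholder output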
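
Reviewer note: the rewritten executorch.TensorType maps the FlatBuffers scalar_type ordinal straight into a lookup table whose order follows the torch.dtype numbering in source/python.js (0 = BYTE through 29 = UINT64). A standalone sketch of the lookup; the dataTypeOf helper name is illustrative, not part of the patch:

    // Table indexed by ScalarType ordinal; order must match the schema enum.
    const TYPES = [
        'uint8',
        'int8', 'int16', 'int32', 'int64',
        'float16', 'float32', 'float64',
        'complex32', 'complex64', 'complex128',
        'boolean',
        'qint8', 'quint8', 'qint32',
        'bfloat16',
        'quint4x2', 'quint2x4', 'bits1x8', 'bits2x4', 'bits4x2', 'bits8', 'bits16',
        'float8e5m2', 'float8e4m3fn', 'float8e5m2fnuz', 'float8e4m3fnuz',
        'uint16', 'uint32', 'uint64'
    ];
    const dataTypeOf = (scalarType) => {
        if (scalarType >= TYPES.length) {
            throw new Error(`Unknown tensor data type '${scalarType}'.`);
        }
        return TYPES[scalarType];
    };
    dataTypeOf(6);  // 'float32' (ScalarType.FLOAT)
    dataTypeOf(11); // 'boolean' (ScalarType.BOOL)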